summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorRalf Baechle <ralf@linux-mips.org>2000-07-10 23:18:26 +0000
committerRalf Baechle <ralf@linux-mips.org>2000-07-10 23:18:26 +0000
commitc7c4310f7fc1485925d800628bf50b3aeab535ef (patch)
treeb12aa4be0e8fb82aaaea97fb475e793e8a347c49 /net
parent1ffd1d069ca4c5ffe16fea6175dab1b9bbb15820 (diff)
Merge with Linux 2.4.0-test3-pre8. Linus has accepted most of what
I've sent him, so we're very close to full integration of the MIPS port into his sources.
Diffstat (limited to 'net')
-rw-r--r--net/core/neighbour.c21
-rw-r--r--net/core/sock.c23
-rw-r--r--net/decnet/dn_route.c5
-rw-r--r--net/ipv4/Config.in9
-rw-r--r--net/ipv4/arp.c22
-rw-r--r--net/ipv4/ip_fragment.c793
-rw-r--r--net/ipv4/ip_gre.c51
-rw-r--r--net/ipv4/ip_sockglue.c11
-rw-r--r--net/ipv4/ipip.c32
-rw-r--r--net/ipv4/proc.c4
-rw-r--r--net/ipv4/raw.c38
-rw-r--r--net/ipv4/route.c55
-rw-r--r--net/ipv4/tcp.c64
-rw-r--r--net/ipv4/udp.c5
-rw-r--r--net/ipv6/datagram.c4
-rw-r--r--net/ipv6/proc.c4
-rw-r--r--net/ipv6/raw.c33
-rw-r--r--net/ipv6/reassembly.c675
-rw-r--r--net/ipv6/route.c10
-rw-r--r--net/ipv6/sit.c32
-rw-r--r--net/ipv6/udp.c6
-rw-r--r--net/netsyms.c11
-rw-r--r--net/packet/af_packet.c22
-rw-r--r--net/sched/sch_red.c10
-rw-r--r--net/sched/sch_teql.c6
25 files changed, 1112 insertions, 834 deletions
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 88322c8d6..b0d989516 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -281,25 +281,27 @@ struct neighbour * neigh_create(struct neigh_table *tbl, const void *pkey,
struct neighbour *n, *n1;
u32 hash_val;
int key_len = tbl->key_len;
+ int error;
n = neigh_alloc(tbl);
if (n == NULL)
- return NULL;
+ return ERR_PTR(-ENOBUFS);
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
dev_hold(dev);
/* Protocol specific setup. */
- if (tbl->constructor && tbl->constructor(n) < 0) {
+ if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
neigh_release(n);
- return NULL;
+ return ERR_PTR(error);
}
/* Device specific setup. */
- if (n->parms && n->parms->neigh_setup && n->parms->neigh_setup(n) < 0) {
+ if (n->parms && n->parms->neigh_setup &&
+ (error = n->parms->neigh_setup(n)) < 0) {
neigh_release(n);
- return NULL;
+ return ERR_PTR(error);
}
n->confirmed = jiffies - (n->parms->base_reachable_time<<1);
@@ -1242,6 +1244,7 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
if (nda[NDA_LLADDR-1] != NULL &&
nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len))
goto out;
+ err = 0;
n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev);
if (n) {
if (nlh->nlmsg_flags&NLM_F_EXCL)
@@ -1249,9 +1252,11 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
} else if (!(nlh->nlmsg_flags&NLM_F_CREATE))
err = -ENOENT;
else {
- n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1);
- if (n == NULL)
- err = -ENOBUFS;
+ n = __neigh_lookup_errno(tbl, RTA_DATA(nda[NDA_DST-1]), dev);
+ if (IS_ERR(n)) {
+ err = PTR_ERR(n);
+ n = NULL;
+ }
}
if (err == 0) {
err = neigh_update(n, nda[NDA_LLADDR-1] ? RTA_DATA(nda[NDA_LLADDR-1]) : NULL,
diff --git a/net/core/sock.c b/net/core/sock.c
index 4044a7f7d..fcb6246b3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -7,7 +7,7 @@
* handler for protocols to use and generic option handler.
*
*
- * Version: $Id: sock.c,v 1.93 2000/04/13 03:13:29 davem Exp $
+ * Version: $Id: sock.c,v 1.95 2000/07/08 00:20:43 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -308,6 +308,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sock->passcred = valbool;
break;
+ case SO_TIMESTAMP:
+ sk->rcvtstamp = valbool;
+ break;
+
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
@@ -485,7 +489,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BSDCOMPAT:
v.val = sk->bsdism;
break;
-
+
+ case SO_TIMESTAMP:
+ v.val = sk->rcvtstamp;
+ break;
+
case SO_RCVTIMEO:
lv=sizeof(struct timeval);
if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
@@ -599,7 +607,16 @@ void __init sk_init(void)
{
sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
SLAB_HWCACHE_ALIGN, 0, 0);
-
+
+ if (num_physpages <= 4096) {
+ sysctl_wmem_max = 32767;
+ sysctl_rmem_max = 32767;
+ sysctl_wmem_default = 32767;
+ sysctl_rmem_default = 32767;
+ } else if (num_physpages >= 131072) {
+ sysctl_wmem_max = 131071;
+ sysctl_rmem_max = 131071;
+ }
}
/*
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5ce55ebb2..d97558a24 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -789,13 +789,14 @@ static int dn_route_input_slow(struct sk_buff *skb)
if (dn_db->router && ((neigh = neigh_clone(dn_db->router)) != NULL))
goto add_entry;
- if ((neigh = neigh_create(&dn_neigh_table, &cb->src, dev)) != NULL) {
+ neigh = neigh_create(&dn_neigh_table, &cb->src, dev);
+ if (!IS_ERR(neigh)) {
if (dev->type == ARPHRD_ETHER)
memcpy(neigh->ha, skb->mac.ethernet->h_source, ETH_ALEN);
goto add_entry;
}
- return -ENOBUFS;
+ return PTR_ERR(neigh);
non_local_input:
diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in
index 68fea0272..7a44fa565 100644
--- a/net/ipv4/Config.in
+++ b/net/ipv4/Config.in
@@ -44,15 +44,8 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
bool ' IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD
fi
fi
+bool ' IP: TCP Explicit Congestion Notification support' CONFIG_INET_ECN
bool ' IP: TCP syncookie support (disabled per default)' CONFIG_SYN_COOKIES
-comment '(it is safe to leave these untouched)'
-#bool ' IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP
-#bool ' IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY
-#bool ' IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF
-bool ' IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE
-#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
-# bool ' IP: support checksum copy to user for UDP (EXPERIMENTAL)' CONFIG_UDP_DELAY_CSUM
-#fi
if [ "$CONFIG_NETFILTER" != "n" ]; then
source net/ipv4/netfilter/Config.in
fi
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 9def6b16b..81d8ebe80 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,6 +1,6 @@
/* linux/net/inet/arp.c
*
- * Version: $Id: arp.c,v 1.86 2000/04/26 09:36:36 davem Exp $
+ * Version: $Id: arp.c,v 1.87 2000/07/07 22:40:35 davem Exp $
*
* Copyright (C) 1994 by Florian La Roche
*
@@ -424,20 +424,24 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
int arp_bind_neighbour(struct dst_entry *dst)
{
struct net_device *dev = dst->dev;
+ struct neighbour *n = dst->neighbour;
if (dev == NULL)
- return 0;
- if (dst->neighbour == NULL) {
+ return -EINVAL;
+ if (n == NULL) {
u32 nexthop = ((struct rtable*)dst)->rt_gateway;
if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
nexthop = 0;
- dst->neighbour = __neigh_lookup(
+ n = __neigh_lookup_errno(
#ifdef CONFIG_ATM_CLIP
dev->type == ARPHRD_ATM ? &clip_tbl :
#endif
- &arp_tbl, &nexthop, dev, 1);
+ &arp_tbl, &nexthop, dev);
+ if (IS_ERR(n))
+ return PTR_ERR(n);
+ dst->neighbour = n;
}
- return (dst->neighbour != NULL);
+ return 0;
}
/*
@@ -847,9 +851,9 @@ int arp_req_set(struct arpreq *r, struct net_device * dev)
if (r->arp_ha.sa_family != dev->type)
return -EINVAL;
- err = -ENOBUFS;
- neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1);
- if (neigh) {
+ neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+ err = PTR_ERR(neigh);
+ if (!IS_ERR(neigh)) {
unsigned state = NUD_STALE;
if (r->arp_flags & ATF_PERM)
state = NUD_PERMANENT;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 70f8cfb90..07041a3e5 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -5,7 +5,7 @@
*
* The IP fragmentation functionality.
*
- * Version: $Id: ip_fragment.c,v 1.49 2000/04/15 01:48:10 davem Exp $
+ * Version: $Id: ip_fragment.c,v 1.50 2000/07/07 22:29:42 davem Exp $
*
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox <Alan.Cox@linux.org>
@@ -18,6 +18,7 @@
* Ultima : ip_expire() kernel panic.
* Bill Hawes : Frag accounting and evictor fixes.
* John McDonald : 0 length frag bug.
+ * Alexey Kuznetsov: SMP races, threading, cleanup.
*/
#include <linux/config.h>
@@ -31,11 +32,17 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
+#include <net/checksum.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
+/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+ * as well. Or notify me, at least. --ANK
+ */
+
/* Fragment cache limits. We will commit 256K at one time. Should we
* cross that limit we will prune down to 192K. This should cope with
* even the most extreme cases without allowing an attacker to measurably
@@ -46,38 +53,77 @@ int sysctl_ipfrag_low_thresh = 192*1024;
int sysctl_ipfrag_time = IP_FRAG_TIME;
-/* Describe an IP fragment. */
-struct ipfrag {
- int offset; /* offset of fragment in IP datagram */
- int end; /* last byte of data in datagram */
- int len; /* length of this fragment */
- struct sk_buff *skb; /* complete received fragment */
- unsigned char *ptr; /* pointer into real fragment data */
- struct ipfrag *next; /* linked list pointers */
- struct ipfrag *prev;
+struct ipfrag_skb_cb
+{
+ struct inet_skb_parm h;
+ int offset;
};
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb))
+
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
- struct iphdr *iph; /* pointer to IP header */
struct ipq *next; /* linked list pointers */
- struct ipfrag *fragments; /* linked list of received fragments */
+ u32 saddr;
+ u32 daddr;
+ u16 id;
+ u8 protocol;
+ u8 last_in;
+#define COMPLETE 4
+#define FIRST_IN 2
+#define LAST_IN 1
+
+ struct sk_buff *fragments; /* linked list of received fragments */
int len; /* total length of original datagram */
- short ihlen; /* length of the IP header */
+ int meat;
+ spinlock_t lock;
+ atomic_t refcnt;
struct timer_list timer; /* when will this queue expire? */
struct ipq **pprev;
- struct net_device *dev; /* Device - for icmp replies */
+ struct net_device *dev; /* Device - for icmp replies */
};
+/* Hash table. */
+
#define IPQ_HASHSZ 64
+/* Per-bucket lock is easy to add now. */
static struct ipq *ipq_hash[IPQ_HASHSZ];
-static spinlock_t ipfrag_lock = SPIN_LOCK_UNLOCKED;
+static rwlock_t ipfrag_lock = RW_LOCK_UNLOCKED;
+int ip_frag_nqueues = 0;
-#define ipqhashfn(id, saddr, daddr, prot) \
- ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
+static __inline__ void __ipq_unlink(struct ipq *qp)
+{
+ if(qp->next)
+ qp->next->pprev = qp->pprev;
+ *qp->pprev = qp->next;
+ ip_frag_nqueues--;
+}
+
+static __inline__ void ipq_unlink(struct ipq *ipq)
+{
+ write_lock(&ipfrag_lock);
+ __ipq_unlink(ipq);
+ write_unlock(&ipfrag_lock);
+}
-static atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
+/*
+ * Was: ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
+ *
+ * I see, I see evil hand of bigendian mafia. On Intel all the packets hit
+ * one hash bucket with this hash function. 8)
+ */
+static __inline__ unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
+{
+ unsigned int h = saddr ^ daddr;
+
+ h ^= (h>>16)^id;
+ h ^= (h>>8)^prot;
+ return h & (IPQ_HASHSZ - 1);
+}
+
+
+atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
/* Memory Tracking Functions. */
extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
@@ -86,112 +132,106 @@ extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
kfree_skb(skb);
}
-extern __inline__ void frag_kfree_s(void *ptr, int len)
+extern __inline__ void frag_free_queue(struct ipq *qp)
{
- atomic_sub(len, &ip_frag_mem);
- kfree(ptr);
+ atomic_sub(sizeof(struct ipq), &ip_frag_mem);
+ kfree(qp);
}
-
-extern __inline__ void *frag_kmalloc(int size, int pri)
+
+extern __inline__ struct ipq *frag_alloc_queue(void)
{
- void *vp = kmalloc(size, pri);
+ struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
- if(!vp)
+ if(!qp)
return NULL;
- atomic_add(size, &ip_frag_mem);
- return vp;
+ atomic_add(sizeof(struct ipq), &ip_frag_mem);
+ return qp;
}
-
-/* Create a new fragment entry. */
-static struct ipfrag *ip_frag_create(int offset, int end,
- struct sk_buff *skb, unsigned char *ptr)
+
+
+/* Destruction primitives. */
+
+/* Complete destruction of ipq. */
+static void ip_frag_destroy(struct ipq *qp)
{
- struct ipfrag *fp;
+ struct sk_buff *fp;
- fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC);
- if (fp == NULL)
- goto out_nomem;
+ BUG_TRAP(qp->last_in&COMPLETE);
+ BUG_TRAP(del_timer(&qp->timer) == 0);
- /* Fill in the structure. */
- fp->offset = offset;
- fp->end = end;
- fp->len = end - offset;
- fp->skb = skb;
- fp->ptr = ptr;
- fp->next = fp->prev = NULL;
-
- /* Charge for the SKB as well. */
- atomic_add(skb->truesize, &ip_frag_mem);
+ /* Release all fragment data. */
+ fp = qp->fragments;
+ while (fp) {
+ struct sk_buff *xp = fp->next;
- return(fp);
+ frag_kfree_skb(fp);
+ fp = xp;
+ }
-out_nomem:
- NETDEBUG(printk(KERN_ERR "IP: frag_create: no memory left !\n"));
- return(NULL);
+ /* Finally, release the queue descriptor itself. */
+ frag_free_queue(qp);
}
-/* Find the correct entry in the "incomplete datagrams" queue for
- * this IP datagram, and return the queue entry address if found.
- */
-static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst)
+static __inline__ void ipq_put(struct ipq *ipq)
{
- __u16 id = iph->id;
- __u32 saddr = iph->saddr;
- __u32 daddr = iph->daddr;
- __u8 protocol = iph->protocol;
- unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
- struct ipq *qp;
-
- /* We are always in BH context, and protected by the
- * ipfrag lock.
- */
- for(qp = ipq_hash[hash]; qp; qp = qp->next) {
- if(qp->iph->id == id &&
- qp->iph->saddr == saddr &&
- qp->iph->daddr == daddr &&
- qp->iph->protocol == protocol) {
- del_timer(&qp->timer);
- break;
- }
- }
- return qp;
+ if (atomic_dec_and_test(&ipq->refcnt))
+ ip_frag_destroy(ipq);
}
-/* Remove an entry from the "incomplete datagrams" queue, either
- * because we completed, reassembled and processed it, or because
- * it timed out.
- *
- * This is called _only_ from BH contexts with the ipfrag lock held,
- * on packet reception processing and from frag queue expiration
- * timers. -DaveM
+/* Kill ipq entry. It is not destroyed immediately,
+ * because caller (and someone more) holds reference count.
*/
-static void ip_free(struct ipq *qp)
+static __inline__ void ipq_kill(struct ipq *ipq)
{
- struct ipfrag *fp;
-
- /* Stop the timer for this entry. */
- del_timer(&qp->timer);
-
- /* Remove this entry from the "incomplete datagrams" queue. */
- if(qp->next)
- qp->next->pprev = qp->pprev;
- *qp->pprev = qp->next;
-
- /* Release all fragment data. */
- fp = qp->fragments;
- while (fp) {
- struct ipfrag *xp = fp->next;
+ if (del_timer(&ipq->timer))
+ atomic_dec(&ipq->refcnt);
- frag_kfree_skb(fp->skb);
- frag_kfree_s(fp, sizeof(struct ipfrag));
- fp = xp;
+ if (!(ipq->last_in & COMPLETE)) {
+ ipq_unlink(ipq);
+ atomic_dec(&ipq->refcnt);
+ ipq->last_in |= COMPLETE;
}
+}
- /* Release the IP header. */
- frag_kfree_s(qp->iph, 64 + 8);
+/* Memory limiting on fragments. Evictor trashes the oldest
+ * fragment queue until we are back under the low threshold.
+ */
+static void ip_evictor(void)
+{
+ int i, progress;
- /* Finally, release the queue descriptor itself. */
- frag_kfree_s(qp, sizeof(struct ipq));
+ do {
+ if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
+ return;
+ progress = 0;
+ /* FIXME: Make LRU queue of frag heads. -DaveM */
+ for (i = 0; i < IPQ_HASHSZ; i++) {
+ struct ipq *qp;
+ if (ipq_hash[i] == NULL)
+ continue;
+
+ write_lock(&ipfrag_lock);
+ if ((qp = ipq_hash[i]) != NULL) {
+ /* find the oldest queue for this hash bucket */
+ while (qp->next)
+ qp = qp->next;
+ __ipq_unlink(qp);
+ write_unlock(&ipfrag_lock);
+
+ spin_lock(&qp->lock);
+ if (del_timer(&qp->timer))
+ atomic_dec(&qp->refcnt);
+ qp->last_in |= COMPLETE;
+ spin_unlock(&qp->lock);
+
+ ipq_put(qp);
+ IP_INC_STATS_BH(IpReasmFails);
+ progress = 1;
+ continue;
+ }
+ write_unlock(&ipfrag_lock);
+ }
+ } while (progress);
}
/*
@@ -201,181 +241,310 @@ static void ip_expire(unsigned long arg)
{
struct ipq *qp = (struct ipq *) arg;
- spin_lock(&ipfrag_lock);
- if(!qp->fragments)
- {
-#ifdef IP_EXPIRE_DEBUG
- printk("warning: possible ip-expire attack\n");
-#endif
+ spin_lock(&qp->lock);
+
+ if (qp->last_in & COMPLETE)
goto out;
- }
-
- /* Send an ICMP "Fragment Reassembly Timeout" message. */
+
+ ipq_kill(qp);
+
IP_INC_STATS_BH(IpReasmTimeout);
IP_INC_STATS_BH(IpReasmFails);
- icmp_send(qp->fragments->skb, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+ if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
+ icmp_send(qp->fragments, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+ }
out:
- /* Nuke the fragment queue. */
- ip_free(qp);
- spin_unlock(&ipfrag_lock);
+ spin_unlock(&qp->lock);
+ ipq_put(qp);
}
-/* Memory limiting on fragments. Evictor trashes the oldest
- * fragment queue until we are back under the low threshold.
- *
- * We are always called in BH with the ipfrag lock held.
- */
-static void ip_evictor(void)
+/* Creation primitives. */
+
+static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
{
- int i, progress;
+ struct ipq *qp;
-restart:
- progress = 0;
- /* FIXME: Make LRU queue of frag heads. -DaveM */
- for (i = 0; i < IPQ_HASHSZ; i++) {
- struct ipq *qp;
- if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
- return;
- qp = ipq_hash[i];
- if (qp) {
- /* find the oldest queue for this hash bucket */
- while (qp->next)
- qp = qp->next;
- ip_free(qp);
- progress = 1;
+ write_lock(&ipfrag_lock);
+#ifdef CONFIG_SMP
+ /* With SMP race we have to recheck hash table, because
+ * such entry could be created on other cpu, while we
+ * promoted read lock to write lock.
+ */
+ for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ if(qp->id == qp_in->id &&
+ qp->saddr == qp_in->saddr &&
+ qp->daddr == qp_in->daddr &&
+ qp->protocol == qp_in->protocol) {
+ atomic_inc(&qp->refcnt);
+ write_unlock(&ipfrag_lock);
+ qp_in->last_in |= COMPLETE;
+ ipq_put(qp_in);
+ return qp;
}
}
- if (progress)
- goto restart;
- panic("ip_evictor: memcount");
+#endif
+ qp = qp_in;
+
+ atomic_inc(&qp->refcnt);
+ if((qp->next = ipq_hash[hash]) != NULL)
+ qp->next->pprev = &qp->next;
+ ipq_hash[hash] = qp;
+ qp->pprev = &ipq_hash[hash];
+ ip_frag_nqueues++;
+ write_unlock(&ipfrag_lock);
+ return qp;
}
-/* Add an entry to the 'ipq' queue for a newly received IP datagram.
- * We will (hopefully :-) receive all other fragments of this datagram
- * in time, so we just create a queue for this datagram, in which we
- * will insert the received fragments at their respective positions.
- */
-static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph)
+/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
+static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph)
{
struct ipq *qp;
- unsigned int hash;
- int ihlen;
- qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC);
- if (qp == NULL)
+ if ((qp = frag_alloc_queue()) == NULL)
goto out_nomem;
- /* Allocate memory for the IP header (plus 8 octets for ICMP). */
- ihlen = iph->ihl * 4;
-
- qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC);
- if (qp->iph == NULL)
- goto out_free;
-
- memcpy(qp->iph, iph, ihlen + 8);
+ qp->protocol = iph->protocol;
+ qp->last_in = 0;
+ qp->id = iph->id;
+ qp->saddr = iph->saddr;
+ qp->daddr = iph->daddr;
qp->len = 0;
- qp->ihlen = ihlen;
+ qp->meat = 0;
qp->fragments = NULL;
- qp->dev = skb->dev;
/* Initialize a timer for this entry. */
init_timer(&qp->timer);
- qp->timer.expires = 0; /* (to be set later) */
qp->timer.data = (unsigned long) qp; /* pointer to queue */
qp->timer.function = ip_expire; /* expire function */
+ qp->lock = SPIN_LOCK_UNLOCKED;
+ atomic_set(&qp->refcnt, 1);
- /* Add this entry to the queue. */
- hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+ return ip_frag_intern(hash, qp);
- /* In a BH context and ipfrag lock is held. -DaveM */
- if((qp->next = ipq_hash[hash]) != NULL)
- qp->next->pprev = &qp->next;
- ipq_hash[hash] = qp;
- qp->pprev = &ipq_hash[hash];
+out_nomem:
+ NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+ return NULL;
+}
- return qp;
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and create new one, if nothing is found.
+ */
+static inline struct ipq *ip_find(struct iphdr *iph)
+{
+ __u16 id = iph->id;
+ __u32 saddr = iph->saddr;
+ __u32 daddr = iph->daddr;
+ __u8 protocol = iph->protocol;
+ unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
+ struct ipq *qp;
-out_free:
- frag_kfree_s(qp, sizeof(struct ipq));
-out_nomem:
- NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n"));
- return(NULL);
+ read_lock(&ipfrag_lock);
+ for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ if(qp->id == id &&
+ qp->saddr == saddr &&
+ qp->daddr == daddr &&
+ qp->protocol == protocol) {
+ atomic_inc(&qp->refcnt);
+ read_unlock(&ipfrag_lock);
+ return qp;
+ }
+ }
+ read_unlock(&ipfrag_lock);
+
+ return ip_frag_create(hash, iph);
}
-/* See if a fragment queue is complete. */
-static int ip_done(struct ipq *qp)
+/* Add new segment to existing queue. */
+static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
- struct ipfrag *fp;
- int offset;
+ struct iphdr *iph = skb->nh.iph;
+ struct sk_buff *prev, *next;
+ int flags, offset;
+ int ihl, end;
- /* Only possible if we received the final fragment. */
- if (qp->len == 0)
- return 0;
+ if (qp->last_in & COMPLETE)
+ goto err;
- /* Check all fragment offsets to see if they connect. */
- fp = qp->fragments;
- offset = 0;
- while (fp) {
- if (fp->offset > offset)
- return(0); /* fragment(s) missing */
- offset = fp->end;
- fp = fp->next;
+ if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time))
+ atomic_inc(&qp->refcnt);
+
+ offset = ntohs(iph->frag_off);
+ flags = offset & ~IP_OFFSET;
+ offset &= IP_OFFSET;
+ offset <<= 3; /* offset is in 8-byte chunks */
+ ihl = iph->ihl * 4;
+
+ /* Determine the position of this fragment. */
+ end = offset + (ntohs(iph->tot_len) - ihl);
+
+ /* Is this the final fragment? */
+ if ((flags & IP_MF) == 0) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < qp->len ||
+ ((qp->last_in & LAST_IN) && end != qp->len))
+ goto err;
+ qp->last_in |= LAST_IN;
+ qp->len = end;
+ } else {
+ if (end&7) {
+ end &= ~7;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (end > qp->len) {
+ /* Some bits beyond end -> corruption. */
+ if (qp->last_in & LAST_IN)
+ goto err;
+ qp->len = end;
+ }
}
+ if (end == offset)
+ goto err;
+
+ /* Point into the IP datagram 'data' part. */
+ skb_pull(skb, (skb->nh.raw+ihl) - skb->data);
+ skb_trim(skb, end - offset);
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = NULL;
+ for(next = qp->fragments; next != NULL; next = next->next) {
+ if (FRAG_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
+ */
+ if (prev) {
+ int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+
+ if (i > 0) {
+ offset += i;
+ if (end <= offset)
+ goto err;
+ skb_pull(skb, i);
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ }
+
+ while (next && FRAG_CB(next)->offset < end) {
+ int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+
+ if (i < next->len) {
+ /* Eat head of the next overlapped fragment
+ * and leave the loop. The next ones cannot overlap.
+ */
+ FRAG_CB(next)->offset += i;
+ skb_pull(next, i);
+ qp->meat -= i;
+ if (next->ip_summed != CHECKSUM_UNNECESSARY)
+ next->ip_summed = CHECKSUM_NONE;
+ break;
+ } else {
+ struct sk_buff *free_it = next;
- /* All fragments are present. */
- return 1;
+ /* Old fragment is completely overridden with
+ * new one, drop it.
+ */
+ next = next->next;
+
+ if (prev)
+ prev->next = next;
+ else
+ qp->fragments = next;
+
+ qp->meat -= free_it->len;
+ frag_kfree_skb(free_it);
+ }
+ }
+
+ FRAG_CB(skb)->offset = offset;
+
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (prev)
+ prev->next = skb;
+ else
+ qp->fragments = skb;
+
+ qp->dev = skb->dev;
+ qp->meat += skb->len;
+ atomic_add(skb->truesize, &ip_frag_mem);
+ if (offset == 0)
+ qp->last_in |= FIRST_IN;
+
+ return;
+
+err:
+ kfree_skb(skb);
}
+
/* Build a new IP datagram from all its fragments.
*
* FIXME: We copy here because we lack an effective way of handling lists
* of bits on input. Until the new skb data handling is in I'm not going
* to touch this with a bargepole.
*/
-static struct sk_buff *ip_glue(struct ipq *qp)
+static struct sk_buff *ip_frag_reasm(struct ipq *qp)
{
struct sk_buff *skb;
struct iphdr *iph;
- struct ipfrag *fp;
- unsigned char *ptr;
- int count, len;
+ struct sk_buff *fp, *head = qp->fragments;
+ int len;
+ int ihlen;
+
+ ipq_kill(qp);
+
+ BUG_TRAP(head != NULL);
+ BUG_TRAP(FRAG_CB(head)->offset == 0);
/* Allocate a new buffer for the datagram. */
- len = qp->ihlen + qp->len;
-
+ ihlen = head->nh.iph->ihl*4;
+ len = ihlen + qp->len;
+
if(len > 65535)
goto out_oversize;
-
+
skb = dev_alloc_skb(len);
if (!skb)
goto out_nomem;
/* Fill in the basic details. */
- skb->mac.raw = ptr = skb->data;
- skb->nh.iph = iph = (struct iphdr *) skb_put(skb, len);
+ skb->mac.raw = skb->data;
+ skb->nh.raw = skb->data;
+ FRAG_CB(skb)->h = FRAG_CB(head)->h;
+ skb->ip_summed = head->ip_summed;
+ skb->csum = 0;
/* Copy the original IP headers into the new buffer. */
- memcpy(ptr, qp->iph, qp->ihlen);
- ptr += qp->ihlen;
+ memcpy(skb_put(skb, ihlen), head->nh.iph, ihlen);
/* Copy the data portions of all fragments into the new buffer. */
- fp = qp->fragments;
- count = qp->ihlen;
- while(fp) {
- if ((fp->len <= 0) || ((count + fp->len) > skb->len))
- goto out_invalid;
- memcpy((ptr + fp->offset), fp->ptr, fp->len);
- if (count == qp->ihlen) {
- skb->dst = dst_clone(fp->skb->dst);
- skb->dev = fp->skb->dev;
- }
- count += fp->len;
- fp = fp->next;
+ for (fp=head; fp; fp = fp->next) {
+ memcpy(skb_put(skb, fp->len), fp->data, fp->len);
+
+ if (skb->ip_summed != fp->ip_summed)
+ skb->ip_summed = CHECKSUM_NONE;
+ else if (skb->ip_summed == CHECKSUM_HW)
+ skb->csum = csum_chain(skb->csum, fp->csum);
}
- skb->pkt_type = qp->fragments->skb->pkt_type;
- skb->protocol = qp->fragments->skb->protocol;
+ skb->dst = dst_clone(head->dst);
+ skb->pkt_type = head->pkt_type;
+ skb->protocol = head->protocol;
+ skb->dev = qp->dev;
+
/*
* Clearly bogus, because security markings of the individual
* fragments should have been checked for consistency before
@@ -385,29 +554,24 @@ static struct sk_buff *ip_glue(struct ipq *qp)
* as well take the value associated with the first fragment.
* --rct
*/
- skb->security = qp->fragments->skb->security;
+ skb->security = head->security;
#ifdef CONFIG_NETFILTER
/* Connection association is same as fragment (if any). */
- skb->nfct = qp->fragments->skb->nfct;
+ skb->nfct = head->nfct;
nf_conntrack_get(skb->nfct);
#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug = qp->fragments->skb->nf_debug;
+ skb->nf_debug = head->nf_debug;
#endif
#endif
/* Done with all fragments. Fixup the new IP header. */
iph = skb->nh.iph;
iph->frag_off = 0;
- iph->tot_len = htons(count);
+ iph->tot_len = htons(len);
IP_INC_STATS_BH(IpReasmOKs);
return skb;
-out_invalid:
- NETDEBUG(printk(KERN_ERR
- "Invalid fragment list: Fragment over size.\n"));
- kfree_skb(skb);
- goto out_fail;
out_nomem:
NETDEBUG(printk(KERN_ERR
"IP: queue_glue: no memory for gluing queue %p\n",
@@ -417,7 +581,7 @@ out_oversize:
if (net_ratelimit())
printk(KERN_INFO
"Oversized IP packet from %d.%d.%d.%d.\n",
- NIPQUAD(qp->iph->saddr));
+ NIPQUAD(qp->saddr));
out_fail:
IP_INC_STATS_BH(IpReasmFails);
return NULL;
@@ -427,185 +591,32 @@ out_fail:
struct sk_buff *ip_defrag(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
- struct ipfrag *prev, *next, *tmp, *tfp;
struct ipq *qp;
- unsigned char *ptr;
- int flags, offset;
- int i, ihl, end;
IP_INC_STATS_BH(IpReasmReqds);
- spin_lock(&ipfrag_lock);
-
/* Start by cleaning up the memory. */
if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
ip_evictor();
- /*
- * Look for the entry for this IP datagram in the
- * "incomplete datagrams" queue. If found, the
- * timer is removed.
- */
- qp = ip_find(iph, skb->dst);
-
- /* Is this a non-fragmented datagram? */
- offset = ntohs(iph->frag_off);
- flags = offset & ~IP_OFFSET;
- offset &= IP_OFFSET;
+ /* Lookup (or create) queue header */
+ if ((qp = ip_find(iph)) != NULL) {
+ struct sk_buff *ret = NULL;
- offset <<= 3; /* offset is in 8-byte chunks */
- ihl = iph->ihl * 4;
+ spin_lock(&qp->lock);
- /*
- * Check whether to create a fresh queue entry. If the
- * queue already exists, its timer will be restarted as
- * long as we continue to receive fragments.
- */
- if (qp) {
- /* ANK. If the first fragment is received,
- * we should remember the correct IP header (with options)
- */
- if (offset == 0) {
- /* Fragmented frame replaced by unfragmented copy? */
- if ((flags & IP_MF) == 0)
- goto out_freequeue;
- qp->ihlen = ihl;
- memcpy(qp->iph, iph, (ihl + 8));
- }
- } else {
- /* Fragmented frame replaced by unfragmented copy? */
- if ((offset == 0) && ((flags & IP_MF) == 0))
- goto out_skb;
-
- /* If we failed to create it, then discard the frame. */
- qp = ip_create(skb, iph);
- if (!qp)
- goto out_freeskb;
- }
-
- /* Attempt to construct an oversize packet. */
- if((ntohs(iph->tot_len) + ((int) offset)) > 65535)
- goto out_oversize;
+ ip_frag_queue(qp, skb);
- /* Determine the position of this fragment. */
- end = offset + ntohs(iph->tot_len) - ihl;
+ if (qp->last_in == (FIRST_IN|LAST_IN) &&
+ qp->meat == qp->len)
+ ret = ip_frag_reasm(qp);
- /* Is this the final fragment? */
- if ((flags & IP_MF) == 0)
- qp->len = end;
-
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = NULL;
- for(next = qp->fragments; next != NULL; next = next->next) {
- if (next->offset >= offset)
- break; /* bingo! */
- prev = next;
+ spin_unlock(&qp->lock);
+ ipq_put(qp);
+ return ret;
}
- /* Point into the IP datagram 'data' part. */
- ptr = skb->data + ihl;
-
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
- */
- if ((prev != NULL) && (offset < prev->end)) {
- i = prev->end - offset;
- offset += i; /* ptr into datagram */
- ptr += i; /* ptr into fragment data */
- }
-
- /* Look for overlap with succeeding segments.
- * If we can merge fragments, do it.
- */
- for (tmp = next; tmp != NULL; tmp = tfp) {
- tfp = tmp->next;
- if (tmp->offset >= end)
- break; /* no overlaps at all */
-
- i = end - next->offset; /* overlap is 'i' bytes */
- tmp->len -= i; /* so reduce size of */
- tmp->offset += i; /* next fragment */
- tmp->ptr += i;
-
- /* If we get a frag size of <= 0, remove it and the packet
- * that it goes with.
- */
- if (tmp->len <= 0) {
- if (tmp->prev != NULL)
- tmp->prev->next = tmp->next;
- else
- qp->fragments = tmp->next;
-
- if (tmp->next != NULL)
- tmp->next->prev = tmp->prev;
-
- /* We have killed the original next frame. */
- next = tfp;
-
- frag_kfree_skb(tmp->skb);
- frag_kfree_s(tmp, sizeof(struct ipfrag));
- }
- }
-
- /*
- * Create a fragment to hold this skb.
- * No memory to save the fragment? throw the lot ...
- */
- tfp = ip_frag_create(offset, end, skb, ptr);
- if (!tfp)
- goto out_freeskb;
-
- /* Insert this fragment in the chain of fragments. */
- tfp->prev = prev;
- tfp->next = next;
- if (prev != NULL)
- prev->next = tfp;
- else
- qp->fragments = tfp;
-
- if (next != NULL)
- next->prev = tfp;
-
- /* OK, so we inserted this new fragment into the chain.
- * Check if we now have a full IP datagram which we can
- * bump up to the IP layer...
- */
- if (ip_done(qp)) {
- /* Glue together the fragments. */
- skb = ip_glue(qp);
- /* Free the queue entry. */
-out_freequeue:
- ip_free(qp);
-out_skb:
- spin_unlock(&ipfrag_lock);
- return skb;
- }
-
- /*
- * The queue is still active ... reset its timer.
- */
-out_timer:
- mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */
-out:
- spin_unlock(&ipfrag_lock);
- return NULL;
-
- /*
- * Error exits ... we need to reset the timer if there's a queue.
- */
-out_oversize:
- if (net_ratelimit())
- printk(KERN_INFO "Oversized packet received from %u.%u.%u.%u\n",
- NIPQUAD(iph->saddr));
- /* the skb isn't in a fragment, so fall through to free it */
-out_freeskb:
- kfree_skb(skb);
IP_INC_STATS_BH(IpReasmFails);
- if (qp)
- goto out_timer;
- goto out;
+ kfree_skb(skb);
+ return NULL;
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 79dc3d629..a316401b0 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -36,6 +36,7 @@
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
+#include <net/inet_ecn.h>
#ifdef CONFIG_IPV6
#include <net/ipv6.h>
@@ -119,11 +120,11 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_fb_tunnel_init(struct net_device *dev);
static struct net_device ipgre_fb_tunnel_dev = {
- "gre%d", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
+ "gre0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
};
static struct ip_tunnel ipgre_fb_tunnel = {
- NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre%d", }
+ NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", }
};
/* Tunnel hash table */
@@ -530,6 +531,34 @@ out:
#endif
}
+static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
+{
+ if (INET_ECN_is_ce(iph->tos)) {
+ if (skb->protocol == __constant_htons(ETH_P_IP)) {
+ if (INET_ECN_is_not_ce(skb->nh.iph->tos))
+ IP_ECN_set_ce(skb->nh.iph);
+ } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
+ if (INET_ECN_is_not_ce(ip6_get_dsfield(skb->nh.ipv6h)))
+ IP6_ECN_set_ce(skb->nh.ipv6h);
+ }
+ }
+}
+
+static inline u8
+ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
+{
+#ifdef CONFIG_INET_ECN
+ u8 inner = 0;
+ if (skb->protocol == __constant_htons(ETH_P_IP))
+ inner = old_iph->tos;
+ else if (skb->protocol == __constant_htons(ETH_P_IPV6))
+ inner = ip6_get_dsfield((struct ipv6hdr*)old_iph);
+ return INET_ECN_encapsulate(tos, inner);
+#else
+ return tos;
+#endif
+}
+
int ipgre_rcv(struct sk_buff *skb, unsigned short len)
{
struct iphdr *iph = skb->nh.iph;
@@ -604,6 +633,7 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len)
nf_conntrack_put(skb->nfct);
skb->nfct = NULL;
#endif
+ ipgre_ecn_decapsulate(iph, skb);
netif_rx(skb);
read_unlock(&ipgre_lock);
return(0);
@@ -638,6 +668,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
int gre_hlen;
u32 dst;
int mtu;
+ int err;
if (tunnel->recursion++) {
tunnel->stat.collisions++;
@@ -789,7 +820,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
iph->ihl = sizeof(struct iphdr) >> 2;
iph->frag_off = df;
iph->protocol = IPPROTO_GRE;
- iph->tos = tos;
+ iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
@@ -834,10 +865,17 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
skb->nfct = NULL;
#endif
+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ do_ip_send);
+ if(err < 0) {
+ if(net_ratelimit())
+ printk(KERN_ERR "ipgre_tunnel_xmit: ip_send() failed, err=%d\n", -err);
+ skb = NULL;
+ goto tx_error;
+ }
+
stats->tx_bytes += skb->len;
stats->tx_packets++;
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- do_ip_send);
tunnel->recursion--;
return 0;
@@ -846,7 +884,8 @@ tx_error_icmp:
tx_error:
stats->tx_errors++;
- dev_kfree_skb(skb);
+ if(skb)
+ dev_kfree_skb(skb);
tunnel->recursion--;
return 0;
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 90b74447f..aea8b9370 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -5,7 +5,7 @@
*
* The IP to API glue.
*
- * Version: $Id: ip_sockglue.c,v 1.47 2000/01/16 05:11:23 davem Exp $
+ * Version: $Id: ip_sockglue.c,v 1.49 2000/07/08 00:20:43 davem Exp $
*
* Authors: see ip.c
*
@@ -327,6 +327,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
if (err)
goto out_free_skb;
+ sock_recv_timestamp(msg, sk, skb);
+
serr = SKB_EXT_ERR(skb);
sin = (struct sockaddr_in *)msg->msg_name;
@@ -462,8 +464,15 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
break;
case IP_TOS: /* This sets both TOS and Precedence */
/* Reject setting of unused bits */
+#ifndef CONFIG_INET_ECN
if (val & ~(IPTOS_TOS_MASK|IPTOS_PREC_MASK))
goto e_inval;
+#else
+ if (sk->type == SOCK_STREAM) {
+ val &= ~3;
+ val |= sk->protinfo.af_inet.tos & 3;
+ }
+#endif
if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
!capable(CAP_NET_ADMIN)) {
err = -EPERM;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e343f34e8..1177033ca 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,7 +1,7 @@
/*
* Linux NET3: IP/IP protocol decoder.
*
- * Version: $Id: ipip.c,v 1.35 2000/07/07 01:55:20 davem Exp $
+ * Version: $Id: ipip.c,v 1.37 2000/07/07 23:47:45 davem Exp $
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
@@ -114,6 +114,7 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
+#include <net/inet_ecn.h>
#define HASH_SIZE 16
#define HASH(addr) ((addr^(addr>>4))&0xF)
@@ -122,11 +123,11 @@ static int ipip_fb_tunnel_init(struct net_device *dev);
static int ipip_tunnel_init(struct net_device *dev);
static struct net_device ipip_fb_tunnel_dev = {
- "tunl%d", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init,
+ "tunl0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init,
};
static struct ip_tunnel ipip_fb_tunnel = {
- NULL, &ipip_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"tunl%d", }
+ NULL, &ipip_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"tunl0", }
};
static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
@@ -465,6 +466,13 @@ out:
#endif
}
+static inline void ipip_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
+{
+ if (INET_ECN_is_ce(iph->tos) &&
+ INET_ECN_is_not_ce(skb->nh.iph->tos))
+ IP_ECN_set_ce(iph);
+}
+
int ipip_rcv(struct sk_buff *skb, unsigned short len)
{
struct iphdr *iph;
@@ -489,6 +497,7 @@ int ipip_rcv(struct sk_buff *skb, unsigned short len)
nf_conntrack_put(skb->nfct);
skb->nfct = NULL;
#endif
+ ipip_ecn_decapsulate(iph, skb);
netif_rx(skb);
read_unlock(&ipip_lock);
return 0;
@@ -525,6 +534,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
int max_headroom; /* The extra header space needed */
u32 dst = tiph->daddr;
int mtu;
+ int err;
if (tunnel->recursion++) {
tunnel->stat.collisions++;
@@ -620,7 +630,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
iph->ihl = sizeof(struct iphdr)>>2;
iph->frag_off = df;
iph->protocol = IPPROTO_IPIP;
- iph->tos = tos;
+ iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
@@ -636,10 +646,17 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
skb->nfct = NULL;
#endif
+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ do_ip_send);
+ if(err < 0) {
+ if(net_ratelimit())
+ printk(KERN_ERR "ipip_tunnel_xmit: ip_send() failed, err=%d\n", -err);
+ skb = NULL;
+ goto tx_error;
+ }
+
stats->tx_bytes += skb->len;
stats->tx_packets++;
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- do_ip_send);
tunnel->recursion--;
return 0;
@@ -647,7 +664,8 @@ tx_error_icmp:
dst_link_failure(skb);
tx_error:
stats->tx_errors++;
- dev_kfree_skb(skb);
+ if(skb)
+ dev_kfree_skb(skb);
tunnel->recursion--;
return 0;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4d94a4cc0..f1ff8f1ee 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -7,7 +7,7 @@
* PROC file system. It is mainly used for debugging and
* statistics.
*
- * Version: $Id: proc.c,v 1.42 2000/04/16 01:11:37 davem Exp $
+ * Version: $Id: proc.c,v 1.43 2000/07/07 22:29:42 davem Exp $
*
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
@@ -78,6 +78,8 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length)
fold_prot_inuse(&udp_prot));
len += sprintf(buffer+len,"RAW: inuse %d\n",
fold_prot_inuse(&raw_prot));
+ len += sprintf(buffer+len, "FRAG: inuse %d memory %d\n",
+ ip_frag_nqueues, atomic_read(&ip_frag_mem));
if (offset >= len)
{
*start = buffer;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6e3f19287..5ac30dc40 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -5,7 +5,7 @@
*
* RAW - implementation of IP "raw" sockets.
*
- * Version: $Id: raw.c,v 1.50 2000/05/03 06:37:06 davem Exp $
+ * Version: $Id: raw.c,v 1.52 2000/07/08 00:20:43 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -502,7 +502,7 @@ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len,
if (err)
goto done;
- sk->stamp=skb->stamp;
+ sock_recv_timestamp(msg, sk, skb);
/* Copy the address. */
if (sin) {
@@ -580,6 +580,36 @@ static int raw_getsockopt(struct sock *sk, int level, int optname,
return -ENOPROTOOPT;
}
+static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch(cmd) {
+ case SIOCOUTQ:
+ {
+ int amount = atomic_read(&sk->wmem_alloc);
+ return put_user(amount, (int *)arg);
+ }
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_irq(&sk->receive_queue.lock);
+ skb = skb_peek(&sk->receive_queue);
+ if (skb != NULL)
+ amount = skb->len;
+ spin_unlock_irq(&sk->receive_queue.lock);
+ return put_user(amount, (int *)arg);
+ }
+
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_ioctl(sk, cmd, arg);
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+
static void get_raw_sock(struct sock *sp, char *tmpbuf, int i)
{
unsigned int dest, src;
@@ -648,9 +678,7 @@ struct proto raw_prot = {
close: raw_close,
connect: udp_connect,
disconnect: udp_disconnect,
-#ifdef CONFIG_IP_MROUTE
- ioctl: ipmr_ioctl,
-#endif
+ ioctl: raw_ioctl,
init: raw_init,
setsockopt: raw_setsockopt,
getsockopt: raw_getsockopt,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ec254e313..eb00518bd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.86 2000/04/24 07:03:14 davem Exp $
+ * Version: $Id: route.c,v 1.88 2000/07/07 23:47:45 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -152,23 +152,29 @@ struct dst_ops ipv4_dst_ops =
sizeof(struct rtable),
};
+#ifdef CONFIG_INET_ECN
+#define ECN_OR_COST(class) TC_PRIO_##class
+#else
+#define ECN_OR_COST(class) TC_PRIO_FILLER
+#endif
+
__u8 ip_tos2prio[16] = {
TC_PRIO_BESTEFFORT,
- TC_PRIO_FILLER,
+ ECN_OR_COST(FILLER),
TC_PRIO_BESTEFFORT,
- TC_PRIO_FILLER,
+ ECN_OR_COST(BESTEFFORT),
TC_PRIO_BULK,
- TC_PRIO_FILLER,
+ ECN_OR_COST(BULK),
TC_PRIO_BULK,
- TC_PRIO_FILLER,
+ ECN_OR_COST(BULK),
TC_PRIO_INTERACTIVE,
- TC_PRIO_FILLER,
+ ECN_OR_COST(INTERACTIVE),
TC_PRIO_INTERACTIVE,
- TC_PRIO_FILLER,
+ ECN_OR_COST(INTERACTIVE),
TC_PRIO_INTERACTIVE_BULK,
- TC_PRIO_FILLER,
+ ECN_OR_COST(INTERACTIVE_BULK),
TC_PRIO_INTERACTIVE_BULK,
- TC_PRIO_FILLER
+ ECN_OR_COST(INTERACTIVE_BULK)
};
@@ -582,9 +588,15 @@ restart:
route or unicast forwarding path.
*/
if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
- if (!arp_bind_neighbour(&rt->u.dst)) {
+ int err = arp_bind_neighbour(&rt->u.dst);
+ if (err) {
write_unlock_bh(&rt_hash_table[hash].lock);
+ if (err != -ENOBUFS) {
+ rt_drop(rt);
+ return err;
+ }
+
/* Neighbour tables are full and nothing
can be released. Try to shrink route cache,
it is most likely it holds some neighbour records.
@@ -600,13 +612,8 @@ restart:
goto restart;
}
- if (net_ratelimit()) {
- if ((rt->u.dst.dev->flags&IFF_UP) &&
- __in_dev_get(rt->u.dst.dev))
- printk("Neighbour table overflow.\n");
- else
- printk("Device %s is down.\n", rt->u.dst.dev->name);
- }
+ if (net_ratelimit())
+ printk("Neighbour table overflow.\n");
rt_drop(rt);
return -ENOBUFS;
}
@@ -712,7 +719,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
u32 skeys[2] = { saddr, 0 };
int ikeys[2] = { dev->ifindex, 0 };
- tos &= IPTOS_TOS_MASK;
+ tos &= IPTOS_RT_MASK;
if (!in_dev)
return;
@@ -791,7 +798,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
if (rt->peer)
atomic_inc(&rt->peer->refcnt);
- if (!arp_bind_neighbour(&rt->u.dst) ||
+ if (arp_bind_neighbour(&rt->u.dst) ||
!(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
if (rt->u.dst.neighbour)
neigh_event_send(rt->u.dst.neighbour, NULL);
@@ -967,7 +974,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
struct rtable *rth;
u32 skeys[2] = { iph->saddr, 0, };
u32 daddr = iph->daddr;
- u8 tos = iph->tos & IPTOS_TOS_MASK;
+ u8 tos = iph->tos & IPTOS_RT_MASK;
unsigned short est_mtu = 0;
if (ipv4_config.no_pmtu_disc)
@@ -1546,7 +1553,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
unsigned hash;
int iif = dev->ifindex;
- tos &= IPTOS_TOS_MASK;
+ tos &= IPTOS_RT_MASK;
hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
read_lock(&rt_hash_table[hash].lock);
@@ -1616,10 +1623,10 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
int free_res = 0;
int err;
- tos &= IPTOS_TOS_MASK|RTO_ONLINK;
+ tos &= IPTOS_RT_MASK|RTO_ONLINK;
key.dst = daddr;
key.src = saddr;
- key.tos = tos&IPTOS_TOS_MASK;
+ key.tos = tos&IPTOS_RT_MASK;
key.iif = loopback_dev.ifindex;
key.oif = oif;
key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
@@ -1889,7 +1896,7 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
rth->key.src == saddr &&
rth->key.iif == 0 &&
rth->key.oif == oif &&
- !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
+ !((rth->key.tos^tos)&(IPTOS_RT_MASK|RTO_ONLINK)) &&
((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
) {
rth->u.dst.lastuse = jiffies;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c4343b707..dbf680233 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.169 2000/04/20 14:41:16 davem Exp $
+ * Version: $Id: tcp.c,v 1.170 2000/07/08 00:20:43 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -1018,9 +1018,13 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
tcp_send_skb(sk, skb, queue_it, mss_now);
}
}
- sk->err = 0;
err = copied;
- goto out;
+out:
+ __tcp_push_pending_frames(sk, tp, mss_now);
+ TCP_CHECK_TIMER(sk);
+out_unlock:
+ release_sock(sk);
+ return err;
do_sock_err:
if(copied)
@@ -1048,12 +1052,7 @@ do_fault:
kfree_skb(skb);
do_fault2:
err = -EFAULT;
-out:
- __tcp_push_pending_frames(sk, tp, mss_now);
- TCP_CHECK_TIMER(sk);
-out_unlock:
- release_sock(sk);
- return err;
+ goto out;
}
#undef PSH_NEEDED
@@ -1270,10 +1269,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
TCP_CHECK_TIMER(sk);
-
- if (sk->err)
- goto out_err;
-
err = -ENOTCONN;
if (sk->state == TCP_LISTEN)
goto out;
@@ -1292,13 +1287,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
- /*
- * BUG BUG BUG
- * This violates 1003.1g compliance. We must wait for
- * data to exist even if we read none!
- */
-
- while (len > 0) {
+ do {
struct sk_buff * skb;
u32 offset;
@@ -1519,29 +1508,6 @@ do_prequeue:
continue;
skb->used = 1;
tcp_eat_skb(sk, skb);
-
-#ifdef TCP_LESS_COARSE_ACKS
- /* Possible improvement. When sender is faster than receiver,
- * traffic looks like: fill window ... wait for window open ...
- * fill window. We lose at least one rtt, because call
- * cleanup_rbuf only once. Probably, if "len" was large
- * we should insert several intermediate cleanup_rbuf(s).
- *
- * F.e.:
- */
- do {
- u32 full_space = min(tp->window_clamp, tcp_full_space(sk));
-
- /* Try to ACK, if total buffer length is larger
- than maximal window and if rcv_window has
- chances to increase twice. It will result
- to exponentially decreased ACKing during
- read to huge (usually, mmapped) buffer.
- */
- if (len >= full_space && tp->rcv_wnd <= full_space/2)
- cleanup_rbuf(sk, copied);
- } while (0);
-#endif
continue;
found_fin_ok:
@@ -1552,7 +1518,7 @@ do_prequeue:
/* All is done. */
skb->used = 1;
break;
- }
+ } while (len > 0);
if (user_recv) {
if (skb_queue_len(&tp->ucopy.prequeue)) {
@@ -1584,9 +1550,6 @@ do_prequeue:
release_sock(sk);
return copied;
-out_err:
- err = sock_error(sk);
-
out:
TCP_CHECK_TIMER(sk);
release_sock(sk);
@@ -2012,7 +1975,6 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err)
struct open_request *req;
struct sock *newsk;
int error;
- long timeo;
lock_sock(sk);
@@ -2023,10 +1985,10 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err)
if (sk->state != TCP_LISTEN)
goto out;
- timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
/* Find already established connection */
if (!tp->accept_queue) {
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
if (!timeo)
@@ -2099,6 +2061,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
break;
}
tp->nonagle = (val == 0) ? 0 : 1;
+ if (val)
+ tcp_push_pending_frames(sk, tp);
break;
case TCP_CORK:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a14c984d7..dec2a6126 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -5,7 +5,7 @@
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.83 2000/06/09 07:35:49 davem Exp $
+ * Version: $Id: udp.c,v 1.84 2000/07/08 00:20:43 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -710,7 +710,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
if (err)
goto out_free;
- sk->stamp=skb->stamp;
+
+ sock_recv_timestamp(msg, sk, skb);
/* Copy the address. */
if (sin)
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 844ea8228..986cd023f 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: datagram.c,v 1.19 2000/02/27 19:51:47 davem Exp $
+ * $Id: datagram.c,v 1.20 2000/07/08 00:20:43 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -127,6 +127,8 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
if (err)
goto out_free_skb;
+ sock_recv_timestamp(msg, sk, skb);
+
serr = SKB_EXT_ERR(skb);
sin = (struct sockaddr_in6 *)msg->msg_name;
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 22c1fa367..944d665d5 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -7,7 +7,7 @@
* PROC file system. This is very similar to the IPv4 version,
* except it reports the sockets in the INET6 address family.
*
- * Version: $Id: proc.c,v 1.14 2000/04/16 01:11:37 davem Exp $
+ * Version: $Id: proc.c,v 1.15 2000/07/07 22:29:42 davem Exp $
*
* Authors: David S. Miller (davem@caip.rutgers.edu)
*
@@ -46,6 +46,8 @@ int afinet6_get_info(char *buffer, char **start, off_t offset, int length, int d
fold_prot_inuse(&udpv6_prot));
len += sprintf(buffer+len, "RAW6: inuse %d\n",
fold_prot_inuse(&rawv6_prot));
+ len += sprintf(buffer+len, "FRAG6: inuse %d memory %d\n",
+ ip6_frag_nqueues, atomic_read(&ip6_frag_mem));
*start = buffer + offset;
len -= offset;
if(len > length)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 3f2ec7068..e83870421 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -7,7 +7,7 @@
*
* Adapted from linux/net/ipv4/raw.c
*
- * $Id: raw.c,v 1.36 2000/05/03 06:37:07 davem Exp $
+ * $Id: raw.c,v 1.39 2000/07/08 00:20:43 davem Exp $
*
* Fixes:
* Hideaki YOSHIFUJI : sin6_scope_id support
@@ -331,7 +331,6 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
}
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
- sk->stamp=skb->stamp;
if (err)
goto out_free;
@@ -348,6 +347,8 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
}
}
+ sock_recv_timestamp(msg, sk, skb);
+
if (sk->net_pinfo.af_inet6.rxopt.all)
datagram_recv_ctl(sk, msg, skb);
err = copied;
@@ -535,6 +536,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
fl.proto = proto;
fl.fl6_dst = daddr;
+ if (fl.fl6_src == NULL && !ipv6_addr_any(&np->saddr))
+ fl.fl6_src = &np->saddr;
fl.uli_u.icmpt.type = 0;
fl.uli_u.icmpt.code = 0;
@@ -694,6 +697,31 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
return 0;
}
+static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch(cmd) {
+ case SIOCOUTQ:
+ {
+ int amount = atomic_read(&sk->wmem_alloc);
+ return put_user(amount, (int *)arg);
+ }
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_irq(&sk->receive_queue.lock);
+ skb = skb_peek(&sk->receive_queue);
+ if (skb != NULL)
+ amount = skb->tail - skb->h.raw;
+ spin_unlock_irq(&sk->receive_queue.lock);
+ return put_user(amount, (int *)arg);
+ }
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
static void rawv6_close(struct sock *sk, long timeout)
{
@@ -790,6 +818,7 @@ struct proto rawv6_prot = {
close: rawv6_close,
connect: udpv6_connect,
disconnect: udp_disconnect,
+ ioctl: rawv6_ioctl,
init: rawv6_init_sk,
destroy: inet6_destroy_sock,
setsockopt: rawv6_setsockopt,
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 99f4a702f..abdcdc713 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: reassembly.c,v 1.17 2000/05/03 06:37:07 davem Exp $
+ * $Id: reassembly.c,v 1.18 2000/07/07 22:29:42 davem Exp $
*
* Based on: net/ipv4/ip_fragment.c
*
@@ -21,6 +21,7 @@
* More RFC compliance.
*
* Horst von Brand Add missing #include <linux/string.h>
+ * Alexey Kuznetsov SMP races, threading, cleanup.
*/
#include <linux/errno.h>
#include <linux/types.h>
@@ -46,198 +47,202 @@
int sysctl_ip6frag_high_thresh = 256*1024;
int sysctl_ip6frag_low_thresh = 192*1024;
-int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT;
-
-atomic_t ip6_frag_mem = ATOMIC_INIT(0);
-static spinlock_t ip6_frag_lock = SPIN_LOCK_UNLOCKED;
+int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT;
-struct ipv6_frag {
- __u16 offset;
- __u16 len;
- struct sk_buff *skb;
+struct ip6frag_skb_cb
+{
+ struct inet6_skb_parm h;
+ int offset;
+};
- struct frag_hdr *fhdr;
+#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb))
- struct ipv6_frag *next;
-};
/*
* Equivalent of ipv4 struct ipq
*/
-struct frag_queue {
-
+struct frag_queue
+{
struct frag_queue *next;
- struct frag_queue *prev;
__u32 id; /* fragment id */
struct in6_addr saddr;
struct in6_addr daddr;
+
+ spinlock_t lock;
+ atomic_t refcnt;
struct timer_list timer; /* expire timer */
- struct ipv6_frag *fragments;
- struct net_device *dev;
+ struct sk_buff *fragments;
+ int len;
+ int meat;
+ struct net_device *dev;
int iif;
__u8 last_in; /* has first/last segment arrived? */
+#define COMPLETE 4
#define FIRST_IN 2
#define LAST_IN 1
__u8 nexthdr;
__u16 nhoffset;
+ struct frag_queue **pprev;
};
-static struct frag_queue ipv6_frag_queue = {
- &ipv6_frag_queue, &ipv6_frag_queue,
-};
+/* Hash table. */
-/* Memory Tracking Functions. */
-extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
+#define IP6Q_HASHSZ 64
+
+static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ];
+static rwlock_t ip6_frag_lock = RW_LOCK_UNLOCKED;
+int ip6_frag_nqueues = 0;
+
+static __inline__ void __fq_unlink(struct frag_queue *fq)
{
- atomic_sub(skb->truesize, &ip6_frag_mem);
- kfree_skb(skb);
+ if(fq->next)
+ fq->next->pprev = fq->pprev;
+ *fq->pprev = fq->next;
+ ip6_frag_nqueues--;
}
-extern __inline__ void frag_kfree_s(void *ptr, int len)
+static __inline__ void fq_unlink(struct frag_queue *fq)
{
- atomic_sub(len, &ip6_frag_mem);
- kfree(ptr);
+ write_lock(&ip6_frag_lock);
+ __fq_unlink(fq);
+ write_unlock(&ip6_frag_lock);
}
-
-extern __inline__ void *frag_kmalloc(int size, int pri)
+
+static __inline__ unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr,
+ struct in6_addr *daddr)
{
- void *vp = kmalloc(size, pri);
+ unsigned int h = saddr->s6_addr32[3] ^ daddr->s6_addr32[3] ^ id;
- if(!vp)
- return NULL;
- atomic_add(size, &ip6_frag_mem);
- return vp;
+ h ^= (h>>16);
+ h ^= (h>>8);
+ return h & (IP6Q_HASHSZ - 1);
}
-static void create_frag_entry(struct sk_buff *skb,
- __u8 *nhptr,
- struct frag_hdr *fhdr);
-static u8 * reasm_frag(struct frag_queue *fq,
- struct sk_buff **skb_in);
-
-static void reasm_queue(struct frag_queue *fq,
- struct sk_buff *skb,
- struct frag_hdr *fhdr,
- u8 *nhptr);
-
-static void fq_free(struct frag_queue *fq);
+atomic_t ip6_frag_mem = ATOMIC_INIT(0);
-static void frag_prune(void)
+/* Memory Tracking Functions. */
+extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
{
- struct frag_queue *fq;
-
- spin_lock(&ip6_frag_lock);
- while ((fq = ipv6_frag_queue.next) != &ipv6_frag_queue) {
- IP6_INC_STATS_BH(Ip6ReasmFails);
- fq_free(fq);
- if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh) {
- spin_unlock(&ip6_frag_lock);
- return;
- }
- }
- if (atomic_read(&ip6_frag_mem))
- printk(KERN_DEBUG "IPv6 frag_prune: memleak\n");
- atomic_set(&ip6_frag_mem, 0);
- spin_unlock(&ip6_frag_lock);
+ atomic_sub(skb->truesize, &ip6_frag_mem);
+ kfree_skb(skb);
}
-
-u8* ipv6_reassembly(struct sk_buff **skbp, __u8 *nhptr)
+extern __inline__ void frag_free_queue(struct frag_queue *fq)
{
- struct sk_buff *skb = *skbp;
- struct frag_hdr *fhdr = (struct frag_hdr *) (skb->h.raw);
- struct frag_queue *fq;
- struct ipv6hdr *hdr;
-
- hdr = skb->nh.ipv6h;
+ atomic_sub(sizeof(struct frag_queue), &ip6_frag_mem);
+ kfree(fq);
+}
- IP6_INC_STATS_BH(Ip6ReasmReqds);
+extern __inline__ struct frag_queue *frag_alloc_queue(void)
+{
+ struct frag_queue *fq = kmalloc(sizeof(struct frag_queue), GFP_ATOMIC);
- /* Jumbo payload inhibits frag. header */
- if (hdr->payload_len==0) {
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw);
+ if(!fq)
return NULL;
- }
- if ((u8 *)(fhdr+1) > skb->tail) {
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw);
- return NULL;
- }
- if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh)
- frag_prune();
+ atomic_add(sizeof(struct frag_queue), &ip6_frag_mem);
+ return fq;
+}
- spin_lock(&ip6_frag_lock);
- for (fq = ipv6_frag_queue.next; fq != &ipv6_frag_queue; fq = fq->next) {
- if (fq->id == fhdr->identification &&
- !ipv6_addr_cmp(&hdr->saddr, &fq->saddr) &&
- !ipv6_addr_cmp(&hdr->daddr, &fq->daddr)) {
- u8 *ret = NULL;
+/* Destruction primitives. */
- reasm_queue(fq, skb, fhdr, nhptr);
+/* Complete destruction of fq. */
+static void ip6_frag_destroy(struct frag_queue *fq)
+{
+ struct sk_buff *fp;
- if (fq->last_in == (FIRST_IN|LAST_IN))
- ret = reasm_frag(fq, skbp);
+ BUG_TRAP(fq->last_in&COMPLETE);
+ BUG_TRAP(del_timer(&fq->timer) == 0);
- spin_unlock(&ip6_frag_lock);
- return ret;
- }
- }
+ /* Release all fragment data. */
+ fp = fq->fragments;
+ while (fp) {
+ struct sk_buff *xp = fp->next;
- create_frag_entry(skb, nhptr, fhdr);
- spin_unlock(&ip6_frag_lock);
+ frag_kfree_skb(fp);
+ fp = xp;
+ }
- return NULL;
+ frag_free_queue(fq);
}
-
-static void fq_free(struct frag_queue *fq)
+static __inline__ void fq_put(struct frag_queue *fq)
{
- struct ipv6_frag *fp, *back;
+ if (atomic_dec_and_test(&fq->refcnt))
+ ip6_frag_destroy(fq);
+}
- del_timer(&fq->timer);
+/* Kill fq entry. It is not destroyed immediately,
+ * because caller (and someone more) holds reference count.
+ */
+static __inline__ void fq_kill(struct frag_queue *fq)
+{
+ if (del_timer(&fq->timer))
+ atomic_dec(&fq->refcnt);
- for (fp = fq->fragments; fp; ) {
- frag_kfree_skb(fp->skb);
- back = fp;
- fp=fp->next;
- frag_kfree_s(back, sizeof(*back));
+ if (!(fq->last_in & COMPLETE)) {
+ fq_unlink(fq);
+ atomic_dec(&fq->refcnt);
+ fq->last_in |= COMPLETE;
}
+}
- fq->prev->next = fq->next;
- fq->next->prev = fq->prev;
+static void ip6_evictor(void)
+{
+ int i, progress;
- fq->prev = fq->next = NULL;
-
- frag_kfree_s(fq, sizeof(*fq));
+ do {
+ if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh)
+ return;
+ progress = 0;
+ for (i = 0; i < IP6Q_HASHSZ; i++) {
+ struct frag_queue *fq;
+ if (ip6_frag_hash[i] == NULL)
+ continue;
+
+ write_lock(&ip6_frag_lock);
+ if ((fq = ip6_frag_hash[i]) != NULL) {
+ /* find the oldest queue for this hash bucket */
+ while (fq->next)
+ fq = fq->next;
+ __fq_unlink(fq);
+ write_unlock(&ip6_frag_lock);
+
+ spin_lock(&fq->lock);
+ if (del_timer(&fq->timer))
+ atomic_dec(&fq->refcnt);
+ fq->last_in |= COMPLETE;
+ spin_unlock(&fq->lock);
+
+ fq_put(fq);
+ IP6_INC_STATS_BH(Ip6ReasmFails);
+ progress = 1;
+ continue;
+ }
+ write_unlock(&ip6_frag_lock);
+ }
+ } while (progress);
}
-static void frag_expire(unsigned long data)
+static void ip6_frag_expire(unsigned long data)
{
- struct frag_queue *fq;
- struct ipv6_frag *frag;
+ struct frag_queue *fq = (struct frag_queue *) data;
- fq = (struct frag_queue *) data;
+ spin_lock(&fq->lock);
- spin_lock(&ip6_frag_lock);
+ if (fq->last_in & COMPLETE)
+ goto out;
- frag = fq->fragments;
+ fq_kill(fq);
IP6_INC_STATS_BH(Ip6ReasmTimeout);
IP6_INC_STATS_BH(Ip6ReasmFails);
- if (frag == NULL) {
- spin_unlock(&ip6_frag_lock);
- printk(KERN_DEBUG "invalid fragment queue\n");
- return;
- }
-
- /* Send error only if the first segment arrived.
- (fixed --ANK (980728))
- */
- if (fq->last_in&FIRST_IN) {
+ /* Send error only if the first segment arrived. */
+ if (fq->last_in&FIRST_IN && fq->fragments) {
struct net_device *dev = dev_get_by_index(fq->iif);
/*
@@ -246,144 +251,234 @@ static void frag_expire(unsigned long data)
pointer directly, device might already disappeared.
*/
if (dev) {
- frag->skb->dev = dev;
- icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0,
+ fq->fragments->dev = dev;
+ icmpv6_send(fq->fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0,
dev);
dev_put(dev);
}
}
-
- fq_free(fq);
- spin_unlock(&ip6_frag_lock);
+out:
+ spin_unlock(&fq->lock);
+ fq_put(fq);
}
+/* Creation primitives. */
-static void create_frag_entry(struct sk_buff *skb,
- __u8 *nhptr,
- struct frag_hdr *fhdr)
+
+static struct frag_queue *ip6_frag_intern(unsigned int hash,
+ struct frag_queue *fq_in)
{
struct frag_queue *fq;
- struct ipv6hdr *hdr;
-
- fq = (struct frag_queue *) frag_kmalloc(sizeof(struct frag_queue),
- GFP_ATOMIC);
- if (fq == NULL) {
- IP6_INC_STATS_BH(Ip6ReasmFails);
- kfree_skb(skb);
- return;
+ write_lock(&ip6_frag_lock);
+#ifdef CONFIG_SMP
+ for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) {
+ if (fq->id == fq_in->id &&
+ !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) &&
+ !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) {
+ atomic_inc(&fq->refcnt);
+ write_unlock(&ip6_frag_lock);
+ fq_in->last_in |= COMPLETE;
+ fq_put(fq_in);
+ return fq;
+ }
}
+#endif
+ fq = fq_in;
+
+ atomic_inc(&fq->refcnt);
+ if((fq->next = ip6_frag_hash[hash]) != NULL)
+ fq->next->pprev = &fq->next;
+ ip6_frag_hash[hash] = fq;
+ fq->pprev = &ip6_frag_hash[hash];
+ ip6_frag_nqueues++;
+ write_unlock(&ip6_frag_lock);
+ return fq;
+}
- memset(fq, 0, sizeof(struct frag_queue));
- fq->id = fhdr->identification;
+static struct frag_queue *
+ip6_frag_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst)
+{
+ struct frag_queue *fq;
- hdr = skb->nh.ipv6h;
- ipv6_addr_copy(&fq->saddr, &hdr->saddr);
- ipv6_addr_copy(&fq->daddr, &hdr->daddr);
+ if ((fq = frag_alloc_queue()) == NULL)
+ goto oom;
+
+ memset(fq, 0, sizeof(struct frag_queue));
+
+ fq->id = id;
+ ipv6_addr_copy(&fq->saddr, src);
+ ipv6_addr_copy(&fq->daddr, dst);
/* init_timer has been done by the memset */
- fq->timer.function = frag_expire;
+ fq->timer.function = ip6_frag_expire;
fq->timer.data = (long) fq;
- fq->timer.expires = jiffies + sysctl_ip6frag_time;
+ fq->lock = SPIN_LOCK_UNLOCKED;
+ atomic_set(&fq->refcnt, 1);
- reasm_queue(fq, skb, fhdr, nhptr);
+ return ip6_frag_intern(hash, fq);
- if (fq->fragments) {
- fq->prev = ipv6_frag_queue.prev;
- fq->next = &ipv6_frag_queue;
- fq->prev->next = fq;
- ipv6_frag_queue.prev = fq;
-
- add_timer(&fq->timer);
- } else
- frag_kfree_s(fq, sizeof(*fq));
+oom:
+ IP6_INC_STATS_BH(Ip6ReasmFails);
+ return NULL;
}
+static __inline__ struct frag_queue *
+fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst)
+{
+ struct frag_queue *fq;
+ unsigned int hash = ip6qhashfn(id, src, dst);
+
+ read_lock(&ip6_frag_lock);
+ for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) {
+ if (fq->id == id &&
+ !ipv6_addr_cmp(src, &fq->saddr) &&
+ !ipv6_addr_cmp(dst, &fq->daddr)) {
+ atomic_inc(&fq->refcnt);
+ read_unlock(&ip6_frag_lock);
+ return fq;
+ }
+ }
+ read_unlock(&ip6_frag_lock);
+
+ return ip6_frag_create(hash, id, src, dst);
+}
-/*
- * We queue the packet even if it's the last.
- * It's a trade off. This allows the reassembly
- * code to be simpler (=faster) and of the
- * steps we do for queueing the only unnecessary
- * one it's the kmalloc for a struct ipv6_frag.
- * Feel free to try other alternatives...
- */
-static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb,
- struct frag_hdr *fhdr, u8 *nhptr)
+static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
+ struct frag_hdr *fhdr, u8 *nhptr)
{
- struct ipv6_frag *nfp, *fp, **bptr;
+ struct sk_buff *prev, *next;
+ int offset, end;
- nfp = (struct ipv6_frag *) frag_kmalloc(sizeof(struct ipv6_frag),
- GFP_ATOMIC);
+ if (fq->last_in & COMPLETE)
+ goto err;
- if (nfp == NULL) {
- kfree_skb(skb);
- return;
- }
+ if (!mod_timer(&fq->timer, jiffies + sysctl_ip6frag_time))
+ atomic_inc(&fq->refcnt);
- nfp->offset = ntohs(fhdr->frag_off) & ~0x7;
- nfp->len = (ntohs(skb->nh.ipv6h->payload_len) -
- ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
+ offset = ntohs(fhdr->frag_off) & ~0x7;
+ end = offset + (ntohs(skb->nh.ipv6h->payload_len) -
+ ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
- if ((u32)nfp->offset + (u32)nfp->len >= 65536) {
+ if ((unsigned int)end >= 65536) {
icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off);
goto err;
}
- if (fhdr->frag_off & __constant_htons(0x0001)) {
+
+ /* Is this the final fragment? */
+ if (!(fhdr->frag_off & __constant_htons(0x0001))) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < fq->len ||
+ ((fq->last_in & LAST_IN) && end != fq->len))
+ goto err;
+ fq->last_in |= LAST_IN;
+ fq->len = end;
+ } else {
/* Check if the fragment is rounded to 8 bytes.
* Required by the RFC.
- * ... and would break our defragmentation algorithm 8)
*/
- if (nfp->len & 0x7) {
+ if (end & 0x7) {
printk(KERN_DEBUG "fragment not rounded to 8bytes\n");
/*
It is not in specs, but I see no reasons
to send an error in this case. --ANK
*/
- if (nfp->offset == 0)
+ if (offset == 0)
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
&skb->nh.ipv6h->payload_len);
goto err;
}
+ if (end > fq->len) {
+ /* Some bits beyond end -> corruption. */
+ if (fq->last_in & LAST_IN)
+ goto err;
+ fq->len = end;
+ }
}
- nfp->skb = skb;
- nfp->fhdr = fhdr;
- nfp->next = NULL;
+ if (end == offset)
+ goto err;
- bptr = &fq->fragments;
+ /* Point into the IP datagram 'data' part. */
+ skb_pull(skb, (u8 *) (fhdr + 1) - skb->data);
+ skb_trim(skb, end - offset);
- for (fp = fq->fragments; fp; fp=fp->next) {
- if (nfp->offset <= fp->offset)
- break;
- bptr = &fp->next;
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = NULL;
+ for(next = fq->fragments; next != NULL; next = next->next) {
+ if (FRAG6_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
}
- if (fp && fp->offset == nfp->offset) {
- if (nfp->len != fp->len) {
- printk(KERN_DEBUG "reasm_queue: dup with wrong len\n");
+
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
+ */
+ if (prev) {
+ int i = (FRAG6_CB(prev)->offset + prev->len) - offset;
+
+ if (i > 0) {
+ offset += i;
+ if (end <= offset)
+ goto err;
+ skb_pull(skb, i);
}
+ }
- /* duplicate. discard it. */
- goto err;
+ /* Look for overlap with succeeding segments.
+ * If we can merge fragments, do it.
+ */
+ while (next && FRAG6_CB(next)->offset < end) {
+ int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */
+
+ if (i < next->len) {
+ /* Eat head of the next overlapped fragment
+ * and leave the loop. The next ones cannot overlap.
+ */
+ FRAG6_CB(next)->offset += i; /* next fragment */
+ skb_pull(next, i);
+ fq->meat -= i;
+ break;
+ } else {
+ struct sk_buff *free_it = next;
+
+			/* Old fragment is completely overridden with
+			 * new one, drop it.
+ */
+ next = next->next;
+
+ if (prev)
+ prev->next = next;
+ else
+ fq->fragments = next;
+
+ fq->meat -= free_it->len;
+ frag_kfree_skb(free_it);
+ }
}
- atomic_add(skb->truesize, &ip6_frag_mem);
+ FRAG6_CB(skb)->offset = offset;
- /* All the checks are done, fragment is acepted.
- Only now we are allowed to update reassembly data!
- (fixed --ANK (980728))
- */
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (prev)
+ prev->next = skb;
+ else
+ fq->fragments = skb;
- /* iif always set to one of the last arrived segment */
fq->dev = skb->dev;
fq->iif = skb->dev->ifindex;
-
- /* Last fragment */
- if ((fhdr->frag_off & __constant_htons(0x0001)) == 0)
- fq->last_in |= LAST_IN;
+ fq->meat += skb->len;
+ atomic_add(skb->truesize, &ip6_frag_mem);
/* First fragment.
nexthdr and nhptr are get from the first fragment.
@@ -391,85 +486,67 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb,
first one.
(fixed --ANK (980728))
*/
- if (nfp->offset == 0) {
+ if (offset == 0) {
fq->nexthdr = fhdr->nexthdr;
- fq->last_in |= FIRST_IN;
fq->nhoffset = nhptr - skb->nh.raw;
+ fq->last_in |= FIRST_IN;
}
-
- *bptr = nfp;
- nfp->next = fp;
return;
err:
- frag_kfree_s(nfp, sizeof(*nfp));
kfree_skb(skb);
}
/*
- * check if this fragment completes the packet
- * returns true on success
+ * Check if this packet is complete.
+ * Returns NULL on failure by any reason, and pointer
+ * to current nexthdr field in reassembled frame.
+ *
+ * It is called with locked fq, and caller must check that
+ * queue is eligible for reassembly i.e. it is not COMPLETE,
+ * the last and the first frames arrived and all the bits are here.
*/
-static u8* reasm_frag(struct frag_queue *fq, struct sk_buff **skb_in)
+static u8* ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in)
{
- struct ipv6_frag *fp;
- struct ipv6_frag *head = fq->fragments;
- struct ipv6_frag *tail = NULL;
+ struct sk_buff *fp, *head = fq->fragments;
struct sk_buff *skb;
- __u32 offset = 0;
- __u32 payload_len;
- __u16 unfrag_len;
- __u16 copy;
+ int payload_len;
+ int unfrag_len;
+ int copy;
u8 *nhptr;
- for(fp = head; fp; fp=fp->next) {
- if (offset != fp->offset)
- return NULL;
-
- offset += fp->len;
- tail = fp;
- }
-
/*
* we know the m_flag arrived and we have a queue,
* starting from 0, without gaps.
* this means we have all fragments.
*/
- /* Unfragmented part is taken from the first segment.
- (fixed --ANK (980728))
- */
- unfrag_len = (u8 *) (head->fhdr) - (u8 *) (head->skb->nh.ipv6h + 1);
+ fq_kill(fq);
- payload_len = (unfrag_len + tail->offset +
- (tail->skb->tail - (__u8 *) (tail->fhdr + 1)));
+ BUG_TRAP(head != NULL);
+ BUG_TRAP(FRAG6_CB(head)->offset == 0);
- if (payload_len > 65535) {
- if (net_ratelimit())
- printk(KERN_DEBUG "reasm_frag: payload len = %d\n", payload_len);
- IP6_INC_STATS_BH(Ip6ReasmFails);
- fq_free(fq);
- return NULL;
- }
+ /* Unfragmented part is taken from the first segment. */
+ unfrag_len = head->h.raw - (u8 *) (head->nh.ipv6h + 1);
+ payload_len = unfrag_len + fq->len;
- if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) {
- if (net_ratelimit())
- printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n");
- IP6_INC_STATS_BH(Ip6ReasmFails);
- fq_free(fq);
- return NULL;
- }
+ if (payload_len > 65535)
+ goto out_oversize;
+
+ if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL)
+ goto out_oom;
copy = unfrag_len + sizeof(struct ipv6hdr);
+ skb->mac.raw = skb->data;
skb->nh.ipv6h = (struct ipv6hdr *) skb->data;
skb->dev = fq->dev;
skb->protocol = __constant_htons(ETH_P_IPV6);
- skb->pkt_type = head->skb->pkt_type;
- memcpy(skb->cb, head->skb->cb, sizeof(skb->cb));
- skb->dst = dst_clone(head->skb->dst);
+ skb->pkt_type = head->pkt_type;
+ FRAG6_CB(skb)->h = FRAG6_CB(head)->h;
+ skb->dst = dst_clone(head->dst);
- memcpy(skb_put(skb, copy), head->skb->nh.ipv6h, copy);
+ memcpy(skb_put(skb, copy), head->nh.ipv6h, copy);
nhptr = skb->nh.raw + fq->nhoffset;
*nhptr = fq->nexthdr;
@@ -479,29 +556,73 @@ static u8* reasm_frag(struct frag_queue *fq, struct sk_buff **skb_in)
*skb_in = skb;
- /*
- * FIXME: If we don't have a checksum we ought to be able
- * to defragment and checksum in this pass. [AC]
- * Note that we don't really know yet whether the protocol
- * needs checksums at all. It might still be a good idea. -AK
- */
- for(fp = fq->fragments; fp; ) {
- struct ipv6_frag *back;
-
- memcpy(skb_put(skb, fp->len), (__u8*)(fp->fhdr + 1), fp->len);
- frag_kfree_skb(fp->skb);
- back = fp;
- fp=fp->next;
- frag_kfree_s(back, sizeof(*back));
+ for (fp = fq->fragments; fp; fp=fp->next)
+ memcpy(skb_put(skb, fp->len), fp->data, fp->len);
+
+ IP6_INC_STATS_BH(Ip6ReasmOKs);
+ return nhptr;
+
+out_oversize:
+ if (net_ratelimit())
+ printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len);
+ goto out_fail;
+out_oom:
+ if (net_ratelimit())
+ printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n");
+out_fail:
+ IP6_INC_STATS_BH(Ip6ReasmFails);
+ return NULL;
+}
+
+u8* ipv6_reassembly(struct sk_buff **skbp, __u8 *nhptr)
+{
+ struct sk_buff *skb = *skbp;
+ struct frag_hdr *fhdr = (struct frag_hdr *) (skb->h.raw);
+ struct frag_queue *fq;
+ struct ipv6hdr *hdr;
+
+ hdr = skb->nh.ipv6h;
+
+ IP6_INC_STATS_BH(Ip6ReasmReqds);
+
+ /* Jumbo payload inhibits frag. header */
+ if (hdr->payload_len==0) {
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw);
+ return NULL;
+ }
+ if ((u8 *)(fhdr+1) > skb->tail) {
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw);
+ return NULL;
}
- del_timer(&fq->timer);
- fq->prev->next = fq->next;
- fq->next->prev = fq->prev;
- fq->prev = fq->next = NULL;
+ if (!(fhdr->frag_off & __constant_htons(0xFFF9))) {
+ /* It is not a fragmented frame */
+ skb->h.raw += sizeof(struct frag_hdr);
+ IP6_INC_STATS_BH(Ip6ReasmOKs);
- frag_kfree_s(fq, sizeof(*fq));
+ return &fhdr->nexthdr;
+ }
- IP6_INC_STATS_BH(Ip6ReasmOKs);
- return nhptr;
+ if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh)
+ ip6_evictor();
+
+ if ((fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr)) != NULL) {
+ u8 *ret = NULL;
+
+ spin_lock(&fq->lock);
+
+ ip6_frag_queue(fq, skb, fhdr, nhptr);
+
+ if (fq->last_in == (FIRST_IN|LAST_IN) &&
+ fq->meat == fq->len)
+ ret = ip6_frag_reasm(fq, skbp);
+
+ spin_unlock(&fq->lock);
+ fq_put(fq);
+ return ret;
+ }
+
+ IP6_INC_STATS_BH(Ip6ReasmFails);
+ kfree_skb(skb);
+ return NULL;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index dc6020c33..dc5ddffd8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: route.c,v 1.45 2000/01/16 05:11:38 davem Exp $
+ * $Id: route.c,v 1.46 2000/07/07 22:40:35 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -769,10 +769,12 @@ int ip6_route_add(struct in6_rtmsg *rtmsg)
goto out;
if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
- rt->rt6i_nexthop = ndisc_get_neigh(dev, &rt->rt6i_gateway);
- err = -ENOMEM;
- if (rt->rt6i_nexthop == NULL)
+ rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
+ if (IS_ERR(rt->rt6i_nexthop)) {
+ err = PTR_ERR(rt->rt6i_nexthop);
+ rt->rt6i_nexthop = NULL;
goto out;
+ }
}
if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr))
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 114b59daa..c8a631f9f 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -6,7 +6,7 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * $Id: sit.c,v 1.39 2000/07/07 01:55:20 davem Exp $
+ * $Id: sit.c,v 1.41 2000/07/07 23:47:45 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -45,6 +45,7 @@
#include <net/udp.h>
#include <net/icmp.h>
#include <net/ipip.h>
+#include <net/inet_ecn.h>
/*
This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c
@@ -59,7 +60,7 @@ static int ipip6_fb_tunnel_init(struct net_device *dev);
static int ipip6_tunnel_init(struct net_device *dev);
static struct net_device ipip6_fb_tunnel_dev = {
- "", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip6_fb_tunnel_init,
+ "sit0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip6_fb_tunnel_init,
};
static struct ip_tunnel ipip6_fb_tunnel = {
@@ -174,10 +175,10 @@ struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create)
dev->priv = (void*)(dev+1);
nt = (struct ip_tunnel*)dev->priv;
nt->dev = dev;
- strcpy(dev->name, nt->parms.name);
dev->init = ipip6_tunnel_init;
dev->new_style = 1;
memcpy(&nt->parms, parms, sizeof(*parms));
+ strcpy(dev->name, nt->parms.name);
if (dev->name[0] == 0) {
int i;
for (i=1; i<100; i++) {
@@ -370,6 +371,13 @@ out:
#endif
}
+static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
+{
+ if (INET_ECN_is_ce(iph->tos) &&
+ INET_ECN_is_not_ce(ip6_get_dsfield(skb->nh.ipv6h)))
+ IP6_ECN_set_ce(skb->nh.ipv6h);
+}
+
int ipip6_rcv(struct sk_buff *skb, unsigned short len)
{
struct iphdr *iph;
@@ -394,6 +402,7 @@ int ipip6_rcv(struct sk_buff *skb, unsigned short len)
nf_conntrack_put(skb->nfct);
skb->nfct = NULL;
#endif
+ ipip6_ecn_decapsulate(iph, skb);
netif_rx(skb);
read_unlock(&ipip6_lock);
return 0;
@@ -431,6 +440,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
int mtu;
struct in6_addr *addr6;
int addr_type;
+ int err;
if (tunnel->recursion++) {
tunnel->stat.collisions++;
@@ -548,7 +558,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
iph->frag_off = 0;
iph->protocol = IPPROTO_IPV6;
- iph->tos = tos;
+ iph->tos = INET_ECN_encapsulate(tos, ip6_get_dsfield(iph6));
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
@@ -564,10 +574,17 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
skb->nfct = NULL;
#endif
+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ do_ip_send);
+ if(err < 0) {
+ if(net_ratelimit())
+ printk(KERN_ERR "ipip6_tunnel_xmit: ip_send() failed, err=%d\n", -err);
+ skb = NULL;
+ goto tx_error;
+ }
+
stats->tx_bytes += skb->len;
stats->tx_packets++;
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- do_ip_send);
tunnel->recursion--;
return 0;
@@ -576,7 +593,8 @@ tx_error_icmp:
dst_link_failure(skb);
tx_error:
stats->tx_errors++;
- dev_kfree_skb(skb);
+ if(skb)
+ dev_kfree_skb(skb);
tunnel->recursion--;
return 0;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4b3bf084b..f9f0c0dc9 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -7,7 +7,7 @@
*
* Based on linux/ipv4/udp.c
*
- * $Id: udp.c,v 1.53 2000/05/03 06:37:07 davem Exp $
+ * $Id: udp.c,v 1.55 2000/07/08 00:20:43 davem Exp $
*
* Fixes:
* Hideaki YOSHIFUJI : sin6_scope_id support
@@ -400,7 +400,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
if (err)
goto out_free;
- sk->stamp=skb->stamp;
+ sock_recv_timestamp(msg, sk, skb);
/* Copy the address. */
if (msg->msg_name) {
@@ -868,6 +868,8 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
fl.proto = IPPROTO_UDP;
fl.fl6_dst = daddr;
+ if (fl.fl6_src == NULL && !ipv6_addr_any(&np->saddr))
+ fl.fl6_src = &np->saddr;
fl.uli_u.ports.dport = udh.uh.dest;
fl.uli_u.ports.sport = udh.uh.source;
diff --git a/net/netsyms.c b/net/netsyms.c
index e1bfc3403..91c7a1074 100644
--- a/net/netsyms.c
+++ b/net/netsyms.c
@@ -69,17 +69,6 @@ extern int netdev_finish_unregister(struct net_device *dev);
#include <linux/rtnetlink.h>
-#if defined(CONFIG_ULTRA) || defined(CONFIG_WD80x3) || \
- defined(CONFIG_EL2) || defined(CONFIG_NE2000) || \
- defined(CONFIG_E2100) || defined(CONFIG_HPLAN_PLUS) || \
- defined(CONFIG_HPLAN) || defined(CONFIG_AC3200) || \
- defined(CONFIG_ES3210) || defined(CONFIG_ULTRA32) || \
- defined(CONFIG_LNE390) || defined(CONFIG_NE3210) || \
- defined(CONFIG_NE2K_PCI) || defined(CONFIG_APNE) || \
- defined(CONFIG_DAYNAPORT)
-#include "../drivers/net/8390.h"
-#endif
-
#ifdef CONFIG_IPX_MODULE
extern struct datalink_proto *make_EII_client(void);
extern struct datalink_proto *make_8023_client(void);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2955a04f6..3b2df4f55 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -5,7 +5,7 @@
*
* PACKET - implements raw packet sockets.
*
- * Version: $Id: af_packet.c,v 1.34 2000/04/25 04:13:35 davem Exp $
+ * Version: $Id: af_packet.c,v 1.36 2000/07/08 00:20:43 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -1053,7 +1053,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len,
err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
if (err)
goto out_free;
- sk->stamp=skb->stamp;
+
+ sock_recv_timestamp(msg, sk, skb);
if (msg->msg_name)
memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
@@ -1392,6 +1393,23 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg
switch(cmd)
{
+ case SIOCOUTQ:
+ {
+ int amount = atomic_read(&sk->wmem_alloc);
+ return put_user(amount, (int *)arg);
+ }
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_bh(&sk->receive_queue.lock);
+ skb = skb_peek(&sk->receive_queue);
+ if (skb)
+ amount = skb->len;
+ spin_unlock_bh(&sk->receive_queue.lock);
+ return put_user(amount, (int *)arg);
+ }
case FIOSETOWN:
case SIOCSPGRP:
err = get_user(pid, (int *) arg);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index d8c117247..1a4a501c9 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -39,6 +39,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
#define RED_ECN_ECT 0x02
#define RED_ECN_CE 0x01
@@ -170,14 +171,9 @@ static int red_ecn_mark(struct sk_buff *skb)
if (!(tos & RED_ECN_ECT))
return 0;
- if (!(tos & RED_ECN_CE)) {
- u32 check = skb->nh.iph->check;
+ if (!(tos & RED_ECN_CE))
+ IP_ECN_set_ce(skb->nh.iph);
- check += __constant_htons(0xFFFE);
- skb->nh.iph->check = check + (check>>16);
-
- skb->nh.iph->tos = tos | RED_ECN_CE;
- }
return 1;
}
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 7ea61ce5c..1b7119ffd 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -242,9 +242,9 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *
memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
atomic_inc(&n->refcnt);
} else {
- n = __neigh_lookup(mn->tbl, mn->primary_key, dev, 1);
- if (n == NULL)
- return -ENOBUFS;
+ n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
+ if (IS_ERR(n))
+ return PTR_ERR(n);
}
if (neigh_event_send(n, skb_res) == 0) {
int err;