author     Ralf Baechle <ralf@linux-mips.org>    1998-03-18 17:17:51 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    1998-03-18 17:17:51 +0000
commit     f1382dc4850bb459d24a81c6cb0ef93ea7bd4a79 (patch)
tree       225271a3d5dcd4e9dea5ee393556abd754c964b1 /net
parent     135b00fc2e90e605ac2a96b20b0ebd93851a3f89 (diff)
o Merge with Linux 2.1.90.
o Divide L1 cache sizes by 1024 before printing, makes the numbers a bit more credible ...
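(The cache-size change lands outside net/, so it does not appear in the diff below. A minimal sketch of the idea only; the variable names are hypothetical, not the actual arch code:)

	/* Sketch: report cache sizes in kilobytes, so a 32768-byte
	 * cache prints as "32" rather than "32768". */
	printk("Primary instruction cache %dkb, linesize %d bytes\n",
	       icache_size / 1024, ic_lsize);
	printk("Primary data cache %dkb, linesize %d bytes\n",
	       dcache_size / 1024, dc_lsize);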
Diffstat (limited to 'net')
-rw-r--r--  net/802/sysctl_net_802.c | 1
-rw-r--r--  net/802/tr.c | 19
-rw-r--r--  net/appletalk/ddp.c | 59
-rw-r--r--  net/ax25/af_ax25.c | 6
-rw-r--r--  net/core/dev.c | 83
-rw-r--r--  net/core/dst.c | 2
-rw-r--r--  net/core/iovec.c | 38
-rw-r--r--  net/core/neighbour.c | 29
-rw-r--r--  net/core/sock.c | 178
-rw-r--r--  net/core/sysctl_net_core.c | 4
-rw-r--r--  net/ipv4/af_inet.c | 15
-rw-r--r--  net/ipv4/arp.c | 8
-rw-r--r--  net/ipv4/devinet.c | 38
-rw-r--r--  net/ipv4/fib_frontend.c | 24
-rw-r--r--  net/ipv4/fib_hash.c | 2
-rw-r--r--  net/ipv4/fib_rules.c | 2
-rw-r--r--  net/ipv4/fib_semantics.c | 2
-rw-r--r--  net/ipv4/icmp.c | 8
-rw-r--r--  net/ipv4/igmp.c | 2
-rw-r--r--  net/ipv4/ip_forward.c | 2
-rw-r--r--  net/ipv4/ip_fragment.c | 2
-rw-r--r--  net/ipv4/ip_fw.c | 35
-rw-r--r--  net/ipv4/ip_input.c | 2
-rw-r--r--  net/ipv4/ip_masq_mod.c | 10
-rw-r--r--  net/ipv4/ip_masq_raudio.c | 2
-rw-r--r--  net/ipv4/ip_nat_dumb.c | 34
-rw-r--r--  net/ipv4/ip_options.c | 2
-rw-r--r--  net/ipv4/ip_output.c | 2
-rw-r--r--  net/ipv4/ip_sockglue.c | 2
-rw-r--r--  net/ipv4/ipconfig.c | 2
-rw-r--r--  net/ipv4/ipip.c | 2
-rw-r--r--  net/ipv4/ipmr.c | 16
-rw-r--r--  net/ipv4/proc.c | 58
-rw-r--r--  net/ipv4/rarp.c | 2
-rw-r--r--  net/ipv4/raw.c | 2
-rw-r--r--  net/ipv4/route.c | 348
-rw-r--r--  net/ipv4/syncookies.c | 2
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 15
-rw-r--r--  net/ipv4/tcp.c | 440
-rw-r--r--  net/ipv4/tcp_input.c | 852
-rw-r--r--  net/ipv4/tcp_ipv4.c | 784
-rw-r--r--  net/ipv4/tcp_output.c | 449
-rw-r--r--  net/ipv4/tcp_timer.c | 75
-rw-r--r--  net/ipv4/timer.c | 63
-rw-r--r--  net/ipv4/udp.c | 16
-rw-r--r--  net/ipv6/addrconf.c | 4
-rw-r--r--  net/ipv6/af_inet6.c | 7
-rw-r--r--  net/ipv6/exthdrs.c | 2
-rw-r--r--  net/ipv6/icmp.c | 2
-rw-r--r--  net/ipv6/ip6_fib.c | 2
-rw-r--r--  net/ipv6/ip6_fw.c | 2
-rw-r--r--  net/ipv6/ip6_input.c | 2
-rw-r--r--  net/ipv6/ip6_output.c | 2
-rw-r--r--  net/ipv6/ipv6_sockglue.c | 2
-rw-r--r--  net/ipv6/ndisc.c | 2
-rw-r--r--  net/ipv6/proc.c | 59
-rw-r--r--  net/ipv6/raw.c | 2
-rw-r--r--  net/ipv6/reassembly.c | 2
-rw-r--r--  net/ipv6/route.c | 43
-rw-r--r--  net/ipv6/sit.c | 2
-rw-r--r--  net/ipv6/tcp_ipv6.c | 423
-rw-r--r--  net/ipv6/udp.c | 39
-rw-r--r--  net/ipx/af_ipx.c | 2
-rw-r--r--  net/netbeui/af_netbeui.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 2
-rw-r--r--  net/netrom/af_netrom.c | 4
-rw-r--r--  net/netsyms.c | 10
-rw-r--r--  net/packet/af_packet.c | 10
-rw-r--r--  net/rose/af_rose.c | 4
-rw-r--r--  net/socket.c | 10
-rw-r--r--  net/unix/af_unix.c | 6
-rw-r--r--  net/x25/af_x25.c | 2
72 files changed, 2278 insertions, 2110 deletions
diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c
index f97141d3c..19cd47af5 100644
--- a/net/802/sysctl_net_802.c
+++ b/net/802/sysctl_net_802.c
@@ -23,5 +23,6 @@ extern int sysctl_tr_rif_timeout;
ctl_table tr_table[] = {
{NET_TR_RIF_TIMEOUT, "rif_timeout", &sysctl_tr_rif_timeout, sizeof(int),
0644, NULL, &proc_dointvec},
+ {0}
};
#endif
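(The added {0} is the sentinel entry that terminates every ctl_table array: the sysctl registration code walks the table until it finds an entry whose ctl_name is zero, so a missing sentinel sends it off the end of the array. A sketch of the convention, with an illustrative name and ctl number:)

	static int sysctl_example;

	ctl_table example_table[] = {
		{1, "example", &sysctl_example, sizeof(int),
		 0644, NULL, &proc_dointvec},
		{0}		/* sentinel: stops the table walk */
	};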
diff --git a/net/802/tr.c b/net/802/tr.c
index bf6cd83d7..3550b81ed 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -510,10 +510,18 @@ int rif_get_info(char *buffer,char **start, off_t offset, int length, int dummy)
* Called during bootup. We don't actually have to initialise
* too much for this.
*/
-
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry tr_rif_proc = {
+ PROC_NET_TR_RIF, 6, "tr_rif",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ rif_get_info
+};
+#endif
+
__initfunc(void rif_init(struct net_proto *unused))
{
-
rif_timer.expires = RIF_TIMEOUT;
rif_timer.data = 0L;
rif_timer.function = rif_check_expire;
@@ -521,11 +529,6 @@ __initfunc(void rif_init(struct net_proto *unused))
add_timer(&rif_timer);
#ifdef CONFIG_PROC_FS
- proc_net_register(&(struct proc_dir_entry) {
- PROC_NET_TR_RIF, 6, "tr_rif",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- rif_get_info
- });
+ proc_net_register(&tr_rif_proc);
#endif
}
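(The tr.c rewrite is more than cosmetic. proc_net_register() links the proc_dir_entry it is handed into the /proc tree without copying it, and a compound literal inside a function has automatic storage, so the old code registered a pointer into rif_init()'s stack frame. Hoisting the entry to a file-scope static, as the hunk does, gives it kernel-lifetime storage. In miniature, assuming the 2.1-era proc API:)

	/* Broken: the registered pointer refers to stack storage that
	 * is dead as soon as the init function returns. */
	proc_net_register(&(struct proc_dir_entry) { /* ... fields ... */ });

	/* Fixed: static storage outlives the init function. */
	static struct proc_dir_entry entry = { /* ... fields ... */ };
	proc_net_register(&entry);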
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 8b724361d..c56adc148 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -990,7 +990,7 @@ static int atalk_create(struct socket *sock, int protocol)
{
struct sock *sk;
- sk = sk_alloc(AF_APPLETALK, GFP_KERNEL);
+ sk = sk_alloc(AF_APPLETALK, GFP_KERNEL, 1);
if(sk == NULL)
return (-ENOMEM);
@@ -1404,6 +1404,31 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type
return (0);
}
+#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE)
+ /*
+ * Check if IP-over-DDP
+ */
+ if(skb->data[12] == 22)
+ {
+ struct device *dev;
+
+ /* This needs to be able to handle ipddp"N" devices */
+ if((dev = dev_get("ipddp0")) == NULL)
+ return (-ENODEV);
+
+ skb->protocol = htons(ETH_P_IP);
+ skb_pull(skb, 13);
+ skb->dev = dev;
+ skb->h.raw = skb->data;
+
+ ((struct net_device_stats *)dev->priv)->rx_packets++;
+ ((struct net_device_stats *)dev->priv)->rx_bytes += skb->len+13;
+ netif_rx(skb); /* Send the SKB up to a higher place. */
+
+ return (0);
+ }
+#endif
+
/*
* Which socket - atalk_search_socket() looks for a *full match*
* of the <net,node,port> tuple.
@@ -1420,38 +1445,6 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type
return (0);
}
-#ifdef CONFIG_IPDDP
- /*
- * Check if IP-over-DDP
- */
- if(skb->data[12] == 22)
- {
- struct device *dev;
- struct net_device_stats *estats;
-
- if((dev = dev_get("ipddp0")) == NULL)
- return (-ENODEV);
-
- estats = (struct net_device_stats *) dev->priv;
- skb->protocol = htons(ETH_P_IP);
- skb_pull(skb, 13);
- skb->dev = dev;
- skb->h.raw = skb->data;
- skb->nh.raw = skb->data;
-
- /* printk("passing up ipddp, 0x%02x better be 45\n",skb->data[0]);
- * printk("tot_len %d, skb->len %d\n",
- * ntohs(skb->h.iph->tot_len),skb->len);
- */
-
- estats->rx_packets++;
- estats->rx_bytes += skb->len + 13;
- netif_rx(skb); /* Send the SKB up to a higher place. */
-
- return (0);
- }
-#endif /* CONFIG_IPDDP */
-
/*
* Queue packet (standard)
*/
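(Several hunks in this merge, ddp.c here and af_ax25.c and af_inet.c below, pass a new third argument to sk_alloc(). The sock.c hunk further down shows the other side: the flag, named zero_it there, decides whether the freshly allocated struct sock is memset to zero and has its family field initialised. Every caller converted in this patch keeps the old zeroing behaviour:)

	struct sock *sk;

	/* zero_it == 1 keeps the historical semantics: a zeroed
	 * struct sock with sk->family already filled in. */
	sk = sk_alloc(AF_APPLETALK, GFP_KERNEL, 1);
	if (sk == NULL)
		return -ENOMEM;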
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 3a4196b3f..107f481d6 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -828,7 +828,7 @@ int ax25_create(struct socket *sock, int protocol)
return -ESOCKTNOSUPPORT;
}
- if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC)) == NULL)
+ if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC, 1)) == NULL)
return -ENOMEM;
if ((ax25 = ax25_create_cb()) == NULL) {
@@ -854,7 +854,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
struct sock *sk;
ax25_cb *ax25;
- if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC)) == NULL)
+ if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC, 1)) == NULL)
return NULL;
if ((ax25 = ax25_create_cb()) == NULL) {
@@ -1237,6 +1237,8 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
newsk = skb->sk;
newsk->pair = NULL;
+ newsk->socket = newsock;
+ newsk->sleep = &newsock->wait;
sti();
/* Now attach up the new socket */
diff --git a/net/core/dev.c b/net/core/dev.c
index b06d0053e..36efa363b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -50,6 +50,7 @@
* is no device open function.
* Andi Kleen : Fix error reporting for SIOCGIFCONF
* Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
+ * Cyrus Durgin : Cleaned for KMOD
*
*/
@@ -81,7 +82,7 @@
#include <net/pkt_sched.h>
#include <net/profile.h>
#include <linux/init.h>
-#include <linux/kerneld.h>
+#include <linux/kmod.h>
#ifdef CONFIG_NET_RADIO
#include <linux/wireless.h>
#endif /* CONFIG_NET_RADIO */
@@ -316,7 +317,7 @@ struct device *dev_alloc(const char *name, int *err)
* Find and possibly load an interface.
*/
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
void dev_load(const char *name)
{
@@ -398,20 +399,24 @@ int dev_open(struct device *dev)
}
#ifdef CONFIG_NET_FASTROUTE
-void dev_clear_fastroute(struct device *dev)
+
+static __inline__ void dev_do_clear_fastroute(struct device *dev)
{
- int i;
+ if (dev->accept_fastpath) {
+ int i;
- if (dev) {
for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++)
dst_release(xchg(dev->fastpath+i, NULL));
+ }
+}
+
+void dev_clear_fastroute(struct device *dev)
+{
+ if (dev) {
+ dev_do_clear_fastroute(dev);
} else {
- for (dev = dev_base; dev; dev = dev->next) {
- if (dev->accept_fastpath) {
- for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++)
- dst_release(xchg(dev->fastpath+i, NULL));
- }
- }
+ for (dev = dev_base; dev; dev = dev->next)
+ dev_do_clear_fastroute(dev);
}
}
#endif
@@ -643,7 +648,7 @@ int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev))
set_bit(bit, &netdev_fc_mask);
clear_bit(bit, &netdev_fc_xoff);
}
- sti();
+ restore_flags(flags);
return bit;
}
@@ -659,7 +664,7 @@ void netdev_unregister_fc(int bit)
clear_bit(bit, &netdev_fc_mask);
clear_bit(bit, &netdev_fc_xoff);
}
- sti();
+ restore_flags(flags);
}
static void netdev_wakeup(void)
@@ -978,39 +983,6 @@ int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
/*
- This ioctl is wrong by design. It really existed in some
- old SYSV systems, only was named SIOCGIFNUM.
- In multiprotocol environment it is just useless.
- Well, SIOCGIFCONF is wrong too, but we have to preserve
- it by compatibility reasons.
-
- If someone wants to achieve the same effect, please, use undocumented
- feature of SIOCGIFCONF: it returns buffer length, if buffer
- is not supplied.
-
- Let's remove it, until someone started to use it. --ANK
-
- In any case, if someone cannot live without it, it should
- be renamed to SIOCGIFNUM.
- */
-
-
-/*
- * Count the installed interfaces (SIOCGIFCOUNT)
- */
-
-static int dev_ifcount(unsigned int *arg)
-{
- struct device *dev;
- unsigned int count = 0;
-
- for (dev = dev_base; dev != NULL; dev = dev->next)
- count++;
-
- return put_user(count, arg);
-}
-
-/*
* Map an interface index to its name (SIOCGIFNAME)
*/
@@ -1022,6 +994,11 @@ static int dev_ifcount(unsigned int *arg)
* Besides that, it is pretty silly to put "drawing" facility
* to kernel, it is useful only to print ifindices
* in readable form, is not it? --ANK
+ *
+ * We need this ioctl for efficient implementation of the
+ * if_indextoname() function required by the IPv6 API. Without
+ * it, we would have to search all the interfaces to find a
+ * match. --pb
*/
static int dev_ifname(struct ifreq *arg)
@@ -1120,20 +1097,21 @@ static int sprintf_stats(char *buffer, struct device *dev)
int size;
if (stats)
- size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %4lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu %4lu\n",
- dev->name,
+ size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu %4lu\n",
+ dev->name,
stats->rx_bytes,
stats->rx_packets, stats->rx_errors,
stats->rx_dropped + stats->rx_missed_errors,
stats->rx_fifo_errors,
stats->rx_length_errors + stats->rx_over_errors
+ stats->rx_crc_errors + stats->rx_frame_errors,
+ stats->rx_compressed, stats->multicast,
stats->tx_bytes,
stats->tx_packets, stats->tx_errors, stats->tx_dropped,
stats->tx_fifo_errors, stats->collisions,
stats->tx_carrier_errors + stats->tx_aborted_errors
+ stats->tx_window_errors + stats->tx_heartbeat_errors,
- stats->multicast);
+ stats->tx_compressed);
else
size = sprintf(buffer, "%6s: No statistics available.\n", dev->name);
@@ -1156,8 +1134,8 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy
size = sprintf(buffer,
- "Inter-| Receive | Transmit\n"
- " face |bytes packets errs drop fifo frame|bytes packets errs drop fifo colls carrier multicast\n");
+ "Inter-| Receive | Transmit\n"
+ " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n");
pos+=size;
len+=size;
@@ -1555,9 +1533,6 @@ int dev_ioctl(unsigned int cmd, void *arg)
rtnl_shunlock();
return ret;
}
- if (cmd == SIOCGIFCOUNT) {
- return dev_ifcount((unsigned int*)arg);
- }
if (cmd == SIOCGIFNAME) {
return dev_ifname((struct ifreq *)arg);
}
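(The netdev_register_fc()/netdev_unregister_fc() hunks swap a bare sti() for restore_flags(flags). The difference matters when a caller enters with interrupts already disabled: sti() would re-enable them unconditionally, while the save/restore pairing puts the flags register back exactly as it was found. The idiom these functions now follow:)

	unsigned long flags;

	save_flags(flags);	/* capture current interrupt state */
	cli();			/* disable local interrupts */

	/* ... manipulate netdev_fc_mask / netdev_fc_xoff ... */

	restore_flags(flags);	/* restore prior state, not a blanket sti() */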
diff --git a/net/core/dst.c b/net/core/dst.c
index e94ef2967..4cad680c2 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -101,7 +101,7 @@ void * dst_alloc(int size, struct dst_ops * ops)
void __dst_free(struct dst_entry * dst)
{
start_bh_atomic();
- dst->obsolete = 1;
+ dst->obsolete = 2;
dst->next = dst_garbage_list;
dst_garbage_list = dst;
if (dst_gc_timer_inc > DST_GC_INC) {
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 18a9a3b5b..9e8873646 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -80,18 +80,21 @@ out_free:
/*
* Copy kernel to iovec.
+ *
+ * Note: this modifies the original iovec.
*/
int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
{
- int err = -EFAULT;
+ int err;
while(len>0)
{
if(iov->iov_len)
{
int copy = min(iov->iov_len, len);
- if (copy_to_user(iov->iov_base, kdata, copy))
+ err = copy_to_user(iov->iov_base, kdata, copy);
+ if (err)
goto out;
kdata+=copy;
len-=copy;
@@ -107,6 +110,8 @@ out:
/*
* Copy iovec to kernel.
+ *
+ * Note: this modifies the original iovec.
*/
int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
@@ -187,9 +192,8 @@ out:
* call to this function will be unaligned also.
*/
-int csum_partial_copy_fromiovecend(unsigned char *kdata,
- struct iovec *iov, int offset,
- unsigned int len, int *csump)
+int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
+ int offset, unsigned int len, int *csump)
{
int partial_cnt = 0;
int err = 0;
@@ -246,9 +250,9 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata,
if (copy_from_user(kdata, base, copy))
goto out_fault;
kdata += copy;
- base += copy;
+ base += copy;
partial_cnt += copy;
- len -= copy;
+ len -= copy;
iov++;
if (len)
continue;
@@ -260,9 +264,9 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata,
goto out_fault;
csum = csum_partial(kdata - partial_cnt, 4, csum);
kdata += par_len;
- base += par_len;
- copy -= par_len;
- len -= par_len;
+ base += par_len;
+ copy -= par_len;
+ len -= par_len;
partial_cnt = 0;
}
@@ -278,16 +282,12 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata,
}
}
- /* Why do we want to break?? There may be more to copy ... */
- if (copy == 0) {
-if (len > partial_cnt)
-printk("csum_iovec: early break? len=%d, partial=%d\n", len, partial_cnt);
- break;
+ if (copy) {
+ csum = csum_and_copy_from_user(base, kdata, copy,
+ csum, &err);
+ if (err)
+ goto out;
}
-
- csum = csum_and_copy_from_user(base, kdata, copy, csum, &err);
- if (err)
- goto out;
len -= copy + partial_cnt;
kdata += copy + partial_cnt;
iov++;
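(The "this modifies the original iovec" notes added above are a caller-facing warning: memcpy_toiovec() and memcpy_fromiovec() advance iov_base and shrink iov_len as they copy, so the same iovec cannot simply be replayed for a second pass. A hypothetical caller, to make the side effect concrete:)

	struct iovec iov;

	iov.iov_base = user_buf;	/* hypothetical user-space buffer */
	iov.iov_len  = 128;

	memcpy_toiovec(&iov, kdata, 64);
	/* iov.iov_base has now advanced by 64 bytes and iov.iov_len
	 * is down to 64; restarting "from the beginning" requires the
	 * caller to have saved a copy of the original iovec. */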
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 3de3743e0..a8d72604d 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -153,12 +153,14 @@ int neigh_ifdown(struct neigh_table *tbl, struct device *dev)
static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat)
{
struct neighbour *n;
+ unsigned long now = jiffies;
if (tbl->entries > tbl->gc_thresh1) {
if (creat < 0)
return NULL;
- if (tbl->entries > tbl->gc_thresh2 ||
- jiffies - tbl->last_flush > 5*HZ) {
+ if (tbl->entries > tbl->gc_thresh3 ||
+ (tbl->entries > tbl->gc_thresh2 &&
+ now - tbl->last_flush > 5*HZ)) {
if (neigh_forced_gc(tbl) == 0 &&
tbl->entries > tbl->gc_thresh3)
return NULL;
@@ -172,7 +174,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat)
memset(n, 0, tbl->entry_size);
skb_queue_head_init(&n->arp_queue);
- n->updated = n->used = jiffies;
+ n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
n->parms = &tbl->parms;
@@ -666,8 +668,18 @@ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int
neigh_suspect(neigh);
if (!(old&NUD_VALID)) {
struct sk_buff *skb;
- while ((skb=__skb_dequeue(&neigh->arp_queue)) != NULL)
- neigh->output(skb);
+
+ /* Again: avoid dead loop if something went wrong */
+
+ while (neigh->nud_state&NUD_VALID &&
+ (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) {
+ struct neighbour *n1 = neigh;
+ /* On shaper/eql skb->dst->neighbour != neigh :( */
+ if (skb->dst && skb->dst->neighbour)
+ n1 = skb->dst->neighbour;
+ n1->output(skb);
+ }
+ skb_queue_purge(&neigh->arp_queue);
}
return 0;
}
@@ -1228,7 +1240,7 @@ struct neigh_sysctl_table
&proc_dointvec},
{0}},
- {{1, "default", NULL, 0, 0555, NULL},{0}},
+ {{NET_PROTO_CONF_DEFAULT, "default", NULL, 0, 0555, NULL},{0}},
{{0, "neigh", NULL, 0, 0555, NULL},{0}},
{{0, NULL, NULL, 0, 0555, NULL},{0}},
{{CTL_NET, "net", NULL, 0, 0555, NULL},{0}}
@@ -1243,10 +1255,11 @@ int neigh_sysctl_register(struct device *dev, struct neigh_parms *p,
if (t == NULL)
return -ENOBUFS;
memcpy(t, &neigh_sysctl_template, sizeof(*t));
+ t->neigh_vars[0].data = &p->mcast_probes;
t->neigh_vars[1].data = &p->ucast_probes;
t->neigh_vars[2].data = &p->app_probes;
t->neigh_vars[3].data = &p->retrans_time;
- t->neigh_vars[4].data = &p->reachable_time;
+ t->neigh_vars[4].data = &p->base_reachable_time;
t->neigh_vars[5].data = &p->delay_probe_time;
t->neigh_vars[6].data = &p->gc_staletime;
t->neigh_vars[7].data = &p->queue_len;
@@ -1256,7 +1269,7 @@ int neigh_sysctl_register(struct device *dev, struct neigh_parms *p,
t->neigh_vars[11].data = &p->locktime;
if (dev) {
t->neigh_dev[0].procname = dev->name;
- t->neigh_dev[0].ctl_name = dev->ifindex+1;
+ t->neigh_dev[0].ctl_name = dev->ifindex;
memset(&t->neigh_vars[12], 0, sizeof(ctl_table));
} else {
t->neigh_vars[12].data = (&p->locktime) + 1;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6da5f5a0d..f940e5a80 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -137,6 +137,8 @@ __u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;
int sysctl_core_destroy_delay = SOCK_DESTROY_TIME;
+/* Maximal space eaten by iovec (still not made (2.1.88)!) plus some space */
+int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
/*
* This is meant for all protocols to use and covers goings on
@@ -472,11 +474,11 @@ static kmem_cache_t *sk_cachep;
* usage.
*/
-struct sock *sk_alloc(int family, int priority)
+struct sock *sk_alloc(int family, int priority, int zero_it)
{
struct sock *sk = kmem_cache_alloc(sk_cachep, priority);
- if(sk) {
+ if(sk && zero_it) {
memset(sk, 0, sizeof(struct sock));
sk->family = family;
}
@@ -561,34 +563,22 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
void *sock_kmalloc(struct sock *sk, int size, int priority)
{
void *mem = NULL;
- /* Always use wmem.. */
- if (atomic_read(&sk->wmem_alloc)+size < sk->sndbuf) {
+ if (atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {
/* First do the add, to avoid the race if kmalloc
* might sleep.
*/
- atomic_add(size, &sk->wmem_alloc);
+ atomic_add(size, &sk->omem_alloc);
mem = kmalloc(size, priority);
- if (mem)
- return mem;
- atomic_sub(size, &sk->wmem_alloc);
}
return mem;
}
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
-#if 1 /* Debug */
- if (atomic_read(&sk->wmem_alloc) < size) {
- printk(KERN_DEBUG "sock_kfree_s: mem not accounted.\n");
- return;
- }
-#endif
kfree_s(mem, size);
- atomic_sub(size, &sk->wmem_alloc);
- sk->write_space(sk);
+ atomic_sub(size, &sk->omem_alloc);
}
-
/* FIXME: this is insane. We are trying suppose to be controlling how
* how much space we have for data bytes, not packet headers.
* This really points out that we need a better system for doing the
@@ -633,6 +623,30 @@ unsigned long sock_wspace(struct sock *sk)
return(0);
}
+/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
+ I think, these locks should be removed for datagram sockets.
+ */
+static void sock_wait_for_wmem(struct sock * sk)
+{
+ struct wait_queue wait = { current, NULL };
+
+ sk->socket->flags &= ~SO_NOSPACE;
+ add_wait_queue(sk->sleep, &wait);
+ for (;;) {
+ if (signal_pending(current))
+ break;
+ current->state = TASK_INTERRUPTIBLE;
+ if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)
+ break;
+ if (sk->shutdown & SEND_SHUTDOWN)
+ break;
+ if (sk->err)
+ break;
+ schedule();
+ }
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+}
/*
@@ -641,94 +655,78 @@ unsigned long sock_wspace(struct sock *sk)
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigned long fallback, int noblock, int *errcode)
{
+ int err;
struct sk_buff *skb;
- do
- {
- if(sk->err!=0)
- {
- *errcode=xchg(&sk->err,0);
- return NULL;
- }
-
- if(sk->shutdown&SEND_SHUTDOWN)
- {
- /*
- * FIXME: Check 1003.1g should we deliver
- * a signal here ???
- */
- *errcode=-EPIPE;
- return NULL;
- }
-
- if(!fallback)
+ do {
+ if ((err = xchg(&sk->err,0)) != 0)
+ goto failure;
+
+ /*
+ * FIXME: Check 1003.1g should we deliver
+ * a signal here ???
+ *
+ * Alan, could we solve this question once and forever?
+ *
+ * I believe, datagram sockets should never
+ * generate SIGPIPE. Moreover, I DO think that
+ * TCP is allowed to generate it only on write()
+ * call, but never on send/sendto/sendmsg.
+ * (btw, Solaris generates it even on read() :-))
+ *
+ * The reason is that SIGPIPE is global flag,
+ * so that library function using sockets (f.e. syslog()),
+ * must save/disable it on entry and restore on exit.
+ * As result, signal arriving for another thread will
+ * be lost. Generation it on write() is still necessary
+ * because a lot of stupid programs never check write()
+ * return value.
+ *
+ * Seems, SIGPIPE is very bad idea, sort of gets().
+ * At least, we could have an option disabling
+ * this behaviour on per-socket and/or per-message base.
+ * BTW it is very easy - MSG_SIGPIPE flag, which
+ * always set by read/write and checked here.
+ * --ANK
+ */
+
+ err = -EPIPE;
+ if (sk->shutdown&SEND_SHUTDOWN)
+ goto failure;
+
+ if (!fallback)
skb = sock_wmalloc(sk, size, 0, sk->allocation);
- else
- {
+ else {
/* The buffer get won't block, or use the atomic queue. It does
produce annoying no free page messages still.... */
skb = sock_wmalloc(sk, size, 0 , GFP_BUFFER);
- if(!skb)
+ if (!skb)
skb=sock_wmalloc(sk, fallback, 0, sk->allocation);
}
-
+
/*
* This means we have too many buffers for this socket already.
*/
-
- if(skb==NULL)
- {
- unsigned long tmp;
+ /* The following code is stolen "as is" from tcp.c */
+
+ if (skb==NULL) {
sk->socket->flags |= SO_NOSPACE;
- if(noblock)
- {
- *errcode=-EAGAIN;
- return NULL;
- }
- if(sk->shutdown&SEND_SHUTDOWN)
- {
- *errcode=-EPIPE;
- return NULL;
- }
- tmp = atomic_read(&sk->wmem_alloc);
- cli();
- if(sk->shutdown&SEND_SHUTDOWN)
- {
- sti();
- *errcode=-EPIPE;
- return NULL;
- }
-
-#if 1
- if( tmp <= atomic_read(&sk->wmem_alloc))
-#else
- /* ANK: Line above seems either incorrect
- * or useless. sk->wmem_alloc has a tiny chance to change
- * between tmp = sk->w... and cli(),
- * but it might(?) change earlier. In real life
- * it does not (I never seen the message).
- * In any case I'd delete this check at all, or
- * change it to:
- */
- if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf)
-#endif
- {
- sk->socket->flags &= ~SO_NOSPACE;
- interruptible_sleep_on(sk->sleep);
- if (signal_pending(current))
- {
- sti();
- *errcode = -ERESTARTSYS;
- return NULL;
- }
- }
- sti();
+ err = -EAGAIN;
+ if (noblock)
+ goto failure;
+ err = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto failure;
+ sock_wait_for_wmem(sk);
}
- }
- while(skb==NULL);
-
+ } while (skb==NULL);
+
return skb;
+
+failure:
+ *errcode = err;
+ return NULL;
}
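(The sock_alloc_send_skb() rewrite replaces the old cli()/sti() plus interruptible_sleep_on() dance with the sock_wait_for_wmem() helper introduced above, which follows the standard wait-queue pattern: enqueue, mark TASK_INTERRUPTIBLE, re-test the wakeup conditions, and only then schedule(). Reduced to its skeleton, with an illustrative predicate name:)

	struct wait_queue wait = { current, NULL };

	add_wait_queue(sk->sleep, &wait);
	for (;;) {
		if (signal_pending(current))
			break;
		current->state = TASK_INTERRUPTIBLE;
		if (write_space_available(sk))	/* illustrative test */
			break;
		schedule();
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);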
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 1da2cc152..47c85d006 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -22,6 +22,7 @@ extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
extern int sysctl_core_destroy_delay;
+extern int sysctl_optmem_max;
ctl_table core_table[] = {
{NET_CORE_WMEM_MAX, "wmem_max",
@@ -53,6 +54,9 @@ ctl_table core_table[] = {
{NET_CORE_MSG_BURST, "message_burst",
&net_msg_burst, sizeof(int), 0644, NULL,
&proc_dointvec_jiffies},
+ {NET_CORE_OPTMEM_MAX, "optmem_max",
+ &sysctl_optmem_max, sizeof(int), 0644, NULL,
+ &proc_dointvec},
{ 0 }
};
#endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 584ad8c7a..ef1c44620 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -5,7 +5,7 @@
*
* AF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.5 1997/12/16 05:37:33 ralf Exp $
+ * Version: $Id: af_inet.c,v 1.6 1998/03/17 22:18:20 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -52,6 +52,7 @@
* Willy Konynenberg : Transparent proxying support.
* David S. Miller : New socket lookup architecture.
* Some other random speedups.
+ * Cyrus Durgin : Cleaned up file for kmod hacks.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -107,8 +108,8 @@
#ifdef CONFIG_BRIDGE
#include <net/br.h>
#endif
-#ifdef CONFIG_KERNELD
-#include <linux/kerneld.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
#endif
#ifdef CONFIG_NET_RADIO
#include <linux/wireless.h>
@@ -327,7 +328,7 @@ static int inet_create(struct socket *sock, int protocol)
static int warned;
if (net_families[AF_PACKET]==NULL)
{
-#if defined(CONFIG_KERNELD) && defined(CONFIG_PACKET_MODULE)
+#if defined(CONFIG_KMOD) && defined(CONFIG_PACKET_MODULE)
char module_name[30];
sprintf(module_name,"net-pf-%d", AF_PACKET);
request_module(module_name);
@@ -341,7 +342,7 @@ static int inet_create(struct socket *sock, int protocol)
}
sock->state = SS_UNCONNECTED;
- sk = sk_alloc(AF_INET, GFP_KERNEL);
+ sk = sk_alloc(AF_INET, GFP_KERNEL, 1);
if (sk == NULL)
goto do_oom;
@@ -894,7 +895,7 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCDRARP:
case SIOCGRARP:
case SIOCSRARP:
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
if (rarp_ioctl_hook == NULL)
request_module("rarp");
#endif
@@ -928,7 +929,7 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
#ifdef CONFIG_DLCI_MODULE
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
if (dlci_ioctl_hook == NULL)
request_module("dlci");
#endif
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 94ae4263e..dd7ce9e0f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,6 +1,6 @@
/* linux/net/inet/arp.c
*
- * Version: $Id: arp.c,v 1.4 1998/03/03 01:23:36 ralf Exp $
+ * Version: $Id: arp.c,v 1.5 1998/03/17 22:18:21 ralf Exp $
*
* Copyright (C) 1994 by Florian La Roche
*
@@ -189,7 +189,7 @@ struct neigh_table arp_tbl =
NULL,
parp_redo,
{ NULL, NULL, &arp_tbl, 0, NULL, NULL,
- 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 1*HZ, 64 },
+ 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ },
30*HZ, 128, 512, 1024,
};
@@ -954,6 +954,10 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
struct device *dev = n->dev;
int hatype = dev->type;
+ /* Do not confuse users "arp -a" with magic entries */
+ if (!(n->nud_state&~NUD_NOARP))
+ continue;
+
/* I'd get great pleasure deleting
this ugly code. Let's output it in hexadecimal format.
"arp" utility will eventually repaired --ANK
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7d5f0021f..87394f906 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1,7 +1,7 @@
/*
* NET3 IP device support routines.
*
- * Version: $Id: devinet.c,v 1.3 1997/12/16 05:37:35 ralf Exp $
+ * Version: $Id: devinet.c,v 1.4 1998/03/17 22:18:21 ralf Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -19,6 +19,7 @@
*
* Changes:
* Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists.
+ Cyrus Durgin: updated for kmod
*/
#include <linux/config.h>
@@ -49,8 +50,8 @@
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
-#ifdef CONFIG_KERNELD
-#include <linux/kerneld.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
#endif
#include <net/ip.h>
@@ -157,28 +158,32 @@ static void
inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
{
struct in_ifaddr *ifa1 = *ifap;
- struct in_ifaddr *ifa;
-
- /* 1. Unlink it */
- *ifap = ifa1->ifa_next;
-
- /* 2. Deleting primary ifaddr forces deletion all secondaries */
+ /* 1. Deleting primary ifaddr forces deletion all secondaries */
if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) {
- while ((ifa=*ifap) != NULL) {
- if (ifa1->ifa_mask != ifa->ifa_mask ||
+ struct in_ifaddr *ifa;
+ struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+ while ((ifa=*ifap1) != NULL) {
+ if (!(ifa->ifa_flags&IFA_F_SECONDARY) ||
+ ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa)) {
- ifap = &ifa->ifa_next;
+ ifap1 = &ifa->ifa_next;
continue;
}
- *ifap = ifa->ifa_next;
+ *ifap1 = ifa->ifa_next;
rtmsg_ifa(RTM_DELADDR, ifa);
notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
inet_free_ifa(ifa);
}
}
+ /* 2. Unlink it */
+
+ *ifap = ifa1->ifa_next;
+
+
/* 3. Announce address deletion */
/* Send message first, then call notifier.
@@ -232,10 +237,9 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa)
ifap = last_primary;
}
- cli();
ifa->ifa_next = *ifap;
+ /* ATOMIC_SET */
*ifap = ifa;
- sti();
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
@@ -413,7 +417,7 @@ int devinet_ioctl(unsigned int cmd, void *arg)
*colon = 0;
#endif
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
dev_load(ifr.ifr_name);
#endif
@@ -960,6 +964,8 @@ static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devcon
t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
if (t->sysctl_header == NULL)
kfree(t);
+ else
+ p->sysctl = t;
}
static void devinet_sysctl_unregister(struct ipv4_devconf *p)
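(The cli()/sti() pair dropped from inet_insert_ifa(), with the /* ATOMIC_SET */ comment left in its place, relies on ordering: the new ifaddr is fully prepared first, so the only store a concurrent reader can observe is the final pointer assignment, a single word-sized write that is atomic on the supported CPUs. The shape of that lockless head-insert:)

	/* Prepare the node completely before publishing it ... */
	ifa->ifa_next = *ifap;
	/* ... then publish it with one atomic word-sized store. */
	*ifap = ifa;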
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 409db8209..6350a6366 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: FIB frontend.
*
- * Version: $Id: fib_frontend.c,v 1.6 1997/12/13 21:52:48 kuznet Exp $
+ * Version: $Id: fib_frontend.c,v 1.9 1998/03/08 20:52:36 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -151,7 +151,6 @@ struct device * ip_dev_find(u32 addr)
memset(&key, 0, sizeof(key));
key.dst = addr;
- key.scope = RT_SCOPE_UNIVERSE;
if (!local_table || local_table->tb_lookup(local_table, &key, &res)
|| res.type != RTN_LOCAL)
@@ -344,6 +343,10 @@ int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
int s_t;
struct fib_table *tb;
+ if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+ return ip_rt_dump(skb, cb);
+
s_t = cb->args[0];
if (s_t == 0)
s_t = cb->args[0] = RT_TABLE_MIN;
@@ -423,8 +426,13 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa)
u32 addr = ifa->ifa_local;
u32 prefix = ifa->ifa_address&mask;
- if (ifa->ifa_flags&IFA_F_SECONDARY)
+ if (ifa->ifa_flags&IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
@@ -435,7 +443,8 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa)
if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF)
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY)) {
+ if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ (prefix != addr || ifa->ifa_prefixlen < 32)) {
fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
@@ -464,8 +473,13 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
if (!(ifa->ifa_flags&IFA_F_SECONDARY))
fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
- else
+ else {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
/* Deletion is more complicated than add.
We should take care of not to delete too much :-)
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 33bcf0321..4b89ab676 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -5,7 +5,7 @@
*
* IPv4 FIB: lookup engine and maintenance routines.
*
- * Version: $Id: fib_hash.c,v 1.1 1997/11/09 19:53:13 kuznet Exp $
+ * Version: $Id: fib_hash.c,v 1.3 1998/03/08 05:56:16 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 3ffb404b5..7ec60a5be 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: policy rules.
*
- * Version: $Id: fib_rules.c,v 1.2 1997/10/10 22:40:49 davem Exp $
+ * Version: $Id: fib_rules.c,v 1.3 1998/03/08 05:56:17 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3883fcba0..d2d37e11e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: semantics.
*
- * Version: $Id: fib_semantics.c,v 1.6 1997/12/13 21:52:49 kuznet Exp $
+ * Version: $Id: fib_semantics.c,v 1.7 1998/03/08 05:56:18 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index b2c7151d1..e8f636e21 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -3,7 +3,7 @@
*
* Alan Cox, <alan@cymru.net>
*
- * Version: $Id: icmp.c,v 1.4 1998/03/03 01:23:37 ralf Exp $
+ * Version: $Id: icmp.c,v 1.5 1998/03/17 22:18:23 ralf Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -928,10 +928,8 @@ int icmp_chkaddr(struct sk_buff *skb)
struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2));
sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
- if (!sk) return 0;
- if (sk->saddr != iph->saddr) return 0;
- if (sk->daddr != iph->daddr) return 0;
- if (sk->dummy_th.dest != th->dest) return 0;
+ if (!sk || (sk->state == TCP_LISTEN))
+ return 0;
/*
* This packet came from us.
*/
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 166b68b42..d3414a0fe 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -8,7 +8,7 @@
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
- * Version: $Id: igmp.c,v 1.3 1997/12/16 05:37:36 ralf Exp $
+ * Version: $Id: igmp.c,v 1.4 1998/03/17 22:18:24 ralf Exp $
*
* Authors:
* Alan Cox <Alan.Cox@linux.org>
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 45a2ed588..8df8414cd 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -5,7 +5,7 @@
*
* The IP forwarding functionality.
*
- * Version: $Id: ip_forward.c,v 1.3 1998/03/03 01:23:37 ralf Exp $
+ * Version: $Id: ip_forward.c,v 1.4 1998/03/17 22:18:25 ralf Exp $
*
* Authors: see ip.c
*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9dccb5324..e6831adb8 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -5,7 +5,7 @@
*
* The IP fragmentation functionality.
*
- * Version: $Id: ip_fragment.c,v 1.30 1997/12/29 19:52:32 kuznet Exp $
+ * Version: $Id: ip_fragment.c,v 1.32 1998/03/08 05:56:21 davem Exp $
*
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox <Alan.Cox@linux.org>
diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c
index d78aa0f66..4eb41c325 100644
--- a/net/ipv4/ip_fw.c
+++ b/net/ipv4/ip_fw.c
@@ -6,7 +6,7 @@
* license in recognition of the original copyright.
* -- Alan Cox.
*
- * $Id: ip_fw.c,v 1.3 1997/12/16 05:37:37 ralf Exp $
+ * $Id: ip_fw.c,v 1.4 1998/03/17 22:18:25 ralf Exp $
*
* Ported from BSD to Linux,
* Alan Cox 22/Nov/1994.
@@ -392,6 +392,39 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, __u16 *redirport, struct ip_
continue; /* Mismatch */
}
+ /* This looks stupid, because we scan almost static
+ list, searching for static key. However, this way seems
+ to be only reasonable way of handling fw_via rules
+ (btw bsd makes the same thing).
+
+ It will not affect performance if you will follow
+ the following simple rules:
+
+ - if inteface is aliased, ALWAYS specify fw_viadev,
+ so that previous check will guarantee, that we will
+ not waste time when packet arrive on another interface.
+
+ - avoid using fw_via.s_addr if fw_via.s_addr is owned
+ by an aliased interface.
+
+ --ANK
+ */
+ if (f->fw_via.s_addr && rif) {
+ struct in_ifaddr *ifa;
+
+ if (rif->ip_ptr == NULL)
+ continue; /* Mismatch */
+
+ for (ifa = ((struct in_device*)(rif->ip_ptr))->ifa_list;
+ ifa; ifa = ifa->ifa_next) {
+ if (ifa->ifa_local == f->fw_via.s_addr)
+ goto ifa_ok;
+ }
+ continue; /* Mismatch */
+
+ ifa_ok:
+ }
+
/*
* Ok the chain addresses match.
*/
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 61c364542..fa8208959 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) module.
*
- * Version: $Id: ip_input.c,v 1.2 1997/12/16 05:37:38 ralf Exp $
+ * Version: $Id: ip_input.c,v 1.3 1998/03/17 22:18:26 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c
index 797f9112f..2265161f3 100644
--- a/net/ipv4/ip_masq_mod.c
+++ b/net/ipv4/ip_masq_mod.c
@@ -12,6 +12,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
+ * Changes:
+ * Cyrus Durgin: fixed kerneld stuff for kmod.
*/
#include <linux/config.h>
@@ -21,8 +23,8 @@
#include <linux/errno.h>
#include <net/ip_masq.h>
#include <net/ip_masq_mod.h>
-#ifdef CONFIG_KERNELD
-#include <linux/kerneld.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
#endif
EXPORT_SYMBOL(register_ip_masq_mod);
@@ -290,7 +292,7 @@ struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name)
int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen)
{
struct ip_masq_mod * mmod;
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
char kmod_name[IP_MASQ_MOD_NMAX+8];
#endif
/* tappo */
@@ -299,7 +301,7 @@ int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen)
mmod = ip_masq_mod_getbyname(mctl->u.mod.name);
if (mmod)
return mmod->mmod_ctl(optname, mctl, optlen);
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
sprintf(kmod_name,"ip_masq_%s", mctl->u.mod.name);
IP_MASQ_DEBUG(1, "About to request \"%s\" module\n", kmod_name);
diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c
index f7e28f21a..377b8223e 100644
--- a/net/ipv4/ip_masq_raudio.c
+++ b/net/ipv4/ip_masq_raudio.c
@@ -2,7 +2,7 @@
* IP_MASQ_RAUDIO - Real Audio masquerading module
*
*
- * Version: @(#)$Id: ip_masq_raudio.c,v 1.8 1997/11/28 15:32:32 alan Exp $
+ * Version: @(#)$Id: ip_masq_raudio.c,v 1.9 1998/02/23 02:50:19 davem Exp $
*
* Author: Nigel Metheringham
* Real Time Streaming code by Progressive Networks
diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c
index 06e9be8fb..def66858c 100644
--- a/net/ipv4/ip_nat_dumb.c
+++ b/net/ipv4/ip_nat_dumb.c
@@ -5,7 +5,7 @@
*
* Dumb Network Address Translation.
*
- * Version: $Id: ip_nat_dumb.c,v 1.2 1997/10/10 22:41:05 davem Exp $
+ * Version: $Id: ip_nat_dumb.c,v 1.2 1997/12/16 05:37:40 ralf Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -14,6 +14,9 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
+ * Fixes:
+ * Rani Assaf : A zero checksum is a special case
+ * only in UDP
*
* NOTE: It is just working model of real NAT.
*/
@@ -49,7 +52,6 @@ ip_do_nat(struct sk_buff *skb)
u32 odaddr = iph->daddr;
u32 osaddr = iph->saddr;
u16 check;
- u16 *cksum = NULL;
IPCB(skb)->flags |= IPSKB_TRANSLATED;
@@ -62,17 +64,23 @@ ip_do_nat(struct sk_buff *skb)
/* If it is the first fragment, rewrite protocol headers */
if (!(iph->frag_off & htons(IP_OFFSET))) {
- /* Only plain TCP/UDP headers rewriting is implemented :-( */
- if (iph->protocol == IPPROTO_TCP)
- cksum = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check;
- else if (iph->protocol == IPPROTO_UDP)
- cksum = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check;
- if (cksum && (check = *cksum) != 0) {
- check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check);
- check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
- if (!check)
- check = 0xFFFF;
- *cksum = check;
+ u16 *cksum;
+
+ switch(iph->protocol) {
+ case IPPROTO_TCP:
+ cksum = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check;
+ check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~(*cksum));
+ *cksum = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
+ break;
+ case IPPROTO_UDP:
+ cksum = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check;
+ if ((check = *cksum) != 0) {
+ check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check);
+ check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
+ *cksum = check ? : 0xFFFF;
+ }
+ default:
+ break;
}
}
return 0;
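(The checksum rework encodes an asymmetry between the two transports: in UDP a zero checksum on the wire means "no checksum was computed" (RFC 768), so NAT must leave a zero alone and fold a recomputed zero to 0xFFFF, whereas in TCP the checksum is mandatory, zero is an ordinary value, and it is always adjusted. The UDP rule in isolation, with adjust_check() standing in for the csum_tcpudp_magic() arithmetic shown above:)

	if (udph->check != 0) {		/* 0 means "no checksum": hands off */
		u16 check = adjust_check(udph->check, osaddr, odaddr,
					 iph->saddr, iph->daddr);
		/* A computed 0 must go out as 0xFFFF, since a literal 0
		 * would falsely claim "no checksum". */
		udph->check = check ? : 0xFFFF;
	}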
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 53c680eed..d78cc1ff0 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -5,7 +5,7 @@
*
* The options processing module for ip.c
*
- * Version: $Id: ip_options.c,v 1.2 1997/12/16 05:37:40 ralf Exp $
+ * Version: $Id: ip_options.c,v 1.3 1998/03/17 22:18:28 ralf Exp $
*
* Authors: A.N.Kuznetsov
*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ac4ac22ae..63fbbfe1e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) output module.
*
- * Version: $Id: ip_output.c,v 1.4 1998/03/03 01:23:41 ralf Exp $
+ * Version: $Id: ip_output.c,v 1.5 1998/03/17 22:18:29 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index a500a72e5..1b7f44e8f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -5,7 +5,7 @@
*
* The IP to API glue.
*
- * Version: $Id: ip_sockglue.c,v 1.4 1998/03/03 01:23:41 ralf Exp $
+ * Version: $Id: ip_sockglue.c,v 1.5 1998/03/17 22:18:29 ralf Exp $
*
* Authors: see ip.c
*
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 20521e643..1e44ae8aa 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1,5 +1,5 @@
/*
- * $Id: ipconfig.c,v 1.6 1998/01/09 17:19:46 mj Exp $
+ * $Id: ipconfig.c,v 1.11 1998/02/12 07:43:16 davem Exp $
*
* Automatic Configuration of IP -- use BOOTP or RARP or user-supplied
* information to configure own IP address and routes.
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 949661f41..ce071d406 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,7 +1,7 @@
/*
* Linux NET3: IP/IP protocol decoder.
*
- * Version: $Id: ipip.c,v 1.4 1997/12/16 05:37:42 ralf Exp $
+ * Version: $Id: ipip.c,v 1.5 1998/03/17 22:18:30 ralf Exp $
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d3c07dca3..1177f33ac 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: ipmr.c,v 1.29 1997/12/13 21:52:55 kuznet Exp $
+ * Version: $Id: ipmr.c,v 1.4 1998/03/17 22:18:31 ralf Exp $
*
* Fixes:
* Michael Chastain : Incorrect size of copying.
@@ -1351,6 +1351,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
int ct;
struct rtnexthop *nhp;
struct device *dev = vif_table[c->mfc_parent].dev;
+ u8 *b = skb->tail;
#ifdef CONFIG_RTNL_OLD_IFINFO
if (dev) {
@@ -1389,10 +1390,11 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
return 1;
rtattr_failure:
+ skb_trim(skb, b - skb->data);
return -EMSGSIZE;
}
-int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm)
+int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
struct mfc_cache *cache;
struct rtable *rt = (struct rtable*)skb->dst;
@@ -1400,10 +1402,16 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm)
start_bh_atomic();
cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) {
- struct device *dev = skb->dev;
+ struct device *dev;
int vif;
int err;
+ if (nowait) {
+ end_bh_atomic();
+ return -EAGAIN;
+ }
+
+ dev = skb->dev;
if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) {
end_bh_atomic();
return -ENODEV;
@@ -1422,7 +1430,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm)
*/
end_bh_atomic();
- if (rtm->rtm_flags & RTM_F_NOTIFY)
+ if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
cache->mfc_flags |= MFC_NOTIFY;
return ipmr_fill_mroute(skb, cache, rtm);
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7f3b5f9bb..221207205 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -7,7 +7,7 @@
* PROC file system. It is mainly used for debugging and
* statistics.
*
- * Version: $Id: proc.c,v 1.23 1997/10/30 23:52:20 davem Exp $
+ * Version: $Id: proc.c,v 1.4 1997/12/16 05:37:43 ralf Exp $
*
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
@@ -77,11 +77,12 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format)
unsigned long dest, src;
unsigned short destp, srcp;
int timer_active, timer_active1, timer_active2;
+ int tw_bucket = 0;
unsigned long timer_expires;
struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
dest = sp->daddr;
- src = sp->saddr;
+ src = sp->rcv_saddr;
destp = sp->dummy_th.dest;
srcp = sp->dummy_th.source;
@@ -96,30 +97,47 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format)
destp = ntohs(destp);
srcp = ntohs(srcp);
- timer_active1 = del_timer(&tp->retransmit_timer);
- timer_active2 = del_timer(&sp->timer);
- if (!timer_active1) tp->retransmit_timer.expires=0;
- if (!timer_active2) sp->timer.expires=0;
- timer_active=0;
- timer_expires=(unsigned)-1;
+ if((format == 0) && (sp->state == TCP_TIME_WAIT)) {
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp;
+
+ tw_bucket = 1;
+ timer_active1 = timer_active2 = 0;
+ timer_active = 3;
+ timer_expires = tw->timer.expires;
+ } else {
+ timer_active1 = del_timer(&tp->retransmit_timer);
+ timer_active2 = del_timer(&sp->timer);
+ if (!timer_active1) tp->retransmit_timer.expires=0;
+ if (!timer_active2) sp->timer.expires=0;
+ timer_active = 0;
+ timer_expires = (unsigned) -1;
+ }
if (timer_active1 && tp->retransmit_timer.expires < timer_expires) {
- timer_active=timer_active1;
- timer_expires=tp->retransmit_timer.expires;
+ timer_active = 1;
+ timer_expires = tp->retransmit_timer.expires;
}
if (timer_active2 && sp->timer.expires < timer_expires) {
- timer_active=timer_active2;
- timer_expires=sp->timer.expires;
- }
+ timer_active = 2;
+ timer_expires = sp->timer.expires;
+ }
+ if(timer_active == 0)
+ timer_expires = jiffies;
sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
i, src, srcp, dest, destp, sp->state,
- format==0?sp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc),
- format==0?tp->rcv_nxt-sp->copied_seq:atomic_read(&sp->rmem_alloc),
- timer_active, timer_expires-jiffies,
- tp->retransmits,
- sp->socket ? sp->socket->inode->i_uid:0,
- timer_active?sp->timeout:0,
- sp->socket ? sp->socket->inode->i_ino:0);
+ (tw_bucket ?
+ 0 :
+ (format == 0) ?
+ tp->write_seq-tp->snd_una : atomic_read(&sp->wmem_alloc)),
+ (tw_bucket ?
+ 0 :
+ (format == 0) ?
+ tp->rcv_nxt-tp->copied_seq: atomic_read(&sp->rmem_alloc)),
+ timer_active, timer_expires-jiffies,
+ (tw_bucket ? 0 : tp->retransmits),
+ (!tw_bucket && sp->socket) ? sp->socket->inode->i_uid : 0,
+ (!tw_bucket && timer_active) ? sp->timeout : 0,
+ (!tw_bucket && sp->socket) ? sp->socket->inode->i_ino : 0);
if (timer_active1) add_timer(&tp->retransmit_timer);
if (timer_active2) add_timer(&sp->timer);
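(The tw_bucket handling above exists because TIME_WAIT connections in this TCP rewrite no longer hold a full struct sock; they are parked in a small struct tcp_tw_bucket that merely mirrors the front of one, which is what makes the cast legal. /proc code that receives such a pointer may only read the fields the bucket really has:)

	if (sp->state == TCP_TIME_WAIT) {
		struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp;

		/* Safe: the bucket carries its own timer ... */
		timer_expires = tw->timer.expires;
		/* ... but sp->socket, tp->retransmits and the other
		 * full-socket fields must not be dereferenced here. */
	}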
diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c
index 9e944495f..e1eba43c5 100644
--- a/net/ipv4/rarp.c
+++ b/net/ipv4/rarp.c
@@ -3,7 +3,7 @@
* Copyright (C) 1994 by Ross Martin
* Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche
*
- * $Id: rarp.c,v 1.3 1997/12/16 05:37:44 ralf Exp $
+ * $Id: rarp.c,v 1.4 1998/03/17 22:18:31 ralf Exp $
*
* This module implements the Reverse Address Resolution Protocol
* (RARP, RFC 903), which is used to convert low level addresses such
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index b3644f10d..baebab777 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -5,7 +5,7 @@
*
* RAW - implementation of IP "raw" sockets.
*
- * Version: $Id: raw.c,v 1.3 1997/12/16 05:37:44 ralf Exp $
+ * Version: $Id: raw.c,v 1.4 1998/03/17 22:18:32 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b73c3ed11..8ce4a95f4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.4 1998/03/03 01:23:42 ralf Exp $
+ * Version: $Id: route.c,v 1.5 1998/03/17 22:18:32 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -167,7 +167,7 @@ __u8 ip_tos2prio[16] = {
static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
-static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth, u16 protocol);
+static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
@@ -301,6 +301,8 @@ static void rt_run_flush(unsigned long dummy)
int i;
struct rtable * rth, * next;
+ rt_deadline = 0;
+
for (i=0; i<RT_HASH_DIVISOR; i++) {
int nr=0;
@@ -322,37 +324,41 @@ static void rt_run_flush(unsigned long dummy)
void rt_cache_flush(int delay)
{
+ unsigned long now = jiffies;
+ int user_mode = !in_interrupt();
+
if (delay < 0)
delay = ip_rt_min_delay;
start_bh_atomic();
if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
- long tmo = (long)(rt_deadline - rt_flush_timer.expires);
+ long tmo = (long)(rt_deadline - now);
/* If flush timer is already running
and flush request is not immediate (delay > 0):
- if deadline is not achieved, prolongate timer to "dealy",
+ if deadline is not achieved, prolongate timer to "delay",
otherwise fire it at deadline time.
*/
+ if (user_mode && (long)(rt_deadline-now) < ip_rt_max_delay-ip_rt_min_delay)
+ tmo = 0;
+
if (delay > tmo)
delay = tmo;
}
if (delay <= 0) {
- rt_deadline = 0;
end_bh_atomic();
-
rt_run_flush(0);
return;
}
if (rt_deadline == 0)
- rt_deadline = jiffies + ip_rt_max_delay;
+ rt_deadline = now + ip_rt_max_delay;
- rt_flush_timer.expires = jiffies + delay;
+ rt_flush_timer.expires = now + delay;
add_timer(&rt_flush_timer);
end_bh_atomic();
}
@@ -400,7 +406,7 @@ out:
return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size);
}
-static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 protocol)
+static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
{
struct rtable *rth, **rthp;
unsigned long now = jiffies;
@@ -472,7 +478,9 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
goto reject_redirect;
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
- if (ip_fib_check_default(new_gw, dev))
+ if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+ goto reject_redirect;
+ if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
goto reject_redirect;
} else {
if (inet_addr_type(new_gw) != RTN_UNICAST)
@@ -504,9 +512,13 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
rth->u.dst.dev != dev)
break;
+ dst_clone(&rth->u.dst);
+
rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
- if (rt == NULL)
+ if (rt == NULL) {
+ ip_rt_put(rth);
return;
+ }
/*
* Copy all the information.
@@ -531,14 +543,16 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
if (rt->u.dst.neighbour)
neigh_event_send(rt->u.dst.neighbour, NULL);
ip_rt_put(rt);
+ ip_rt_put(rth);
rt_free(rt);
break;
}
*rthp = rth->u.rt_next;
- rt_free(rth);
- rt = rt_intern_hash(hash, rt, ETH_P_IP);
+ rt = rt_intern_hash(hash, rt);
ip_rt_put(rt);
+ ip_rt_put(rth);
+ rt_free(rth);
break;
}
}
@@ -762,19 +776,45 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
u32 src;
struct fib_result res;
- if (rt->key.iif == 0) {
- memcpy(addr, &rt->rt_src, 4);
- return;
- }
- if (fib_lookup(&rt->key, &res) == 0) {
+ if (rt->key.iif == 0)
+ src = rt->rt_src;
+ else if (fib_lookup(&rt->key, &res) == 0)
src = FIB_RES_PREFSRC(res);
- memcpy(addr, &src, 4);
- return;
- }
- src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+ else
+ src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
memcpy(addr, &src, 4);
}
+static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
+{
+ struct fib_info *fi = res->fi;
+
+ if (fi) {
+ if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+ rt->rt_gateway = FIB_RES_GW(*res);
+#ifndef CONFIG_RTNL_OLD_IFINFO
+ rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
+ rt->u.dst.pmtu = fi->fib_mtu;
+ if (fi->fib_mtu == 0) {
+ rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+ if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
+ rt->rt_gateway != rt->rt_dst &&
+ rt->u.dst.pmtu > 576)
+ rt->u.dst.pmtu = 576;
+ }
+#else
+ rt->u.dst.pmtu = fi->fib_mtu ? : rt->u.dst.dev->mtu;
+#endif
+ rt->u.dst.window= fi->fib_window ? : 0;
+ rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
+ } else {
+ rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+ rt->u.dst.window= 0;
+ rt->u.dst.rtt = TCP_TIMEOUT_INIT;
+ }
+ rt->rt_type = res->type;
+}
+
static int
ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
u8 tos, struct device *dev, int our)
@@ -832,7 +872,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
#endif
hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0);
+ skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
return 0;
}
@@ -990,18 +1030,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
- rth->u.dst.pmtu = res.fi->fib_mtu ? : out_dev->dev->mtu;
- rth->u.dst.window=res.fi->fib_window ? : 0;
- rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT;
-#ifndef CONFIG_RTNL_OLD_IFINFO
- rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1];
-#endif
-
- if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK)
- rth->rt_gateway = FIB_RES_GW(res);
+ rt_set_nexthop(rth, &res);
rth->rt_flags = flags;
- rth->rt_type = res.type;
#ifdef CONFIG_NET_FASTROUTE
if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
@@ -1014,7 +1045,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
}
#endif
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol));
+ skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
return 0;
brd_input:
@@ -1062,7 +1093,7 @@ local_input:
}
rth->rt_flags = flags|RTCF_LOCAL;
rth->rt_type = res.type;
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0);
+ skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
return 0;
no_route:
@@ -1362,7 +1393,7 @@ make_route:
rth->rt_dst_map = key.dst;
rth->rt_src_map = key.src;
#endif
- rth->rt_iif = dev_out->ifindex;
+ rth->rt_iif = oif ? : dev_out->ifindex;
rth->u.dst.dev = dev_out;
rth->rt_gateway = key.dst;
rth->rt_spec_dst= key.src;
@@ -1388,24 +1419,12 @@ make_route:
#endif
}
- if (res.fi) {
- if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK)
- rth->rt_gateway = FIB_RES_GW(res);
- rth->u.dst.pmtu = res.fi->fib_mtu ? : dev_out->mtu;
- rth->u.dst.window=res.fi->fib_window ? : 0;
- rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT;
-#ifndef CONFIG_RTNL_OLD_IFINFO
- rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1];
-#endif
- } else {
- rth->u.dst.pmtu = dev_out->mtu;
- rth->u.dst.window=0;
- rth->u.dst.rtt = TCP_TIMEOUT_INIT;
- }
+ rt_set_nexthop(rth, &res);
+
rth->rt_flags = flags;
- rth->rt_type = res.type;
+
hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
- *rp = rt_intern_hash(hash, rth, ETH_P_IP);
+ *rp = rt_intern_hash(hash, rth);
return 0;
}
@@ -1444,6 +1463,113 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
#ifdef CONFIG_RTNETLINK
+static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int nowait)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct rtmsg *r;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+ struct rta_cacheinfo ci;
+#ifdef CONFIG_IP_MROUTE
+ struct rtattr *eptr;
+#endif
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ unsigned char *o;
+#else
+ struct rtattr *mx;
+#endif
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+ r = NLMSG_DATA(nlh);
+ nlh->nlmsg_flags = nowait ? NLM_F_MULTI : 0;
+ r->rtm_family = AF_INET;
+ r->rtm_dst_len = 32;
+ r->rtm_src_len = 32;
+ r->rtm_tos = rt->key.tos;
+ r->rtm_table = RT_TABLE_MAIN;
+ r->rtm_type = rt->rt_type;
+ r->rtm_scope = RT_SCOPE_UNIVERSE;
+ r->rtm_protocol = RTPROT_UNSPEC;
+ r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ r->rtm_nhs = 0;
+
+ o = skb->tail;
+#endif
+ RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
+ RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src);
+ if (rt->u.dst.dev)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+ if (rt->rt_dst != rt->rt_gateway)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
+ RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
+ RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
+#else
+ mx = (struct rtattr*)skb->tail;
+ RTA_PUT(skb, RTA_METRICS, 0, NULL);
+ if (rt->u.dst.mxlock)
+ RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
+ if (rt->u.dst.pmtu)
+ RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
+ if (rt->u.dst.window)
+ RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
+ if (rt->u.dst.rtt)
+ RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
+ mx->rta_len = skb->tail - (u8*)mx;
+ if (mx->rta_len == RTA_LENGTH(0))
+ skb_trim(skb, (u8*)mx - skb->data);
+#endif
+ RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
+ ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
+ ci.rta_used = atomic_read(&rt->u.dst.refcnt);
+ ci.rta_clntref = atomic_read(&rt->u.dst.use);
+ ci.rta_expires = 0;
+ ci.rta_error = rt->u.dst.error;
+#ifdef CONFIG_IP_MROUTE
+ eptr = (struct rtattr*)skb->tail;
+#endif
+ RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ r->rtm_optlen = skb->tail - o;
+#endif
+ if (rt->key.iif) {
+#ifdef CONFIG_IP_MROUTE
+ u32 dst = rt->rt_dst;
+
+ if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
+ int err = ipmr_get_route(skb, r, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+ if (err == 0)
+ return 0;
+ goto nlmsg_failure;
+ } else {
+ if (err == -EMSGSIZE)
+ goto nlmsg_failure;
+ ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
+ }
+ }
+ } else
+#endif
+ {
+ RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ r->rtm_optlen = skb->tail - o;
+#endif
+ }
+ }
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
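
Note how rt_fill_info() is built around the NLMSG_PUT/RTA_PUT macros, which jump to the nlmsg_failure/rtattr_failure labels when the skb runs out of tailroom; the function then trims the skb back to the saved start b, so a partially assembled message is never handed to netlink. A small user-space model of that append-or-unwind idiom, with hypothetical names and a byte buffer standing in for the skb:

    #include <stdio.h>
    #include <string.h>

    struct buf { char bytes[16]; int used; };

    /* Append or bail out, in the style of RTA_PUT. */
    #define BUF_PUT(b, src, len)                             \
        do {                                                 \
            if ((b)->used + (len) > (int)sizeof((b)->bytes)) \
                goto failure;                                \
            memcpy((b)->bytes + (b)->used, (src), (len));    \
            (b)->used += (len);                              \
        } while (0)

    static int fill_record(struct buf *b, const char *a1, const char *a2)
    {
        int start = b->used;        /* like "unsigned char *b = skb->tail" */

        BUF_PUT(b, a1, (int)strlen(a1));
        BUF_PUT(b, a2, (int)strlen(a2));
        return b->used - start;     /* like nlh->nlmsg_len = skb->tail - b */

    failure:
        b->used = start;            /* like skb_trim(skb, b - skb->data) */
        return -1;
    }

    int main(void)
    {
        struct buf b = { {0}, 0 };

        printf("%d\n", fill_record(&b, "dst", "gateway"));        /* 10 */
        printf("%d\n", fill_record(&b, "metrics-overflow", "x")); /* -1 */
        printf("used after failure: %d\n", b.used);               /* 10 */
        return 0;
    }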
+
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
struct rtattr **rta = arg;
@@ -1454,12 +1580,6 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
int iif = 0;
int err;
struct sk_buff *skb;
- struct rta_cacheinfo ci;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- unsigned char *o;
-#else
- struct rtattr *mx;
-#endif
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
@@ -1506,83 +1626,53 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
- RTM_NEWROUTE, sizeof(*rtm));
- rtm = NLMSG_DATA(nlh);
- nlh->nlmsg_flags = 0;
- rtm->rtm_family = AF_INET;
- rtm->rtm_dst_len = 32;
- rtm->rtm_src_len = 32;
- rtm->rtm_tos = rt->key.tos;
- rtm->rtm_table = RT_TABLE_MAIN;
- rtm->rtm_type = rt->rt_type;
- rtm->rtm_scope = RT_SCOPE_UNIVERSE;
- rtm->rtm_protocol = RTPROT_UNSPEC;
- rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_nhs = 0;
+ NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid;
+
+ err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
+ if (err == 0)
+ return 0;
+ if (err < 0)
+ return -EMSGSIZE;
- o = skb->tail;
-#endif
- RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
- RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src);
- if (rt->u.dst.dev)
- RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
- if (rt->rt_dst != rt->rt_gateway)
- RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
- RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
- RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
-#else
- mx = (struct rtattr*)skb->tail;
- RTA_PUT(skb, RTA_METRICS, 0, NULL);
- if (rt->u.dst.mxlock)
- RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
- if (rt->u.dst.pmtu)
- RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
- if (rt->u.dst.window)
- RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
- if (rt->u.dst.rtt)
- RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
- mx->rta_len = skb->tail - (u8*)mx;
-#endif
- RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
- ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
- ci.rta_used = atomic_read(&rt->u.dst.refcnt);
- ci.rta_clntref = atomic_read(&rt->u.dst.use);
- ci.rta_expires = 0;
- ci.rta_error = rt->u.dst.error;
- RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_optlen = skb->tail - o;
-#endif
- if (iif) {
-#ifdef CONFIG_IP_MROUTE
- if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
- NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid;
- err = ipmr_get_route(skb, rtm);
- if (err <= 0)
- return err;
- } else
-#endif
- {
- RTA_PUT(skb, RTA_IIF, sizeof(int), &iif);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_optlen = skb->tail - o;
-#endif
- }
- }
- nlh->nlmsg_len = skb->tail - (u8*)nlh;
err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
if (err < 0)
return err;
return 0;
+}
-nlmsg_failure:
-rtattr_failure:
- kfree_skb(skb);
- return -EMSGSIZE;
+
+int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtable *rt;
+ int h, s_h;
+ int idx, s_idx;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ for (h=0; h < RT_HASH_DIVISOR; h++) {
+ if (h < s_h) continue;
+ if (h > s_h)
+ memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(int));
+ start_bh_atomic();
+ for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
+ if (idx < s_idx)
+ continue;
+ skb->dst = dst_clone(&rt->u.dst);
+ if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
+ dst_release(xchg(&skb->dst, NULL));
+ end_bh_atomic();
+ goto done;
+ }
+ dst_release(xchg(&skb->dst, NULL));
+ }
+ end_bh_atomic();
+ }
+
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ return skb->len;
}
#endif /* CONFIG_RTNETLINK */
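
ip_rt_dump() follows the standard netlink dump contract: cb->args[] persists across calls, so when the reply skb fills up the walk stops, records its (hash chain, index) position in args, and the next invocation resumes there instead of restarting. The same resumable iteration, modeled in user space with hypothetical sizes:

    #include <stdio.h>

    #define HASH_DIVISOR 4
    #define CHAIN_LEN    3
    #define BUDGET       5   /* entries that fit in one "skb" */

    /* args[0]/args[1] play the role of cb->args. */
    static int dump(long args[2])
    {
        int h, idx = 0, emitted = 0;
        long s_h = args[0], s_idx = args[1];

        for (h = 0; h < HASH_DIVISOR; h++) {
            if (h < s_h)
                continue;
            if (h > s_h)
                s_idx = 0;              /* new chain: forget old index */
            for (idx = 0; idx < CHAIN_LEN; idx++) {
                if (idx < s_idx)
                    continue;
                if (emitted == BUDGET)  /* "skb full": save our place */
                    goto done;
                printf("entry %d/%d\n", h, idx);
                emitted++;
            }
        }
    done:
        args[0] = h;
        args[1] = idx;
        return emitted;                 /* 0 means the dump is complete */
    }

    int main(void)
    {
        long args[2] = {0, 0};

        while (dump(args) > 0)
            printf("-- next skb --\n");
        return 0;
    }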
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 7d119716e..00dd0a8ef 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * $Id: syncookies.c,v 1.3 1997/09/16 17:16:21 freitag Exp $
+ * $Id: syncookies.c,v 1.4 1998/03/08 05:56:34 davem Exp $
*
* Missing: IPv6 support.
* Some counter so that the Administrator can see when the machine
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3a8a7efb4..767c5d00b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1,7 +1,7 @@
/*
* sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
*
- * $Id: sysctl_net_ipv4.c,v 1.6 1998/03/03 01:23:42 ralf Exp $
+ * $Id: sysctl_net_ipv4.c,v 1.7 1998/03/17 22:18:33 ralf Exp $
*
* Begun April 1, 1996, Mike Shaver.
* Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
@@ -45,8 +45,6 @@ extern int sysctl_ip_masq_debug;
extern int sysctl_tcp_cong_avoidance;
extern int sysctl_tcp_hoe_retransmits;
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_tsack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_keepalive_time;
@@ -57,7 +55,8 @@ extern int sysctl_tcp_retries2;
extern int sysctl_tcp_fin_timeout;
extern int sysctl_tcp_syncookies;
extern int sysctl_tcp_syn_retries;
-extern int sysctl_tcp_stdurg;
+extern int sysctl_tcp_stdurg;
+extern int sysctl_tcp_rfc1337;
extern int sysctl_tcp_syn_taildrop;
extern int sysctl_max_syn_backlog;
@@ -99,12 +98,6 @@ ctl_table ipv4_table[] = {
{NET_IPV4_TCP_HOE_RETRANSMITS, "tcp_hoe_retransmits",
&sysctl_tcp_hoe_retransmits, sizeof(int), 0644, NULL,
&proc_dointvec},
- {NET_IPV4_TCP_SACK, "tcp_sack",
- &sysctl_tcp_sack, sizeof(int), 0644, NULL,
- &proc_dointvec},
- {NET_IPV4_TCP_TSACK, "tcp_tsack",
- &sysctl_tcp_tsack, sizeof(int), 0644, NULL,
- &proc_dointvec},
{NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps",
&sysctl_tcp_timestamps, sizeof(int), 0644, NULL,
&proc_dointvec},
@@ -162,6 +155,8 @@ ctl_table ipv4_table[] = {
#endif
{NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg,
sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337,
+ sizeof(int), 0644, NULL, &proc_dointvec},
{NET_TCP_SYN_TAILDROP, "tcp_syn_taildrop", &sysctl_tcp_syn_taildrop,
sizeof(int), 0644, NULL, &proc_dointvec},
{NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog", &sysctl_max_syn_backlog,
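
The new entry exposes sysctl_tcp_rfc1337 as /proc/sys/net/ipv4/tcp_rfc1337 (mode 0644, parsed by proc_dointvec), turning the old compile-time CONFIG_TCP_RFC1337 choice removed from tcp_input.c below into a runtime toggle. A trivial user-space read of the knob, assuming a kernel carrying this patch:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_rfc1337", "r");
        int val;

        if (f != NULL && fscanf(f, "%d", &val) == 1)
            printf("tcp_rfc1337 = %d\n", val);
        if (f != NULL)
            fclose(f);
        return 0;
    }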
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 17ec6def9..b20df83d2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.77 1998/01/15 22:40:18 freitag Exp $
+ * Version: $Id: tcp.c,v 1.96 1998/03/16 02:25:55 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -169,7 +169,7 @@
* Fixed tcp_write_timeout: stuck close,
* and TCP syn retries gets used now.
* Mark Yarvis : In tcp_read_wakeup(), don't send an
- * ack if stat is TCP_CLOSED.
+ * ack if state is TCP_CLOSED.
* Alan Cox : Look up device on a retransmit - routes may
* change. Doesn't yet cope with MSS shrink right
* but its a start!
@@ -425,6 +425,8 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
struct tcp_mib tcp_statistics;
kmem_cache_t *tcp_openreq_cachep;
+kmem_cache_t *tcp_bucket_cachep;
+kmem_cache_t *tcp_timewait_cachep;
/*
* Find someone to 'accept'. Must be called with
@@ -478,20 +480,6 @@ static void tcp_close_pending (struct sock *sk)
}
/*
- * Enter the time wait state.
- */
-
-void tcp_time_wait(struct sock *sk)
-{
- tcp_set_state(sk,TCP_TIME_WAIT);
- sk->shutdown = SHUTDOWN_MASK;
- if (!sk->dead)
- sk->state_change(sk);
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
-}
-
-
-/*
* Walk down the receive queue counting readable data.
*
* Must be called with the socket lock held.
@@ -512,7 +500,7 @@ static int tcp_readable(struct sock *sk)
return(0);
}
- counted = sk->copied_seq; /* Where we are at the moment */
+ counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */
amount = 0;
/* Do until a push or until we are out of data. */
@@ -606,10 +594,10 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
if (sk->shutdown & RCV_SHUTDOWN)
mask |= POLLHUP;
- if ((tp->rcv_nxt != sk->copied_seq) &&
- (sk->urg_seq != sk->copied_seq ||
- tp->rcv_nxt != sk->copied_seq+1 ||
- sk->urginline || !sk->urg_data))
+ if ((tp->rcv_nxt != tp->copied_seq) &&
+ (tp->urg_seq != tp->copied_seq ||
+ tp->rcv_nxt != tp->copied_seq+1 ||
+ sk->urginline || !tp->urg_data))
mask |= POLLIN | POLLRDNORM;
#if 1 /* This needs benchmarking and real world tests */
@@ -621,9 +609,9 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
space = atomic_read(&sk->wmem_alloc) / 2;
#endif
/* Always wake the user up when an error occured */
- if (sock_wspace(sk) >= space)
+ if (sock_wspace(sk) >= space || sk->err)
mask |= POLLOUT | POLLWRNORM;
- if (sk->urg_data)
+ if (tp->urg_data)
mask |= POLLPRI;
}
return mask;
@@ -649,7 +637,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
case SIOCATMARK:
{
- int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
return put_user(answ,(int *) arg);
}
case TIOCOUTQ:
@@ -669,21 +658,38 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
/*
* Wait for a socket to get into the connected state
*/
-static void wait_for_tcp_connect(struct sock * sk)
+static int wait_for_tcp_connect(struct sock * sk, int flags)
{
struct task_struct *tsk = current;
struct wait_queue wait = { tsk, NULL };
- tsk->state = TASK_INTERRUPTIBLE;
- add_wait_queue(sk->sleep, &wait);
- release_sock(sk);
+ while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
+ if(sk->err)
+ return sock_error(sk);
+ if((1 << sk->state) &
+ ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ if(sk->keepopen)
+ send_sig(SIGPIPE, tsk, 0);
+ return -EPIPE;
+ }
+ if(flags & MSG_DONTWAIT)
+ return -EAGAIN;
+ if(signal_pending(tsk))
+ return -ERESTARTSYS;
- if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && sk->err == 0)
- schedule();
+ tsk->state = TASK_INTERRUPTIBLE;
+ add_wait_queue(sk->sleep, &wait);
+ release_sock(sk);
- tsk->state = TASK_RUNNING;
- remove_wait_queue(sk->sleep, &wait);
- lock_sock(sk);
+ if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
+ sk->err == 0)
+ schedule();
+
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+ lock_sock(sk);
+ }
+ return 0;
}
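
The rewritten wait_for_tcp_connect() re-checks sk->err, the state mask, MSG_DONTWAIT and pending signals on every pass before going back to sleep, instead of sleeping once and returning regardless. That is the classic check / queue / re-check / schedule shape; a rough user-space analogue using a condition variable (names hypothetical, build with -lpthread):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static int established;     /* plays sk->state */
    static int pending_err;     /* plays sk->err */

    /* Re-test every abort condition on each wakeup, then sleep again. */
    static int wait_for_connect(int nonblock)
    {
        int rc = 0;

        pthread_mutex_lock(&lock);
        while (!established) {
            if (pending_err) { rc = pending_err; break; } /* sock_error() */
            if (nonblock)    { rc = -1;          break; } /* -EAGAIN */
            pthread_cond_wait(&cond, &lock);              /* schedule() */
        }
        pthread_mutex_unlock(&lock);
        return rc;
    }

    static void *connector(void *unused)
    {
        sleep(1);
        pthread_mutex_lock(&lock);
        established = 1;               /* "handshake completed" */
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
        return unused;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, connector, NULL);
        printf("wait returned %d\n", wait_for_connect(0)); /* 0 */
        pthread_join(t, NULL);
        return 0;
    }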
static inline int tcp_memory_free(struct sock *sk)
@@ -720,32 +726,6 @@ static void wait_for_tcp_memory(struct sock * sk)
lock_sock(sk);
}
-
-static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from,
- int tcp_size, int seglen)
-{
- int fault;
- int copy;
-
- /* Add more stuff to the end of the skb. */
- copy = min(sk->mss - tcp_size, skb_tailroom(skb));
- copy = min(copy, seglen);
-
- tcp_size += copy;
-
- fault = copy_from_user(skb->tail, from, copy);
- if (fault)
- return -1;
-
- skb_put(skb, copy);
- skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
-
- sk->write_seq += copy;
- skb->end_seq += copy;
-
- return copy;
-}
-
/*
* This routine copies from a user buffer into a socket,
* and starts the transmit system.
@@ -758,24 +738,9 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
/* Wait for a connection to finish. */
- while ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
- if (sk->err)
- return sock_error(sk);
-
- if ((1 << sk->state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (sk->keepopen)
- send_sig(SIGPIPE, current, 0);
- return -EPIPE;
- }
-
- if (flags&MSG_DONTWAIT)
- return -EAGAIN;
-
- if (signal_pending(current))
- return -ERESTARTSYS;
-
- wait_for_tcp_connect(sk);
- }
+ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ if((err = wait_for_tcp_connect(sk, flags)) != 0)
+ return err;
/* Ok commence sending. */
while(--iovlen >= 0) {
@@ -785,41 +750,28 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
iov++;
while(seglen > 0) {
- unsigned int actual_win;
- int copy;
- int tmp;
+ int copy, tmp, queue_it;
struct sk_buff *skb;
if (err)
return -EFAULT;
/* Stop on errors. */
- if (sk->err) {
- if (copied)
- return copied;
- return sock_error(sk);
- }
+ if (sk->err)
+ goto do_sock_err;
/* Make sure that we are established. */
- if (sk->shutdown & SEND_SHUTDOWN) {
- if (copied)
- return copied;
- send_sig(SIGPIPE,current,0);
- return -EPIPE;
- }
+ if (sk->shutdown & SEND_SHUTDOWN)
+ goto do_shutdown;
- /* Now we need to check if we have a half built packet. */
-
- /* If we have queued packets.. */
+ /* Now we need to check if we have a half
+ * built packet we can tack some data onto.
+ */
if (tp->send_head && !(flags & MSG_OOB)) {
- int tcp_size;
-
- /* Tail */
-
skb = sk->write_queue.prev;
- tcp_size = skb->tail -
- ((unsigned char *)(skb->h.th) + tp->tcp_header_len);
-
+ copy = skb->tail -
+ ((unsigned char *)(skb->h.th) +
+ tp->tcp_header_len);
/* This window_seq test is somewhat dangerous
* If the remote does SWS avoidance we should
* queue the best we can if not we should in
@@ -827,79 +779,92 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
* a method for detecting this would be most
* welcome
*/
- if (skb->end > skb->tail &&
- sk->mss - tcp_size > 0 &&
+ if (skb_tailroom(skb) > 0 &&
+ (sk->mss - copy) > 0 &&
tp->snd_nxt < skb->end_seq) {
- int tcopy;
-
- tcopy = tcp_append_tail(sk, skb, from,
- tcp_size,
- seglen);
- if (tcopy == -1)
- return -EFAULT;
-
- from += tcopy;
- copied += tcopy;
- seglen -= tcopy;
-
- /* FIXME: if we're nagling we
- * should send here.
- */
+				int last_byte_was_odd = (copy & 1);
+
+ copy = sk->mss - copy;
+ if(copy > skb_tailroom(skb))
+ copy = skb_tailroom(skb);
+ if(copy > seglen)
+ copy = seglen;
+ if(last_byte_was_odd) {
+ if(copy_from_user(skb_put(skb, copy),
+ from, copy))
+ err = -EFAULT;
+ skb->csum = csum_partial(
+ (((unsigned char *)skb->h.th) +
+ tp->tcp_header_len),
+ (skb->tail -
+ (((unsigned char *)skb->h.th) +
+ tp->tcp_header_len)), 0);
+ } else {
+ skb->csum =
+ csum_and_copy_from_user(
+ from, skb_put(skb, copy),
+ copy, skb->csum, &err);
+ }
+ tp->write_seq += copy;
+ skb->end_seq += copy;
+ from += copy;
+ copied += copy;
+ seglen -= copy;
continue;
}
}
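
The tack-on path above picks between csum_and_copy_from_user(), which folds the new bytes into skb->csum while copying, and a plain copy followed by a full csum_partial() over the payload. The deciding factor is parity: the Internet checksum sums 16-bit words, so naively continuing a running sum is only valid when the bytes already present end on an even boundary (a partial sum over an odd prefix could in principle be byte-rotated, but recomputing is the simple route taken here). A self-contained demonstration with a simplified sum, not the kernel's assembly:

    #include <stdio.h>
    #include <stdint.h>

    /* Plain 16-bit one's-complement sum, padding an odd tail byte. */
    static unsigned csum(const uint8_t *p, int len, unsigned sum)
    {
        while (len > 1) {
            sum += (unsigned)(p[0] << 8 | p[1]);
            p += 2;
            len -= 2;
        }
        if (len)
            sum += (unsigned)(p[0] << 8);
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return sum;
    }

    int main(void)
    {
        uint8_t data[] = "ABCDE";
        unsigned whole, incr;

        /* Even prefix: folding the new bytes in matches a full sum. */
        whole = csum(data, 4, 0);
        incr  = csum(data + 2, 2, csum(data, 2, 0));
        printf("even split: %x vs %x\n", whole, incr);  /* equal */

        /* Odd prefix: word pairing shifts at the seam, sums differ. */
        whole = csum(data, 5, 0);
        incr  = csum(data + 3, 2, csum(data, 3, 0));
        printf("odd split:  %x vs %x\n", whole, incr);  /* differ */
        return 0;
    }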
- /* We also need to worry about the window.
- * If window < 1/2 the maximum window we've seen from this
- * host, don't use it. This is sender side
- * silly window prevention, as specified in RFC1122.
- * (Note that this is different than earlier versions of
- * SWS prevention, e.g. RFC813.). What we actually do is
- * use the whole MSS. Since the results in the right
- * edge of the packet being outside the window, it will
- * be queued for later rather than sent.
+ /* We also need to worry about the window. If
+ * window < 1/2 the maximum window we've seen
+ * from this host, don't use it. This is
+ * sender side silly window prevention, as
+ * specified in RFC1122. (Note that this is
+ * different than earlier versions of SWS
+ * prevention, e.g. RFC813.). What we
+ * actually do is use the whole MSS. Since
+			 * this results in the right edge of the packet
+ * being outside the window, it will be queued
+ * for later rather than sent.
*/
- copy = min(seglen, sk->mss);
- actual_win = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
-
- if (copy > actual_win &&
- (((int) actual_win) >= (tp->max_window >> 1)) &&
- actual_win)
- copy = actual_win;
-
- if (copy <= 0) {
- printk(KERN_DEBUG "sendmsg: copy < 0\n");
- return -EIO;
- }
+ copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
+ if(copy >= (tp->max_window >> 1))
+ copy = min(copy, sk->mss);
+ else
+ copy = sk->mss;
+ if(copy > seglen)
+ copy = seglen;
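
This clamp is the sender-side SWS avoidance the comment describes, reduced to arithmetic: when the usable window (snd_wnd minus unacknowledged data) is at least half the biggest window the peer has ever advertised, honor it up to one MSS; when it is smaller, pretend a full MSS is available so the segment's right edge lands outside the window and the data queues rather than dribbling out. The same decision as a standalone function with worked values (all names hypothetical):

    #include <stdio.h>

    static int clamp_copy(int snd_wnd, int inflight, int max_window,
                          int mss, int seglen)
    {
        int copy = snd_wnd - inflight;

        if (copy >= (max_window >> 1))
            copy = copy < mss ? copy : mss; /* usable window: honor it */
        else
            copy = mss;                     /* silly window: full MSS queues */
        if (copy > seglen)
            copy = seglen;
        return copy;
    }

    int main(void)
    {
        /* Healthy window: ordinary MSS-sized segment. */
        printf("%d\n", clamp_copy(6000, 1000, 8000, 1460, 2000)); /* 1460 */
        /* Window down to 100 of a max 8000: take a full MSS anyway,
         * so the segment queues instead of trickling out 100 bytes.
         */
        printf("%d\n", clamp_copy(4000, 3900, 8000, 1460, 2000)); /* 1460 */
        /* Small but respectable window: clamp to it, then to seglen. */
        printf("%d\n", clamp_copy(4000, 3000, 2000, 1460, 500));  /* 500 */
        return 0;
    }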
- /* If tp->packets_out > 0 segment will be nagled
- * else we kick it right away.
- */
- tmp = MAX_HEADER + sk->prot->max_header +
+ tmp = MAX_HEADER + sk->prot->max_header +
sizeof(struct sk_buff) + 15;
- if (copy < min(sk->mss, tp->max_window >> 1) &&
- !(flags & MSG_OOB) && tp->packets_out)
+ queue_it = 0;
+ if (copy < min(sk->mss, tp->max_window >> 1) &&
+ !(flags & MSG_OOB)) {
tmp += min(sk->mss, tp->max_window);
- else
- tmp += copy;
+ /* What is happening here is that we want to
+			 * tack on later members of the user's iovec
+ * if possible into a single frame. When we
+ * leave this loop our caller checks to see if
+ * we can send queued frames onto the wire.
+ * See tcp_v[46]_sendmsg() for this.
+ */
+ queue_it = 1;
+ } else {
+ tmp += copy;
+ }
skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
/* If we didn't get any memory, we need to sleep. */
if (skb == NULL) {
sk->socket->flags |= SO_NOSPACE;
if (flags&MSG_DONTWAIT) {
- if (copied)
- return copied;
- return -EAGAIN;
+ err = -EAGAIN;
+ goto do_interrupted;
}
-
if (signal_pending(current)) {
- if (copied)
- return copied;
- return -ERESTARTSYS;
+ err = -ERESTARTSYS;
+ goto do_interrupted;
}
-
wait_for_tcp_memory(sk);
continue;
}
@@ -910,9 +875,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
tmp = tp->af_specific->build_net_header(sk, skb);
if (tmp < 0) {
kfree_skb(skb);
- if (copied)
- return(copied);
- return(tmp);
+ err = tmp;
+ goto do_interrupted;
}
skb->h.th =(struct tcphdr *)
@@ -920,7 +884,6 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
seglen -= copy;
tcp_build_header_data(skb->h.th, sk, seglen || iovlen);
- /* FIXME: still need to think about SACK options here. */
if (flags & MSG_OOB) {
skb->h.th->urg = 1;
@@ -933,21 +896,29 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
from += copy;
copied += copy;
- sk->write_seq += copy;
+ tp->write_seq += copy;
- tcp_send_skb(sk, skb);
-
- release_sock(sk);
- lock_sock(sk);
+ tcp_send_skb(sk, skb, queue_it);
}
}
-
sk->err = 0;
-
if (err)
return -EFAULT;
-
return copied;
+
+do_sock_err:
+ if(copied)
+ return copied;
+ return sock_error(sk);
+do_shutdown:
+ if(copied)
+ return copied;
+ send_sig(SIGPIPE, current, 0);
+ return -EPIPE;
+do_interrupted:
+ if(copied)
+ return copied;
+ return err;
}
/*
@@ -980,7 +951,7 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* No URG data to read. */
- if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
+ if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
return -EINVAL; /* Yes this is right ! */
if (sk->err)
@@ -1000,18 +971,10 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
}
lock_sock(sk);
- if (sk->urg_data & URG_VALID) {
- char c = sk->urg_data;
+ if (tp->urg_data & URG_VALID) {
+ char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- sk->urg_data = URG_READ;
-
- if(len>0)
- {
- err = memcpy_toiovec(msg->msg_iov, &c, 1);
- msg->msg_flags|=MSG_OOB;
- }
- else
- msg->msg_flags|=MSG_TRUNC;
+ tp->urg_data = URG_READ;
if(msg->msg_name)
tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
@@ -1023,6 +986,15 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
/* Read urgent data. */
msg->msg_flags|=MSG_OOB;
release_sock(sk);
+
+ if(len>0)
+ {
+ err = memcpy_toiovec(msg->msg_iov, &c, 1);
+ msg->msg_flags|=MSG_OOB;
+ }
+ else
+ msg->msg_flags|=MSG_TRUNC;
+
return err ? -EFAULT : 1;
}
release_sock(sk);
@@ -1044,45 +1016,37 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
- sk->tp_pinfo.af_tcp.delayed_acks++;
-
__skb_unlink(skb, &sk->receive_queue);
kfree_skb(skb);
}
-
-static void cleanup_rbuf(struct sock *sk)
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary. COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far, it speeds up the
+ * tcp_recvmsg has given to the user so far; it speeds up the
+ * a window update.
+ */
+static void cleanup_rbuf(struct sock *sk, int copied)
{
struct sk_buff *skb;
- struct tcp_opt *tp;
/* NOTE! The socket must be locked, so that we don't get
* a messed-up receive queue.
*/
while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
- if (!skb->used || atomic_read(&skb->users)>1)
+ if (!skb->used || atomic_read(&skb->users) > 1)
break;
tcp_eat_skb(sk, skb);
}
SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk));
- tp = &(sk->tp_pinfo.af_tcp);
-
- /* We send a ACK if the sender is blocked
- * else let tcp_data deal with the acking policy.
+ /* We send an ACK if we can now advertise a non-zero window
+ * which has been raised "significantly".
*/
- if (tp->delayed_acks) {
- __u32 rcv_wnd;
-
- /* FIXME: double check this rule, then check against
- * other use of similar rules. Abtract if possible.
- */
- rcv_wnd = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
-
- if ((rcv_wnd < sk->mss) && (sock_rspace(sk) > rcv_wnd))
- tcp_read_wakeup(sk);
- }
+ if((copied > 0) &&
+ (copied >= tcp_receive_window(&sk->tp_pinfo.af_tcp)))
+ tcp_read_wakeup(sk);
}
@@ -1100,7 +1064,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
volatile u32 *seq; /* So gcc doesn't overoptimise */
unsigned long used;
int err = 0;
- int target = 1; /* Read at least this may bytes */
+ int target = 1; /* Read at least this many bytes */
if (sk->state == TCP_LISTEN)
return -ENOTCONN;
@@ -1113,8 +1077,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
* the multi-reader case neatly (memcpy_to/fromfs might be
* inline and thus not flush cached variables otherwise).
*/
- peek_seq = sk->copied_seq;
- seq = &sk->copied_seq;
+ peek_seq = tp->copied_seq;
+ seq = &tp->copied_seq;
if (flags & MSG_PEEK)
seq = &peek_seq;
@@ -1129,7 +1093,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
u32 offset;
/* Are we at urgent data? Stop if we have read anything. */
- if (copied && sk->urg_data && sk->urg_seq == *seq)
+ if (copied && tp->urg_data && tp->urg_seq == *seq)
break;
/* We need to check signals first, to get correct SIGURG
@@ -1200,7 +1164,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
break;
}
- cleanup_rbuf(sk);
+ cleanup_rbuf(sk, copied);
release_sock(sk);
sk->socket->flags |= SO_WAITDATA;
schedule();
@@ -1222,8 +1186,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
used = len;
/* Do we have urgent data here? */
- if (sk->urg_data) {
- u32 urg_offset = sk->urg_seq - *seq;
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
if (!sk->urginline) {
@@ -1264,8 +1228,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
*/
atomic_dec(&skb->users);
- if (after(sk->copied_seq,sk->urg_seq))
- sk->urg_data = 0;
+ if (after(tp->copied_seq,tp->urg_seq))
+ tp->urg_data = 0;
if (used + offset < skb->len)
continue;
@@ -1303,7 +1267,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
current->state = TASK_RUNNING;
/* Clean up data we have read: This will do ACK frames. */
- cleanup_rbuf(sk);
+ cleanup_rbuf(sk, copied);
release_sock(sk);
return copied;
}
@@ -1356,8 +1320,7 @@ static int tcp_close_state(struct sock *sk, int dead)
* reset mistake.
*/
if(dead && ns==TCP_FIN_WAIT2) {
- int timer_active=del_timer(&sk->timer);
- if(timer_active)
+ if(sk->timer.prev && del_timer(&sk->timer))
add_timer(&sk->timer);
else
tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
@@ -1410,6 +1373,7 @@ static inline int closing(struct sock * sk)
void tcp_close(struct sock *sk, unsigned long timeout)
{
struct sk_buff *skb;
+ int data_was_unread = 0;
/* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
@@ -1421,7 +1385,6 @@ void tcp_close(struct sock *sk, unsigned long timeout)
tcp_close_pending(sk);
release_sock(sk);
sk->dead = 1;
- sk->prot->unhash(sk);
return;
}
@@ -1435,14 +1398,30 @@ void tcp_close(struct sock *sk, unsigned long timeout)
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
- while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
+ while((skb=skb_dequeue(&sk->receive_queue))!=NULL) {
+ data_was_unread++;
kfree_skb(skb);
+ }
- /* Timeout is not the same thing - however the code likes
- * to send both the same way (sigh).
+ /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
+ * 3.10, we send a RST here because data was lost. To
+ * witness the awful effects of the old behavior of always
+ * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
+ * a bulk GET in an FTP client, suspend the process, wait
+ * for the client to advertise a zero window, then kill -9
+ * the FTP client, wheee... Note: timeout is always zero
+ * in such a case.
*/
- if (tcp_close_state(sk,1)==1)
+ if(data_was_unread != 0) {
+ /* Unread data was tossed, zap the connection. */
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk);
+ } else if (tcp_close_state(sk,1)) {
+ /* We FIN if the application ate all the data before
+ * zapping the connection.
+ */
tcp_send_fin(sk);
+ }
if (timeout) {
struct task_struct *tsk = current;
@@ -1470,8 +1449,7 @@ void tcp_close(struct sock *sk, unsigned long timeout)
* we may need to set up a timer.
*/
if (sk->state==TCP_FIN_WAIT2) {
- int timer_active=del_timer(&sk->timer);
- if(timer_active)
+ if(sk->timer.prev && del_timer(&sk->timer))
add_timer(&sk->timer);
else
tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
@@ -1479,9 +1457,6 @@ void tcp_close(struct sock *sk, unsigned long timeout)
sk->dead = 1;
release_sock(sk);
-
- if(sk->state == TCP_CLOSE)
- sk->prot->unhash(sk);
}
/*
@@ -1538,13 +1513,12 @@ struct sock *tcp_accept(struct sock *sk, int flags)
/* If this is a non blocking socket don't sleep */
error = EAGAIN;
if (flags & O_NONBLOCK)
- goto out;
+ goto out;
error = ERESTARTSYS;
req = wait_for_connect(sk, &prev);
if (!req)
- goto out;
- error = 0;
+ goto out;
}
tcp_synq_unlink(tp, req, prev);
@@ -1647,9 +1621,23 @@ void tcp_set_keepalive(struct sock *sk, int val)
__initfunc(void tcp_init(void))
{
tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
- sizeof(struct open_request),
+ sizeof(struct open_request),
0, SLAB_HWCACHE_ALIGN,
NULL, NULL);
if(!tcp_openreq_cachep)
panic("tcp_init: Cannot alloc open_request cache.");
+
+ tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct tcp_bind_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if(!tcp_bucket_cachep)
+ panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+
+ tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
+ sizeof(struct tcp_tw_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if(!tcp_timewait_cachep)
+ panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
}
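
tcp_init() now carves out two more dedicated slab caches for the structures introduced by the TIME_WAIT rework below: tcp_tw_bucket objects are allocated SLAB_ATOMIC in tcp_time_wait() and freed in tcp_timewait_kill(), and bind buckets come from tcp_bucket_cachep. The lifecycle, sketched for a hypothetical cache using only the slab calls visible in this patch (kernel context, not a standalone program):

    struct example_bucket { int cookie; };   /* hypothetical object */

    static kmem_cache_t *example_cachep;

    __initfunc(void example_init(void))
    {
        example_cachep = kmem_cache_create("example_bucket",
                                           sizeof(struct example_bucket),
                                           0, SLAB_HWCACHE_ALIGN,
                                           NULL, NULL);
        if (!example_cachep)
            panic("example_init: Cannot alloc example_bucket cache.");
    }

    static void example_use(void)
    {
        /* SLAB_ATOMIC because, as in tcp_time_wait(), the caller may
         * be in BH context where sleeping is forbidden.
         */
        struct example_bucket *b =
            kmem_cache_alloc(example_cachep, SLAB_ATOMIC);

        if (b != NULL) {
            b->cookie = 1;
            kmem_cache_free(example_cachep, b);
        }
    }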
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 841359739..4b7dcc9e9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.66 1998/01/15 22:40:29 freitag Exp $
+ * Version: $Id: tcp_input.c,v 1.84 1998/03/15 03:23:20 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -67,57 +67,54 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
extern int sysctl_tcp_fin_timeout;
+/* These are on by default so the code paths get tested.
+ * For the final 2.2 this may be undone at our discretion. -DaveM
+ */
+int sysctl_tcp_timestamps = 1;
+int sysctl_tcp_window_scaling = 1;
+
int sysctl_tcp_cong_avoidance;
int sysctl_tcp_hoe_retransmits;
-int sysctl_tcp_sack;
-int sysctl_tcp_tsack;
-int sysctl_tcp_timestamps;
-int sysctl_tcp_window_scaling;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
+int sysctl_tcp_rfc1337;
static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
-/*
- * Called each time to estimate the delayed ack timeout. This is
- * how it should be done so a fast link isnt impacted by ack delay.
- *
- * I think we need a medium deviation here also...
- * The estimated value is changing to fast
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval. When a
+ * connection starts up, we want to ack as quickly as possible. The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission. This means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time. For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue. -DaveM
*/
-
static void tcp_delack_estimator(struct tcp_opt *tp)
{
- int m;
-
- /* Delayed ACK time estimator. */
-
- m = jiffies - tp->lrcvtime;
-
- tp->lrcvtime = jiffies;
+ if(tp->ato == 0) {
+ tp->lrcvtime = jiffies;
- if (m < 0)
- return;
-
- /* if the mesured value is bigger than
- * twice the round trip time ignore it.
- */
- if ((m << 2) <= tp->srtt) {
- m -= (tp->iat >> 3);
- tp->iat += m;
-
- if (m <0)
- m = -m;
-
- m -= (tp->iat_mdev >> 2);
- tp->iat_mdev += m;
+ /* Help sender leave slow start quickly,
+ * this sets our initial ato value.
+ */
+ tcp_enter_quickack_mode(tp);
+ } else {
+ int m = jiffies - tp->lrcvtime;
- tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2);
+ tp->lrcvtime = jiffies;
+ if(m <= 0)
+ m = 1;
+ if(m > tp->rto)
+ tp->ato = tp->rto;
+ else
+ tp->ato = (tp->ato >> 1) + m;
- if (tp->ato < HZ/50)
- tp->ato = HZ/50;
- } else
- tp->ato = 0;
+ /* We are not in "quick ack" mode. */
+ if(tp->ato <= (HZ/100))
+ tp->ato = ((HZ/100)*2);
+ }
}
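
The replacement estimator is a simple exponentially weighted inter-arrival time: each sample m drives ato toward ato/2 + m (capped at rto), so for packets arriving every m jiffies the steady state is about 2m, and the floor keeps the value just above the quick-ack range. The update step as a pure function with a worked run (HZ=100 assumed):

    #include <stdio.h>

    #define HZ 100

    static int ato_update(int ato, int m, int rto)
    {
        if (m <= 0)
            m = 1;
        if (m > rto)
            ato = rto;              /* never wait longer than the RTO */
        else
            ato = (ato >> 1) + m;
        if (ato <= HZ / 100)
            ato = (HZ / 100) * 2;   /* stay out of quick-ack territory */
        return ato;
    }

    int main(void)
    {
        int ato = 40, i;

        /* Data arriving every 2 jiffies: ato converges to 2*m = 4. */
        for (i = 0; i < 8; i++) {
            ato = ato_update(ato, 2, 300);
            printf("ato = %d\n", ato);  /* 22 13 8 6 5 4 4 4 */
        }
        return 0;
    }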
/* Called to compute a smoothed rtt estimate. The data fed to this
@@ -132,9 +129,9 @@ static void tcp_delack_estimator(struct tcp_opt *tp)
static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
- long m;
- /*
- * The following amusing code comes from Jacobson's
+ long m = mrtt; /* RTT */
+
+ /* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
* This is designed to be as fast as possible
@@ -143,12 +140,9 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
* On a 1990 paper the rto value is changed to:
* RTO = rtt + 4 * mdev
*/
-
- m = mrtt; /* RTT */
-
+ if(m == 0)
+ m = 1;
if (tp->srtt != 0) {
- if(m<=0)
- m=1; /* IS THIS RIGHT FOR <0 ??? */
m -= (tp->srtt >> 3); /* m is now error in rtt est */
tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0)
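
In unscaled terms (the stored srtt carries a factor of 8 and mdev a factor of 4) this hunk, together with the rest of the function not shown in the diff, implements the usual Jacobson/Karels updates from the SIGCOMM '88 paper and its 1990 revision:

    \mathrm{srtt} \leftarrow \tfrac{7}{8}\,\mathrm{srtt} + \tfrac{1}{8}\,m,
    \qquad
    \mathrm{mdev} \leftarrow \tfrac{3}{4}\,\mathrm{mdev}
                 + \tfrac{1}{4}\,\bigl|\,m - \mathrm{srtt}\,\bigr|,
    \qquad
    \mathrm{rto} = \mathrm{srtt} + 4\,\mathrm{mdev}

The restructuring also hoists the zero-RTT clamp (m == 0 becomes 1 jiffy) out of the srtt != 0 branch, so it now applies to the very first measurement as well.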
@@ -202,19 +196,17 @@ extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq)
*/
if (!before(end_seq,tp->last_ack_sent)) {
tp->ts_recent = tp->rcv_tsval;
- /* FIXME: need a corse timestamp. Days uptime
- * would be good.
- */
tp->ts_recent_stamp = jiffies;
}
}
+#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
+
extern __inline__ int tcp_paws_discard(struct tcp_opt *tp)
{
- /* FIXME: must check that ts_recent is not
- * more than 24 days old here. Yuck.
- */
- return ((s32)(tp->rcv_tsval-tp->ts_recent) < 0);
+ /* ts_recent must be younger than 24 days */
+ return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
+ ((s32)(tp->rcv_tsval-tp->ts_recent) < 0));
}
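
Two separate guards make up the new tcp_paws_discard(): the signed-difference test orders 32-bit timestamps correctly across wraparound, and the PAWS_24DAYS bound (HZ*60*60*24*24 = 207,360,000 jiffies at HZ=100, comfortably inside the roughly 248-day horizon where an s32 jiffies difference stays meaningful) replaces the old FIXME by refusing to trust a ts_recent that has not been refreshed for 24 days. The serial-number comparison in isolation:

    #include <stdio.h>
    #include <stdint.h>

    /* Valid as long as the true distance is under 2^31 ticks. */
    static int ts_older(uint32_t val, uint32_t recent)
    {
        return (int32_t)(val - recent) < 0;
    }

    int main(void)
    {
        printf("%d\n", ts_older(100, 200));        /* 1: plainly older */
        printf("%d\n", ts_older(5, 0xfffffff0u));  /* 0: newer, wrapped */
        return 0;
    }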
@@ -257,8 +249,6 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb)
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->state) {
- case TCP_TIME_WAIT:
- break;
case TCP_SYN_SENT:
sk->err = ECONNREFUSED;
break;
@@ -268,23 +258,8 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb)
default:
sk->err = ECONNRESET;
};
-#ifdef CONFIG_TCP_RFC1337
- /*
- * Time wait assassination protection [RFC1337]
- *
- * This is a good idea, but causes more sockets to take time to close.
- *
- * Ian Heavens has since shown this is an inadequate fix for the protocol
- * bug in question.
- */
- if(sk->state!=TCP_TIME_WAIT) {
- tcp_set_state(sk,TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- }
-#else
tcp_set_state(sk,TCP_CLOSE);
sk->shutdown = SHUTDOWN_MASK;
-#endif
if (!sk->dead)
sk->state_change(sk);
}
@@ -302,7 +277,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
int length=(th->doff*4)-sizeof(struct tcphdr);
ptr = (unsigned char *)(th + 1);
- tp->sacks = 0;
tp->saw_tstamp = 0;
while(length>0) {
@@ -336,10 +310,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
tp->snd_wscale = *(__u8 *)ptr;
}
break;
- case TCPOPT_SACK_PERM:
- if(opsize==TCPOLEN_SACK_PERM && th->syn)
- if (sysctl_tcp_sack && !no_fancy)
- tp->sack_ok = 1;
case TCPOPT_TIMESTAMP:
if(opsize==TCPOLEN_TIMESTAMP) {
/* Cheaper to set again then to
@@ -353,18 +323,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
}
}
break;
- case TCPOPT_SACK:
- if (no_fancy || !sysctl_tcp_sack)
- break;
- tp->sacks = (opsize-2)>>3;
- if (tp->sacks<<3 == opsize-2) {
- int i;
- for (i = 0; i < tp->sacks; i++) {
- tp->left_sack[i] = ntohl(((__u32 *)ptr)[2*i]);
- tp->right_sack[i] = ntohl(((__u32 *)ptr)[2*i+1]);
- }
- } else
- tp->sacks = 0;
}
ptr+=opsize-2;
length-=opsize;
@@ -374,7 +332,7 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_option().
- * This should probably get extended for timestamps + SACK as well.
+ * This should probably get extended for timestamps as well.
* Assembly code anyone? -- erics
*/
static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp)
@@ -384,14 +342,12 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *
return 0;
if (th->doff == sizeof(struct tcphdr)>>2) {
tp->saw_tstamp = 0;
- tp->sacks = 0;
return 0;
- } else if (th->doff == (sizeof(struct tcphdr)>>2)+3) {
+ } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
__u32 *ptr = (__u32 *)(th + 1);
- if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->saw_tstamp = 1;
- tp->sacks = 0;
tp->rcv_tsval = ntohl(*++ptr);
tp->rcv_tsecr = ntohl(*++ptr);
return 1;
@@ -401,89 +357,6 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *
return 1;
}
-#if 0
-
-/*
- * This is the old fast retransmit code. It will go away eventually. -- erics
- */
-
-/*
- * See draft-stevens-tcpca-spec-01 for documentation.
- */
-
-static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
-{
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
-
- /* FIXME: if we are already retransmitting should this code
- * be skipped? [Floyd high_seq check sort of does this]
- * The case I'm worried about is falling into a fast
- * retransmit on a link with a congestion window of 1 or 2.
- * There was some evidence in 2.0.x that this was problem
- * on really slow links (1200 or 2400 baud). I need to
- * try this situation again and see what happens.
- */
-
- /*
- * An ACK is a duplicate if:
- * (1) it has the same sequence number as the largest number we've
- * seen,
- * (2) it has the same window as the last ACK,
- * (3) we have outstanding data that has not been ACKed
- * (4) The packet was not carrying any data.
- * (5) [From Floyds paper on fast retransmit wars]
- * The packet acked data after high_seq;
- */
-
- if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
- /* 1. When the third duplicate ack is received, set ssthresh
- * to one half the current congestion window, but no less
- * than two segments. Retransmit the missing segment.
- */
- if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
- tp->dup_acks++;
-
- if (tp->dup_acks == 3) {
- tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
- tp->snd_cwnd = tp->snd_ssthresh + 3;
- tcp_do_retransmit(sk, 0);
-
- /* Careful not to timeout just after fast
- * retransmit!
- */
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- }
- }
-
- /* 2. Each time another duplicate ACK arrives, increment
- * cwnd by the segment size. [...] Transmit a packet...
- *
- * Packet transmission will be done on normal flow processing
- * since we're not in "retransmit mode".
- */
- if (tp->dup_acks >= 3) {
- tp->dup_acks++;
- tp->snd_cwnd++;
- }
- } else {
- /* 3. When the next ACK arrives that acknowledges new data,
- * set cwnd to ssthresh.
- */
- if (tp->dup_acks >= 3) {
- tp->retrans_head = NULL;
- tp->snd_cwnd = max(tp->snd_ssthresh, 1);
- tp->retransmits = 0;
- }
- tp->dup_acks = 0;
-
- /* FIXME: This is wrong if the new ack that arrives
- * is below the value for high_seq.
- */
- tp->high_seq = 0;
- }
-}
-#endif
-
#define FLAG_DATA 0x01
#define FLAG_WIN_UPDATE 0x02
#define FLAG_DATA_ACKED 0x04
@@ -579,9 +452,8 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
* not indicate a packet left the system.
* We can test this by just checking
* if ack changed from snd_una, since
- * the only way to get here without changing
- * advancing from snd_una is if this was a
- * window update.
+ * the only way to get here without advancing
+ * from snd_una is if this was a window update.
*/
if (ack != tp->snd_una && before(ack,tp->high_seq)) {
tcp_do_retransmit(sk, 0);
@@ -596,9 +468,6 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
clear_fast_retransmit(sk);
}
}
- } else {
- /* Clear any aborted fast retransmit starts. */
- tp->dup_acks = 0;
}
}
@@ -649,7 +518,6 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
expected = (tp->snd_nxt - tp->snd_una) * inv_basertt;
- /* XXX sk->mss should move into tcp_opt as well -DaveM */
inv_basebd = sk->mss * inv_basertt;
/* Slow Start */
@@ -731,13 +599,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq,
int acked = 0;
while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
-#ifdef TCP_DEBUG
- /* Check for a bug. */
- if (skb->next != (struct sk_buff*) &sk->write_queue &&
- after(skb->end_seq, skb->next->seq))
- printk(KERN_DEBUG "INET: tcp_input.c: *** "
- "bug send_list out of order.\n");
-#endif
/* If our packet is before the ack sequence we can
* discard it as it's confirmed to have arrived the
* other end.
@@ -745,12 +606,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq,
if (after(skb->end_seq, ack))
break;
-#if 0
- SOCK_DEBUG(sk, "removing seg %x-%x from retransmit queue\n",
- skb->seq, skb->end_seq);
-#endif
-
- acked = FLAG_DATA_ACKED;
+ /* Initial outgoing SYN's get put onto the write_queue
+ * just like anything else we transmit. It is not
+ * true data, and if we misinform our callers that
+ * this ACK acks real data, we will erroneously exit
+ * connection startup slow start one packet too
+ * quickly. This is severely frowned upon behavior.
+ */
+ if(!skb->h.th->syn)
+ acked = FLAG_DATA_ACKED;
/* FIXME: packet counting may break if we have to
* do packet "repackaging" for stacks that don't
@@ -766,11 +630,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq,
kfree_skb(skb);
}
- if (acked) {
+ if (acked)
tp->retrans_head = NULL;
- if (!sk->dead)
- sk->write_space(sk);
- }
+
return acked;
}
@@ -795,6 +657,66 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack)
}
}
+/* Read draft-ietf-tcplw-high-performance before mucking
+ * with this code. (Superceeds RFC1323)
+ * with this code. (Supersedes RFC1323)
+static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
+ u32 seq, u32 ack, int flag)
+{
+ __u32 seq_rtt = (jiffies-tp->rcv_tsecr);
+ tcp_rtt_estimator(tp, seq_rtt);
+ if (tp->retransmits) {
+ if (tp->packets_out == 0) {
+ tp->retransmits = 0;
+ tp->backoff = 0;
+ tcp_set_rto(tp);
+ } else {
+ /* Still retransmitting, use backoff */
+ tcp_set_rto(tp);
+ tp->rto = tp->rto << tp->backoff;
+ }
+ } else {
+ tcp_set_rto(tp);
+ if (flag & FLAG_DATA_ACKED)
+ (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+ }
+ /* NOTE: safe here so long as cong_ctl doesn't use rto */
+ tcp_bound_rto(tp);
+}
+
+static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
+{
+ struct sk_buff *skb;
+ long when;
+
+ skb = skb_peek(&sk->write_queue);
+ when = tp->rto - (jiffies - skb->when);
+
+ /* FIXME: This assumes that when we are retransmitting
+ * we should only ever respond with one packet.
+ * This means congestion windows should not grow
+ * during recovery. In 2.0.X we allow the congestion
+ * window to grow. It is not clear to me which
+ * decision is correct. The RFCs should be double
+ * checked as should the behavior of other stacks.
+ * Also note that if we do want to allow the
+ * congestion window to grow during retransmits
+ * we have to fix the call to congestion window
+ * updates so that it works during retransmission.
+ */
+ if (tp->retransmits) {
+ tp->retrans_head = NULL;
+
+		/* This is tricky. We are retransmitting a
+		 * segment of a window when congestion occurred.
+ */
+ tcp_do_retransmit(sk, 0);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ } else {
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
+ }
+}
+
/*
* This routine deals with incoming acks, but not outgoing ones.
*/
@@ -806,7 +728,6 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
int flag = 0;
u32 seq = 0;
u32 seq_rtt = 0;
- struct sk_buff *skb;
if(sk->zapped)
return(1); /* Dead, can't ack any more so why bother */
@@ -838,7 +759,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
*/
if (before(tp->snd_wl1, ack_seq) ||
(tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
- unsigned long nwin = ntohs(th->window) << tp->snd_wscale;
+ u32 nwin = ntohs(th->window) << tp->snd_wscale;
if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
flag |= FLAG_WIN_UPDATE;
@@ -869,28 +790,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
/* If we have a timestamp, we always do rtt estimates. */
if (tp->saw_tstamp) {
- /* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Superceeds RFC1323)
- */
- seq_rtt = (jiffies-tp->rcv_tsecr);
- tcp_rtt_estimator(tp, seq_rtt);
- if (tp->retransmits) {
- if (tp->packets_out == 0) {
- tp->retransmits = 0;
- tp->backoff = 0;
- tcp_set_rto(tp);
- } else {
- /* Still retransmitting, use backoff */
- tcp_set_rto(tp);
- tp->rto = tp->rto << tp->backoff;
- }
- } else {
- tcp_set_rto(tp);
- if (flag & FLAG_DATA_ACKED)
- (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
- }
- /* NOTE: safe here so long as cong_ctl doesn't use rto */
- tcp_bound_rto(tp);
+ tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
} else {
/* If we were retransmiting don't count rtt estimate. */
if (tp->retransmits) {
@@ -916,51 +816,217 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
}
if (tp->packets_out) {
- if (flag & FLAG_DATA_ACKED) {
- long when;
-
- skb = skb_peek(&sk->write_queue);
- when = tp->rto - (jiffies - skb->when);
-
- /* FIXME: This assumes that when we are retransmitting
- * we should only ever respond with one packet.
- * This means congestion windows should not grow
- * during recovery. In 2.0.X we allow the congestion
- * window to grow. It is not clear to me which
- * decision is correct. The RFCs should be double
- * checked as should the behavior of other stacks.
- * Also note that if we do want to allow the
- * congestion window to grow during retransmits
- * we have to fix the call to congestion window
- * updates so that it works during retransmission.
- */
- if (tp->retransmits) {
- tp->retrans_head = NULL;
-
- /* This is tricky. We are retransmiting a
- * segment of a window when congestion occured.
- */
- tcp_do_retransmit(sk, 0);
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- } else
- tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
- }
- } else
+ if (flag & FLAG_DATA_ACKED)
+ tcp_ack_packets_out(sk, tp);
+ } else {
tcp_clear_xmit_timer(sk, TIME_RETRANS);
+ }
- tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE)));
-
+ flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
+ if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
+ (tp->high_seq != 0)) {
+ tcp_fast_retrans(sk, ack, flag);
+ } else {
+ /* Clear any aborted fast retransmit starts. */
+ tp->dup_acks = 0;
+ }
/* Remember the highest ack received. */
tp->snd_una = ack;
-
return 1;
uninteresting_ack:
-
SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
return 0;
}
+/* New-style handling of TIME_WAIT sockets. */
+static void tcp_timewait_kill(unsigned long __arg)
+{
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg;
+
+ /* Zap the timer. */
+ del_timer(&tw->timer);
+
+ /* Unlink from various places. */
+ if(tw->bind_next)
+ tw->bind_next->bind_pprev = tw->bind_pprev;
+ *(tw->bind_pprev) = tw->bind_next;
+ if(tw->tb->owners == NULL)
+ tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
+
+ if(tw->next)
+ tw->next->pprev = tw->pprev;
+ *tw->pprev = tw->next;
+
+ /* We decremented the prot->inuse count when we entered TIME_WAIT
+ * and the sock from which this came was destroyed.
+ */
+ tw->sklist_next->sklist_prev = tw->sklist_prev;
+ tw->sklist_prev->sklist_next = tw->sklist_next;
+
+ /* Ok, now free it up. */
+ kmem_cache_free(tcp_timewait_cachep, tw);
+}
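
The unlink sequence in tcp_timewait_kill() (and the mirrored inserts in tcp_tw_hashdance() further down) relies on the pprev idiom used throughout this patch: each node stores the address of whatever pointer currently points at it, so removal needs no list walk and no special case for the head. A self-contained user-space model:

    #include <stdio.h>

    struct node {
        struct node *next;
        struct node **pprev;  /* address of the pointer pointing at us */
        int val;
    };

    static void insert_head(struct node **head, struct node *n)
    {
        if ((n->next = *head) != NULL)
            (*head)->pprev = &n->next;
        *head = n;
        n->pprev = head;
    }

    static void unlink_node(struct node *n)
    {
        if (n->next)
            n->next->pprev = n->pprev;
        *n->pprev = n->next;
    }

    int main(void)
    {
        struct node *head = NULL, *p;
        struct node a = {0}, b = {0}, c = {0};

        a.val = 1; b.val = 2; c.val = 3;
        insert_head(&head, &a);
        insert_head(&head, &b);
        insert_head(&head, &c);      /* list: c -> b -> a */
        unlink_node(&b);             /* middle node */
        unlink_node(&c);             /* head node: same code path */
        for (p = head; p; p = p->next)
            printf("%d\n", p->val);  /* prints just 1 */
        return 0;
    }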
+
+/* We come here as a special case from the AF specific TCP input processing,
+ * and the SKB has no owner. Essentially handling this is very simple,
+ * we just keep silently eating rx'd packets until none show up for the
+ * entire timeout period. The only special cases are for BSD TIME_WAIT
+ * reconnects and SYN/RST bits being set in the TCP header.
+ */
+int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
+ struct tcphdr *th, void *opt, __u16 len)
+{
+ /* RFC 1122:
+ * "When a connection is [...] on TIME-WAIT state [...]
+ * [a TCP] MAY accept a new SYN from the remote TCP to
+ * reopen the connection directly, if it:
+ *
+ * (1) assigns its initial sequence number for the new
+ * connection to be larger than the largest sequence
+ * number it used on the previous connection incarnation,
+ * and
+ *
+ * (2) returns to TIME-WAIT state if the SYN turns out
+ * to be an old duplicate".
+ */
+ if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) {
+ struct sock *sk;
+ struct tcp_func *af_specific = tw->af_specific;
+ __u32 isn;
+
+ isn = tw->rcv_nxt + 128000;
+ if(isn == 0)
+ isn++;
+ tcp_timewait_kill((unsigned long)tw);
+ sk = af_specific->get_sock(skb, th);
+ if(sk == NULL || !ipsec_sk_policy(sk,skb))
+ return 0;
+ skb_set_owner_r(skb, sk);
+ af_specific = sk->tp_pinfo.af_tcp.af_specific;
+ if(af_specific->conn_request(sk, skb, opt, isn) < 0)
+ return 1; /* Toss a reset back. */
+ return 0; /* Discard the frame. */
+ }
+
+ /* Check RST or SYN */
+ if(th->rst || th->syn) {
+		/* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if(sysctl_tcp_rfc1337 == 0)
+ tcp_timewait_kill((unsigned long)tw);
+
+ if(!th->rst)
+ return 1; /* toss a reset back */
+ } else {
+ if(th->ack) {
+ /* In this case we must reset the TIMEWAIT timer. */
+ del_timer(&tw->timer);
+ tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN;
+ add_timer(&tw->timer);
+ }
+ }
+ return 0; /* Discard the frame. */
+}
+
+/* Enter the time wait state. This is always called from BH
+ * context. Essentially we whip up a timewait bucket, copy the
+ * relevant info into it from the SK, and mess with hash chains
+ * and list linkage.
+ */
+static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
+{
+ struct sock **head, *sktw;
+
+ /* Step 1: Remove SK from established hash. */
+ if(sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ tcp_reg_zap(sk);
+
+ /* Step 2: Put TW into bind hash where SK was. */
+ tw->tb = (struct tcp_bind_bucket *)sk->prev;
+ if((tw->bind_next = sk->bind_next) != NULL)
+ sk->bind_next->bind_pprev = &tw->bind_next;
+ tw->bind_pprev = sk->bind_pprev;
+ *sk->bind_pprev = (struct sock *)tw;
+
+ /* Step 3: Same for the protocol sklist. */
+ (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
+ (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
+ sk->sklist_next = NULL;
+ sk->prot->inuse--;
+
+ /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
+ head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
+ sktw = (struct sock *)tw;
+ if((sktw->next = *head) != NULL)
+ (*head)->pprev = &sktw->next;
+ *head = sktw;
+ sktw->pprev = head;
+}
+
+void tcp_time_wait(struct sock *sk)
+{
+ struct tcp_tw_bucket *tw;
+
+ tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
+ if(tw != NULL) {
+ /* Give us an identity. */
+ tw->daddr = sk->daddr;
+ tw->rcv_saddr = sk->rcv_saddr;
+ tw->bound_dev_if= sk->bound_dev_if;
+ tw->num = sk->num;
+ tw->state = TCP_TIME_WAIT;
+ tw->family = sk->family;
+ tw->source = sk->dummy_th.source;
+ tw->dest = sk->dummy_th.dest;
+ tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
+ tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if(tw->family == AF_INET6) {
+ memcpy(&tw->v6_daddr,
+ &sk->net_pinfo.af_inet6.daddr,
+ sizeof(struct in6_addr));
+ memcpy(&tw->v6_rcv_saddr,
+ &sk->net_pinfo.af_inet6.rcv_saddr,
+ sizeof(struct in6_addr));
+ }
+#endif
+ /* Linkage updates. */
+ tcp_tw_hashdance(sk, tw);
+
+ /* Get the TIME_WAIT timeout firing. */
+ init_timer(&tw->timer);
+ tw->timer.function = tcp_timewait_kill;
+ tw->timer.data = (unsigned long) tw;
+ tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN;
+ add_timer(&tw->timer);
+
+ /* CLOSE the SK. */
+ if(sk->state == TCP_ESTABLISHED)
+ tcp_statistics.TcpCurrEstab--;
+ sk->state = TCP_CLOSE;
+ net_reset_timer(sk, TIME_DONE,
+ min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
+ } else {
+ /* Sorry, we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ tcp_set_state(sk, TCP_CLOSE);
+ }
+
+ /* Prevent rcvmsg/sndmsg calls, and wake people up. */
+ sk->shutdown = SHUTDOWN_MASK;
+ if(!sk->dead)
+ sk->state_change(sk);
+}
+
/*
* Process the FIN bit. This now behaves as it is supposed to work
* and the FIN takes effect when it is validly part of sequence
@@ -976,17 +1042,9 @@ uninteresting_ack:
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
- if(sk->state == TCP_SYN_SENT) {
- /* RFC793 says to drop the segment and return. */
- return 1;
- }
-
- /* XXX This fin_seq thing should disappear... -DaveM */
- tp->fin_seq = skb->end_seq;
+ sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq;
tcp_send_ack(sk);
@@ -1013,12 +1071,6 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
case TCP_LAST_ACK:
/* RFC793: Remain in the LAST-ACK state. */
break;
- case TCP_TIME_WAIT:
- /* Received a retransmission of the FIN,
- * restart the TIME_WAIT timer.
- */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
@@ -1035,21 +1087,15 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- sk->shutdown |= SHUTDOWN_MASK;
- tcp_set_state(sk,TCP_TIME_WAIT);
- break;
- case TCP_CLOSE:
- /* Already in CLOSE. */
+ tcp_time_wait(sk);
break;
default:
- /* Only TCP_LISTEN is left, in that case we should never
- * reach this piece of code.
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+ * cases we should never reach this piece of code.
*/
printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
break;
};
- return 0;
}
/* This one checks to see if we can put data from the
@@ -1060,7 +1106,7 @@ static void tcp_ofo_queue(struct sock *sk)
struct sk_buff *skb;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- while ((skb = skb_peek(&sk->out_of_order_queue))) {
+ while ((skb = skb_peek(&tp->out_of_order_queue))) {
if (after(skb->seq, tp->rcv_nxt))
break;
@@ -1076,6 +1122,8 @@ static void tcp_ofo_queue(struct sock *sk)
skb_unlink(skb);
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ if(skb->h.th->fin)
+ tcp_fin(skb, sk, skb->h.th);
}
}
@@ -1094,8 +1142,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
dst_confirm(sk->dst_cache);
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ if(skb->h.th->fin)
+ tcp_fin(skb, sk, skb->h.th);
+ else
+ tp->delayed_acks++;
tcp_ofo_queue(sk);
- if (skb_queue_len(&sk->out_of_order_queue) == 0)
+ if (skb_queue_len(&tp->out_of_order_queue) == 0)
tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
return;
}
@@ -1104,8 +1156,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (!after(skb->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an imediate ack. */
SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq);
-
- tp->delayed_acks = MAX_DELAY_ACK;
+ tcp_enter_quickack_mode(tp);
kfree_skb(skb);
return;
}
@@ -1119,7 +1170,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
}
/* Ok. This is an out_of_order segment, force an ack. */
- tp->delayed_acks = MAX_DELAY_ACK;
+ tp->delayed_acks++;
+ tcp_enter_quickack_mode(tp);
/* Disable header predition. */
tp->pred_flags = 0;
@@ -1127,10 +1179,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, skb->seq, skb->end_seq);
- if (skb_peek(&sk->out_of_order_queue) == NULL) {
- skb_queue_head(&sk->out_of_order_queue,skb);
+ if (skb_peek(&tp->out_of_order_queue) == NULL) {
+ skb_queue_head(&tp->out_of_order_queue,skb);
} else {
- for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) {
+ for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
/* Already there. */
if (skb->seq == skb1->seq && skb->len >= skb1->len) {
skb_append(skb1, skb);
@@ -1145,8 +1197,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
}
/* See if we've hit the start. If so insert. */
- if (skb1 == skb_peek(&sk->out_of_order_queue)) {
- skb_queue_head(&sk->out_of_order_queue,skb);
+ if (skb1 == skb_peek(&tp->out_of_order_queue)) {
+ skb_queue_head(&tp->out_of_order_queue,skb);
break;
}
}
@@ -1172,23 +1224,17 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
if (skb->len == 0 && !th->fin)
return(0);
- /* FIXME: don't accept data after the received fin.
- *
- * Would checking snd_seq against fin_seq be enough?
- * If so, how do we handle that case exactly? -DaveM
- */
-
/* We no longer have anyone receiving data on this connection. */
tcp_data_queue(sk, skb);
- if (before(tp->rcv_nxt, sk->copied_seq)) {
+ if (before(tp->rcv_nxt, tp->copied_seq)) {
printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
- tp->rcv_nxt = sk->copied_seq;
+ tp->rcv_nxt = tp->copied_seq;
}
- tp->delayed_acks++;
-
- /* Now tell the user we may have some data. */
+ /* Above, tcp_data_queue() increments delayed_acks appropriately.
+ * Now tell the user we may have some data.
+ */
if (!sk->dead) {
SOCK_DEBUG(sk, "Data wakeup.\n");
sk->data_ready(sk,0);
@@ -1204,23 +1250,10 @@ static void tcp_data_snd_check(struct sock *sk)
if ((skb = tp->send_head)) {
if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
tp->packets_out < tp->snd_cwnd ) {
- /* Add more data to the send queue. */
-
- /* FIXME: the congestion window is checked
- * again in tcp_write_xmit anyway?! -- erics
- *
- * I think it must, it bumps tp->packets_out for
- * each packet it fires onto the wire. -DaveM
- */
+ /* Put more data onto the wire. */
tcp_write_xmit(sk);
- if(!sk->dead)
- sk->write_space(sk);
} else if (tp->packets_out == 0 && !tp->pending) {
- /* Data to queue but no room. */
-
- /* FIXME: Is it right to do a zero window probe into
- * a congestion window limited window??? -- erics
- */
+			/* Start probing the receiver's window. */
tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
}
}
@@ -1240,12 +1273,24 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk)
* - delay time <= 0.5 HZ
* - we don't have a window update to send
* - must send at least every 2 full sized packets
+ *
+	 * With an extra heuristic to handle packet-loss
+	 * situations and to help the sender leave slow
+	 * start in an expedient manner.
*/
- if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk))
+ /* Two full frames received or... */
+ if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) ||
+ /* We will update the window "significantly" or... */
+ tcp_raise_window(sk) ||
+ /* We entered "quick ACK" mode */
+ tcp_in_quickack_mode(tp)) {
+ /* Then ack it now */
tcp_send_ack(sk);
- else
- tcp_send_delayed_ack(sk, HZ/2);
+ } else {
+ /* Else, send delayed ack. */
+ tcp_send_delayed_ack(tp, HZ/2);
+ }
}
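
Condensed, the new ack-now test is a three-way OR. A hedged restatement of just the predicate, with the sock/tcp_opt plumbing reduced to plain integers and tcp_raise_window()/tcp_in_quickack_mode() reduced to flags:

/* 1 = ACK immediately, 0 = schedule a delayed ACK (roughly HZ/2). */
int ack_now(unsigned int rcv_nxt, unsigned int rcv_wup,
	    unsigned int mss, int window_raise, int quickack)
{
	if (rcv_nxt - rcv_wup >= (mss << 1))	/* two full frames unacked */
		return 1;
	if (window_raise)			/* significant window update */
		return 1;
	if (quickack)				/* quick-ACK mode entered */
		return 1;
	return 0;
}
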
static __inline__ void tcp_ack_snd_check(struct sock *sk)
@@ -1279,11 +1324,11 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
ptr += ntohl(th->seq);
/* Ignore urgent data that we've already seen and read. */
- if (after(sk->copied_seq, ptr))
+ if (after(tp->copied_seq, ptr))
return;
/* Do we already have a newer (or duplicate) urgent pointer? */
- if (sk->urg_data && !after(ptr, sk->urg_seq))
+ if (tp->urg_data && !after(ptr, tp->urg_seq))
return;
/* Tell the world about our new urgent pointer. */
@@ -1296,14 +1341,14 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
/* We may be adding urgent data when the last byte read was
* urgent. To do this requires some care. We cannot just ignore
- * sk->copied_seq since we would read the last urgent byte again
+ * tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
*/
- if (sk->urg_seq == sk->copied_seq)
- sk->copied_seq++; /* Move the copied sequence on correctly */
- sk->urg_data = URG_NOTYET;
- sk->urg_seq = ptr;
+ if (tp->urg_seq == tp->copied_seq)
+ tp->copied_seq++; /* Move the copied sequence on correctly */
+ tp->urg_data = URG_NOTYET;
+ tp->urg_seq = ptr;
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -1312,17 +1357,19 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
/* This is the 'fast' part of urgent handling. */
static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
/* Check if we get a new urgent pointer - normally not. */
if (th->urg)
tcp_check_urg(sk,th);
/* Do we wait for any urgent data? - normally not... */
- if (sk->urg_data == URG_NOTYET) {
- u32 ptr = sk->urg_seq - ntohl(th->seq) + (th->doff*4);
+ if (tp->urg_data == URG_NOTYET) {
+ u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
/* Is the urgent pointer pointing into this packet? */
if (ptr < len) {
- sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
+ tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
if (!sk->dead)
sk->data_ready(sk,0);
}
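
The pointer arithmetic deserves a gloss: tp->urg_seq is an absolute sequence number, so subtracting the segment's starting sequence and adding the header length (doff words times 4) yields a byte offset from the start of the TCP header. A self-contained sketch of that calculation:

/* Locate the urgent byte within one segment, or return 0 if it has
 * not arrived yet.  'th' points at the TCP header and 'len' is the
 * segment length counted from 'th', header included, as in tcp_urg().
 */
const unsigned char *urgent_byte(const unsigned char *th,
				 unsigned int seg_seq, unsigned int doff,
				 unsigned int len, unsigned int urg_seq)
{
	unsigned int ptr = urg_seq - seg_seq + doff * 4;

	return (ptr < len) ? th + ptr : 0;
}
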
@@ -1335,33 +1382,39 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len
*/
static void prune_queue(struct sock *sk)
{
- struct tcp_opt *tp;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct sk_buff * skb;
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", sk->copied_seq);
+ SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
/* First Clean the out_of_order queue. */
	/* Start with the end because the least useful packets
	 * are probably there (crossing fingers).
*/
- while ((skb = skb_dequeue_tail(&sk->out_of_order_queue))) {
+ while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) {
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
return;
}
- tp = &sk->tp_pinfo.af_tcp;
-
/* Now continue with the receive queue if it wasn't enough */
while ((skb = skb_peek_tail(&sk->receive_queue))) {
+ /* Never toss anything when we've seen the FIN.
+ * It's just too complex to recover from it.
+ */
+ if(skb->h.th->fin)
+ break;
+
/* Never remove packets that have been already acked */
if (before(skb->end_seq, tp->last_ack_sent+1)) {
printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n",
- sk->copied_seq, skb->end_seq, tp->last_ack_sent);
+ tp->copied_seq, skb->end_seq, tp->last_ack_sent);
break;
}
skb_unlink(skb);
tp->rcv_nxt = skb->seq;
+ SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
+ skb->seq, skb->end_seq, tp->copied_seq);
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
break;
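
The pruning policy, then: free from the tail of the least useful queue first, never discard a FIN or already-ACKed data, and stop as soon as rmem_alloc drops back under rcvbuf. Stripped of the sk_buff machinery the control flow is roughly this (stand-in types, illustrative only):

#include <stdlib.h>

struct seg { struct seg *next; unsigned int truesize; };

/* 'stack' links segments tail-first; pop and free until memory
 * usage is back under 'limit'.
 */
void prune_tail_first(struct seg **stack, unsigned int *usage,
		      unsigned int limit)
{
	while (*usage > limit && *stack != 0) {
		struct seg *s = *stack;

		*stack = s->next;
		*usage -= s->truesize;
		free(s);
	}
}
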
@@ -1429,7 +1482,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
} else if (skb->ack_seq == tp->snd_una) {
/* Bulk data transfer: receiver */
-
if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
goto discard;
@@ -1441,18 +1493,13 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ /* FIN bit check is not done since if FIN is set in
+ * this frame, the pred_flags won't match up. -DaveM
+ */
sk->data_ready(sk, 0);
tcp_delack_estimator(tp);
-
-#if 1 /* This checks for required window updates too. */
tp->delayed_acks++;
__tcp_ack_snd_check(sk);
-#else
- if (tp->delayed_acks++ == 0)
- tcp_send_delayed_ack(sk, HZ/2);
- else
- tcp_send_ack(sk);
-#endif
return 0;
}
}
@@ -1469,7 +1516,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
}
- if(th->syn && skb->seq != sk->syn_seq) {
+ if(th->syn && skb->seq != tp->syn_seq) {
SOCK_DEBUG(sk, "syn in established state\n");
tcp_statistics.TcpInErrs++;
tcp_reset(sk, skb);
@@ -1490,10 +1537,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* step 7: process the segment text */
queued = tcp_data(skb, sk, len);
- /* step 8: check the FIN bit */
- if (th->fin)
- (void) tcp_fin(skb, sk, th);
-
tcp_data_snd_check(sk);
/* If our receive queue has grown past its limits shrink it */
@@ -1657,19 +1700,19 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_wnd = htons(th->window) << tp->snd_wscale;
tp->snd_wl1 = skb->seq;
tp->snd_wl2 = skb->ack_seq;
-
tp->fin_seq = skb->seq;
tcp_set_state(sk, TCP_ESTABLISHED);
tcp_parse_options(th,tp,0);
- /* FIXME: need to make room for SACK still */
+
if (tp->wscale_ok == 0) {
tp->snd_wscale = tp->rcv_wscale = 0;
tp->window_clamp = min(tp->window_clamp,65535);
}
if (tp->tstamp_ok) {
- tp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: Define constant! */
- sk->dummy_th.doff += 3; /* reserve space of options */
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ sk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2);
} else
tp->tcp_header_len = sizeof(struct tcphdr);
if (tp->saw_tstamp) {
@@ -1680,14 +1723,30 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Can't be earlier, doff would be wrong. */
tcp_send_ack(sk);
- if (tp->in_mss)
- sk->mss = min(sk->mss, tp->in_mss);
-
- /* Take out space for tcp options. */
- sk->mss -= tp->tcp_header_len - sizeof(struct tcphdr);
+ /* Check for the case where we tried to advertise
+ * a window including timestamp options, but did not
+ * end up using them for this connection.
+ */
+ if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps)
+ sk->mss += TCPOLEN_TSTAMP_ALIGNED;
+ /* Now limit it if the other end negotiated a smaller
+ * value.
+ */
+ if (tp->in_mss) {
+ int real_mss = tp->in_mss;
+
+ /* We store MSS locally with the timestamp bytes
+			 * subtracted; TCPs advertise it with them
+ * included. Account for this fact.
+ */
+ if(tp->tstamp_ok)
+ real_mss -= TCPOLEN_TSTAMP_ALIGNED;
+ sk->mss = min(sk->mss, real_mss);
+ }
+
sk->dummy_th.dest = th->source;
- sk->copied_seq = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
if(!sk->dead) {
sk->state_change(sk);
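
A worked example of the MSS accounting a few lines up, assuming TCPOLEN_TSTAMP_ALIGNED is 12 (ten option bytes padded to three 32-bit words): on an Ethernet path we store 1448 locally while 1460 goes on the wire, and a peer advertising 1460 with timestamps agreed is likewise worth 1448 before the min(). As a sketch:

#define TCPOLEN_TSTAMP_ALIGNED 12	/* assumed value: 3 option words */

/* local_mss already has the timestamp bytes subtracted; peers
 * advertise MSS with them included, so adjust before comparing.
 */
int effective_mss(int local_mss, int peer_in_mss, int tstamp_ok)
{
	if (tstamp_ok)
		peer_in_mss -= TCPOLEN_TSTAMP_ALIGNED;
	return (local_mss < peer_in_mss) ? local_mss : peer_in_mss;
}
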
@@ -1722,52 +1781,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
break;
-
- case TCP_TIME_WAIT:
- /* RFC 1122:
- * "When a connection is [...] on TIME-WAIT state [...]
- * [a TCP] MAY accept a new SYN from the remote TCP to
- * reopen the connection directly, if it:
- *
- * (1) assigns its initial sequence number for the new
- * connection to be larger than the largest sequence
- * number it used on the previous connection incarnation,
- * and
- *
- * (2) returns to TIME-WAIT state if the SYN turns out
- * to be an old duplicate".
- */
- if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) {
- __u32 isn;
-
- skb_orphan(skb);
- sk->err = ECONNRESET;
- tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
-
- isn = tp->rcv_nxt + 128000;
- if (isn == 0)
- isn++;
-
- sk = tp->af_specific->get_sock(skb, th);
-
- if (sk == NULL || !ipsec_sk_policy(sk,skb))
- goto discard;
-
- skb_set_owner_r(skb, sk);
- tp = &sk->tp_pinfo.af_tcp;
-
- if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0)
- return 1;
-
- goto discard;
- }
-
- break;
}
/* Parse the tcp_options present on this header.
- * By this point we really only expect timestamps and SACKs.
+ * By this point we really only expect timestamps.
* Note that this really has to be here and not later for PAWS
* (RFC1323) to work.
*/
@@ -1819,7 +1836,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* original syn.
*/
- if (th->syn && skb->seq!=sk->syn_seq) {
+ if (th->syn && skb->seq!=tp->syn_seq) {
tcp_reset(sk, skb);
return 1;
}
@@ -1833,7 +1850,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (acceptable) {
tcp_set_state(sk, TCP_ESTABLISHED);
sk->dummy_th.dest=th->source;
- sk->copied_seq = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
if(!sk->dead)
sk->state_change(sk);
@@ -1850,7 +1867,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
break;
case TCP_FIN_WAIT1:
- if (tp->snd_una == sk->write_seq) {
+ if (tp->snd_una == tp->write_seq) {
sk->shutdown |= SEND_SHUTDOWN;
tcp_set_state(sk, TCP_FIN_WAIT2);
if (!sk->dead)
@@ -1861,12 +1878,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
break;
case TCP_CLOSING:
- if (tp->snd_una == sk->write_seq)
+ if (tp->snd_una == tp->write_seq)
tcp_time_wait(sk);
break;
case TCP_LAST_ACK:
- if (tp->snd_una == sk->write_seq) {
+ if (tp->snd_una == tp->write_seq) {
sk->shutdown = SHUTDOWN_MASK;
tcp_set_state(sk,TCP_CLOSE);
if (!sk->dead)
@@ -1874,13 +1891,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
}
break;
-
- case TCP_TIME_WAIT:
- /* Keep us in TIME_WAIT until we stop getting
- * packets, reset the timeout.
- */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
}
} else
goto discard;
@@ -1918,12 +1928,6 @@ step6:
break;
}
- /* step 8: check the FIN bit */
- if (th->fin) {
- if(tcp_fin(skb, sk, th) != 0)
- goto discard;
- }
-
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e4f8981ac..91f21ff75 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.79 1998/01/15 22:40:47 freitag Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.109 1998/03/15 07:24:15 davem Exp $
*
* IPv4 specific functions
*
@@ -60,8 +60,6 @@
#include <linux/inet.h>
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_tsack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_syncookies;
@@ -89,16 +87,19 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
*/
struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+/* Ok, let's try this, I give up, we do need a local binding
+ * TCP hash as well as the others for fast bind/connect.
+ */
+struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
+
/* All sockets in TCP_LISTEN state will be in here. This is the only table
* where wildcard'd TCP sockets can exist. Hash function here is just local
* port number.
*/
struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
-/* Ok, let's try this, I give up, we do need a local binding
- * TCP hash as well as the others for fast bind/connect.
- */
-struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
+/* Register cache. */
+struct sock *tcp_regs[TCP_NUM_REGS];
/*
* This array holds the first and last local port number.
@@ -106,6 +107,7 @@ struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
* 32768-61000
*/
int sysctl_local_port_range[2] = { 1024, 4999 };
+int tcp_port_rover = (1024 - 1);
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
@@ -123,155 +125,135 @@ static __inline__ int tcp_sk_hashfn(struct sock *sk)
return tcp_hashfn(laddr, lport, faddr, fport);
}
-static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
+/* Invariant, sk->num is non-zero. */
+void tcp_bucket_unlock(struct sock *sk)
{
- struct sock *sk2;
- int retval = 0, sk_reuse = sk->reuse;
+ struct tcp_bind_bucket *tb;
+ unsigned short snum = sk->num;
SOCKHASH_LOCK();
- sk2 = tcp_bound_hash[tcp_bhashfn(snum)];
- for(; sk2 != NULL; sk2 = sk2->bind_next) {
- if((sk2->num == snum) && (sk2 != sk)) {
- unsigned char state = sk2->state;
- int sk2_reuse = sk2->reuse;
-
- /* Two sockets can be bound to the same port if they're
- * bound to different interfaces.
- */
-
- if(sk->bound_dev_if != sk2->bound_dev_if)
- continue;
-
- if(!sk2->rcv_saddr || !sk->rcv_saddr) {
- if((!sk2_reuse) ||
- (!sk_reuse) ||
- (state == TCP_LISTEN)) {
- retval = 1;
- break;
- }
- } else if(sk2->rcv_saddr == sk->rcv_saddr) {
- if((!sk_reuse) ||
- (!sk2_reuse) ||
- (state == TCP_LISTEN)) {
- retval = 1;
- break;
- }
+ for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
+ if(tb->port == snum) {
+ if(tb->owners == NULL &&
+ (tb->flags & TCPB_FLAG_LOCKED)) {
+ tb->flags &= ~TCPB_FLAG_LOCKED;
+ tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
}
+ break;
}
}
SOCKHASH_UNLOCK();
+}
- return retval;
+struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
+{
+ struct tcp_bind_bucket *tb;
+
+ tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
+ if(tb != NULL) {
+ struct tcp_bind_bucket **head =
+ &tcp_bound_hash[tcp_bhashfn(snum)];
+ tb->port = snum;
+ tb->flags = TCPB_FLAG_LOCKED;
+ tb->owners = NULL;
+ if((tb->next = *head) != NULL)
+ tb->next->pprev = &tb->next;
+ *head = tb;
+ tb->pprev = head;
+ }
+ return tb;
}
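
tcp_bucket_create() links buckets with the pprev idiom: each node records the address of the pointer that points at it, so unlinking needs no special case for the head of the chain. The idiom in isolation, with generic names:

struct bucket { struct bucket *next, **pprev; };

void chain_insert(struct bucket **head, struct bucket *b)
{
	if ((b->next = *head) != 0)
		b->next->pprev = &b->next;
	*head = b;
	b->pprev = head;
}

void chain_unlink(struct bucket *b)
{
	if (b->next)
		b->next->pprev = b->pprev;
	*b->pprev = b->next;	/* works at the head too */
	b->pprev = 0;
}
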
-static __inline__ int tcp_lport_inuse(int num)
+static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
{
- struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)];
+ struct tcp_bind_bucket *tb;
+ int result = 0;
- for(; sk != NULL; sk = sk->bind_next) {
- if(sk->num == num)
- return 1;
+ SOCKHASH_LOCK();
+ for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ (tb && (tb->port != snum));
+ tb = tb->next)
+ ;
+ if(tb && tb->owners) {
+ /* Fast path for reuse ports, see include/net/tcp.h for a very
+ * detailed description of why this works, and why it is worth
+ * the effort at all. -DaveM
+ */
+ if((tb->flags & TCPB_FLAG_FASTREUSE) &&
+ (sk->reuse != 0)) {
+ goto go_like_smoke;
+ } else {
+ struct sock *sk2;
+ int sk_reuse = sk->reuse;
+
+ /* We must walk the whole port owner list in this case. -DaveM */
+ for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
+ if(sk->bound_dev_if == sk2->bound_dev_if) {
+ if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
+ if(!sk2->rcv_saddr ||
+ !sk->rcv_saddr ||
+ (sk2->rcv_saddr == sk->rcv_saddr))
+ break;
+ }
+ }
+ }
+ if(sk2 != NULL)
+ result = 1;
+ }
}
- return 0;
+ if((result == 0) &&
+ (tb == NULL) &&
+ (tcp_bucket_create(snum) == NULL))
+ result = 1;
+go_like_smoke:
+ SOCKHASH_UNLOCK();
+ return result;
}
-/* Find a "good" local port, this is family independent.
- * There are several strategies working in unison here to
- * get the best possible performance. The current socket
- * load is kept track of, if it is zero there is a strong
- * likelihood that there is a zero length chain we will
- * find with a small amount of searching, else the load is
- * what we shoot for when the chains all have at least
- * one entry. The base helps us walk the chains in an
- * order such that a good chain is found as quickly as possible. -DaveM
- */
unsigned short tcp_good_socknum(void)
{
- static int start = 0;
- static int binding_contour = 0;
- int best = 0;
- int size = 32767; /* a big num. */
- int retval = 0, i, end, bc;
+ struct tcp_bind_bucket *tb;
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = high - low;
+ int rover;
SOCKHASH_LOCK();
- if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0])
- start = sysctl_local_port_range[0];
- i = tcp_bhashfn(start);
- end = i + TCP_BHTABLE_SIZE;
- bc = binding_contour;
- do {
- struct sock *sk = tcp_bound_hash[i&(TCP_BHTABLE_SIZE-1)];
- if(!sk) {
- /* find the smallest value no smaller than start
- * that has this hash value.
- */
- retval = tcp_bhashnext(start-1,i&(TCP_BHTABLE_SIZE-1));
-
- /* Check for decreasing load. */
- if (bc != 0)
- binding_contour = 0;
- goto done;
- } else {
- int j = 0;
- do { sk = sk->bind_next; } while (++j < size && sk);
- if (j < size) {
- best = i&(TCP_BHTABLE_SIZE-1);
- size = j;
- if (bc && size <= bc)
- goto verify;
- }
- }
- } while(++i != end);
- i = best;
-
- /* Socket load is increasing, adjust our load average. */
- binding_contour = size;
-verify:
- if (size < binding_contour)
- binding_contour = size;
-
- retval = tcp_bhashnext(start-1,i);
-
- best = retval; /* mark the starting point to avoid infinite loops */
- while(tcp_lport_inuse(retval)) {
- retval = tcp_bhashnext(retval,i);
- if (retval > sysctl_local_port_range[1]) /* Upper bound */
- retval = tcp_bhashnext(sysctl_local_port_range[0],i);
- if (retval == best) {
- /* This hash chain is full. No answer. */
- retval = 0;
- break;
+ rover = tcp_port_rover;
+ do {
+ rover += 1;
+ if((rover < low) || (rover > high))
+ rover = low;
+ tb = tcp_bound_hash[tcp_bhashfn(rover)];
+ for( ; tb; tb = tb->next) {
+ if(tb->port == rover)
+ goto next;
}
- }
-
-done:
- start = (retval + 1);
+ break;
+ next:
+ } while(--remaining > 0);
+ tcp_port_rover = rover;
+ if((remaining <= 0) || (tcp_bucket_create(rover) == NULL))
+ rover = 0;
SOCKHASH_UNLOCK();
- return retval;
+ return rover;
}
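
The rover replaces the old load-balancing search with a stateful linear scan: remember the last port handed out, walk forward wrapping at the sysctl bounds, and take the first port that has no bind bucket at all. The same loop in miniature, with bucket_exists() standing in for the hash-chain walk:

static int rover = 1023;	/* persists across calls, like tcp_port_rover */

int pick_local_port(int low, int high, int (*bucket_exists)(int))
{
	int remaining = high - low;

	do {
		if (++rover < low || rover > high)
			rover = low;
		if (!bucket_exists(rover))
			return rover;	/* caller creates the bucket */
	} while (--remaining > 0);
	return 0;			/* local port space exhausted */
}
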
static void tcp_v4_hash(struct sock *sk)
{
- unsigned char state;
-
- SOCKHASH_LOCK();
- state = sk->state;
- if(state != TCP_CLOSE || !sk->dead) {
+ if (sk->state != TCP_CLOSE) {
struct sock **skp;
- if(state == TCP_LISTEN)
- skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- else
- skp = &tcp_established_hash[tcp_sk_hashfn(sk)];
-
+ SOCKHASH_LOCK();
+ skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
tcp_sk_bindify(sk);
+ SOCKHASH_UNLOCK();
}
- SOCKHASH_UNLOCK();
}
static void tcp_v4_unhash(struct sock *sk)
@@ -282,6 +264,7 @@ static void tcp_v4_unhash(struct sock *sk)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
+ tcp_reg_zap(sk);
tcp_sk_unbindify(sk);
}
SOCKHASH_UNLOCK();
@@ -293,30 +276,27 @@ static void tcp_v4_rehash(struct sock *sk)
SOCKHASH_LOCK();
state = sk->state;
- if(sk->pprev) {
+ if(sk->pprev != NULL) {
if(sk->next)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
- tcp_sk_unbindify(sk);
+ tcp_reg_zap(sk);
}
- if(state != TCP_CLOSE || !sk->dead) {
+ if(state != TCP_CLOSE) {
struct sock **skp;
- if(state == TCP_LISTEN) {
+ if(state == TCP_LISTEN)
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- } else {
- int hash= tcp_sk_hashfn(sk);
- if(state == TCP_TIME_WAIT)
- hash += (TCP_HTABLE_SIZE/2);
- skp = &tcp_established_hash[hash];
- }
+ else
+ skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
- tcp_sk_bindify(sk);
+ if(state == TCP_LISTEN)
+ tcp_sk_bindify(sk);
}
SOCKHASH_UNLOCK();
}
@@ -360,37 +340,64 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d
return result;
}
+/* Until this is verified... -DaveM */
+/* #define USE_QUICKSYNS */
+
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ * It is assumed that this code only gets called from within NET_BH.
*/
static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
- u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
+ u32 saddr, u16 sport,
+ u32 daddr, u16 dport, int dif)
{
unsigned short hnum = ntohs(dport);
struct sock *sk;
- int hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ int hash;
+
+#ifdef USE_QUICKSYNS
+	/* Incoming connection short-cut. */
+ if (th && th->syn == 1 && th->ack == 0)
+ goto listener_shortcut;
+#endif
+
+ /* Check TCP register quick cache first. */
+ sk = TCP_RHASH(sport);
+ if(sk &&
+ sk->daddr == saddr && /* remote address */
+ sk->dummy_th.dest == sport && /* remote port */
+ sk->num == hnum && /* local port */
+ sk->rcv_saddr == daddr && /* local address */
+ (!sk->bound_dev_if || sk->bound_dev_if == dif))
+ goto hit;
/* Optimize here for direct hit, only listening connections can
- * have wildcards anyways. It is assumed that this code only
- * gets called from within NET_BH.
+ * have wildcards anyways.
*/
- for(sk = tcp_established_hash[hash]; sk; sk = sk->next)
+ hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
if(sk->daddr == saddr && /* remote address */
sk->dummy_th.dest == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
- (!sk->bound_dev_if || sk->bound_dev_if == dif))
+ (!sk->bound_dev_if || sk->bound_dev_if == dif)) {
+ if (sk->state == TCP_ESTABLISHED)
+ TCP_RHASH(sport) = sk;
goto hit; /* You sunk my battleship! */
-
+ }
+ }
/* Must check for a TIME_WAIT'er before going to listener hash. */
- for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
+ for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) {
if(sk->daddr == saddr && /* remote address */
sk->dummy_th.dest == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
(!sk->bound_dev_if || sk->bound_dev_if == dif))
goto hit;
-
+ }
+#ifdef USE_QUICKSYNS
+listener_shortcut:
+#endif
sk = tcp_v4_lookup_listener(daddr, hnum, dif);
hit:
return sk;
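
TCP_RHASH(sport) is effectively a one-entry-per-slot cache in front of the established hash: probe it first, fall back to the chain walk, and refill it only on an ESTABLISHED hit. The shape of that lookup against a toy table (hash function and 4-tuple check simplified, not the kernel's):

#define NREGS 32
#define HSZ   256

struct conn {
	unsigned int saddr, daddr;
	unsigned short sport, dport;
	int established;
	struct conn *next;
};

struct conn *regs[NREGS];	/* the "register cache" */
struct conn *htab[HSZ];

struct conn *lookup(unsigned int saddr, unsigned short sport,
		    unsigned int daddr, unsigned short dport)
{
	struct conn *c = regs[sport % NREGS];

	if (c && c->saddr == saddr && c->sport == sport &&
	    c->daddr == daddr && c->dport == dport)
		return c;	/* cache hit, no chain walk */
	for (c = htab[(saddr ^ daddr ^ sport ^ dport) % HSZ]; c; c = c->next) {
		if (c->saddr == saddr && c->sport == sport &&
		    c->daddr == daddr && c->dport == dport) {
			if (c->established)
				regs[sport % NREGS] = c;	/* refill */
			return c;
		}
	}
	return 0;
}
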
@@ -402,20 +409,11 @@ __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
-#define secondlist(hpnum, sk, fpass) \
-({ struct sock *s1; if(!(sk) && (fpass)--) \
- s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \
- else \
- s1 = (sk); \
- s1; \
-})
-
-#define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
- secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass))
-
-#define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
- secondlist((hpnum),(sk)->bind_next,(fpass))
-
+/* Cleaned up a little and adapted to the new bind bucket scheme.
+ * Oddly, this should increase performance here for
+ * transparent proxy, as tests within the inner loop have
+ * been eliminated. -DaveM
+ */
static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
unsigned short rnum, unsigned long laddr,
struct device *dev, unsigned short pnum,
@@ -436,51 +434,60 @@ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
}
/* This code must run only from NET_BH. */
- for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
- s != NULL;
- s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
- if(s->num == hnum || s->num == hpnum) {
- int score = 0;
- if(s->dead && (s->state == TCP_CLOSE))
+ {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
+ for( ; (tb && tb->port != hnum); tb = tb->next)
+ ;
+ if(tb == NULL)
+ goto next;
+ s = tb->owners;
+ }
+pass2:
+ for(; s; s = s->bind_next) {
+ int score = 0;
+ if(s->rcv_saddr) {
+ if((s->num != hpnum || s->rcv_saddr != paddr) &&
+ (s->num != hnum || s->rcv_saddr != laddr))
continue;
- if(s->rcv_saddr) {
- if((s->num != hpnum || s->rcv_saddr != paddr) &&
- (s->num != hnum || s->rcv_saddr != laddr))
- continue;
- score++;
- }
- if(s->daddr) {
- if(s->daddr != raddr)
- continue;
- score++;
- }
- if(s->dummy_th.dest) {
- if(s->dummy_th.dest != rnum)
- continue;
- score++;
- }
- if(s->bound_dev_if) {
- if(s->bound_dev_if != dif)
- continue;
- score++;
- }
- if(score == 4 && s->num == hnum) {
- result = s;
- break;
- } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
- result = s;
- badness = score;
- }
+ score++;
+ }
+ if(s->daddr) {
+ if(s->daddr != raddr)
+ continue;
+ score++;
+ }
+ if(s->dummy_th.dest) {
+ if(s->dummy_th.dest != rnum)
+ continue;
+ score++;
+ }
+ if(s->bound_dev_if) {
+ if(s->bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if(score == 4 && s->num == hnum) {
+ result = s;
+ goto gotit;
+ } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
+ result = s;
+ badness = score;
}
}
+next:
+ if(firstpass--) {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
+ for( ; (tb && tb->port != hpnum); tb = tb->next)
+ ;
+ if(tb) {
+ s = tb->owners;
+ goto pass2;
+ }
+ }
+gotit:
return result;
}
-
-#undef secondlist
-#undef tcp_v4_proxy_loop_init
-#undef tcp_v4_proxy_loop_next
-
-#endif
+#endif /* CONFIG_IP_TRANSPARENT_PROXY */
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
@@ -495,41 +502,35 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
/*
* Check that a TCP address is unique, don't allow multiple
- * connects to/from the same address
+ * connects to/from the same address. Actually we can optimize
+ * quite a bit, since the socket about to connect is still
+ * in TCP_CLOSE; a tcp_bind_bucket for the local port it will
+ * use will exist, with a NULL owners list. So check for that.
+ * The good_socknum and verify_bind scheme we use makes this
+ * work.
*/
-static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
+static int tcp_unique_address(struct sock *sk)
{
- int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum);
- struct sock * sk;
+ struct tcp_bind_bucket *tb;
+ unsigned short snum = sk->num;
+ int retval = 1;
- /* Make sure we are allowed to connect here.
- * But freeze the hash while we snoop around.
- */
+ /* Freeze the hash while we snoop around. */
SOCKHASH_LOCK();
- sk = tcp_established_hash[hashent];
- for (; sk != NULL; sk = sk->next) {
- if(sk->daddr == daddr && /* remote address */
- sk->dummy_th.dest == dnum && /* remote port */
- sk->num == snum && /* local port */
- sk->saddr == saddr) { /* local address */
- retval = 0;
- goto out;
- }
- }
-
- /* Must check TIME_WAIT'ers too. */
- sk = tcp_established_hash[hashent + (TCP_HTABLE_SIZE/2)];
- for (; sk != NULL; sk = sk->next) {
- if(sk->daddr == daddr && /* remote address */
- sk->dummy_th.dest == dnum && /* remote port */
- sk->num == snum && /* local port */
- sk->saddr == saddr) { /* local address */
- retval = 0;
- goto out;
+ tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ for(; tb; tb = tb->next) {
+ if(tb->port == snum && tb->owners != NULL) {
+			/* Almost certainly the re-use port case; search the real hashes
+ * so it actually scales.
+ */
+ sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dummy_th.dest,
+ sk->rcv_saddr, snum, sk->bound_dev_if);
+ if((sk != NULL) && (sk->state != TCP_LISTEN))
+ retval = 0;
+ break;
}
}
-out:
SOCKHASH_UNLOCK();
return retval;
}
@@ -578,8 +579,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -ENETUNREACH;
}
- if (!tcp_unique_address(rt->rt_src, sk->num, rt->rt_dst,
- usin->sin_port)) {
+ if (!tcp_unique_address(sk)) {
ip_rt_put(rt);
return -EADDRNOTAVAIL;
}
@@ -587,7 +587,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
lock_sock(sk);
/* Do this early, so there is less state to unwind on failure. */
- buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL);
+ buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)),
+ 0, GFP_KERNEL);
if (buff == NULL) {
release_sock(sk);
ip_rt_put(rt);
@@ -605,15 +606,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->dummy_th.dest = usin->sin_port;
- sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
sk->dummy_th.source,
usin->sin_port);
-
tp->snd_wnd = 0;
tp->snd_wl1 = 0;
- tp->snd_wl2 = sk->write_seq;
- tp->snd_una = sk->write_seq;
-
+ tp->snd_wl2 = tp->write_seq;
+ tp->snd_una = tp->write_seq;
tp->rcv_nxt = 0;
sk->err = 0;
@@ -635,14 +634,22 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* No failure conditions can result past this point. */
+ /* We'll fix this up when we get a response from the other end.
+ * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+ */
+ tp->tcp_header_len = sizeof(struct tcphdr) +
+ (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
buff->h.th = th;
memcpy(th,(void *)&(sk->dummy_th), sizeof(*th));
- buff->seq = sk->write_seq++;
+ /* th->doff gets fixed up below if we tack on options. */
+
+ buff->seq = tp->write_seq++;
th->seq = htonl(buff->seq);
- tp->snd_nxt = sk->write_seq;
- buff->end_seq = sk->write_seq;
+ tp->snd_nxt = tp->write_seq;
+ buff->end_seq = tp->write_seq;
th->ack = 0;
th->syn = 1;
@@ -656,11 +663,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if(sk->mtu < 64)
sk->mtu = 64; /* Sanity limit */
- if (sk->user_mss)
- sk->mss = sk->user_mss;
- else
- sk->mss = (sk->mtu - sizeof(struct iphdr) -
- sizeof(struct tcphdr));
+ sk->mss = (sk->mtu - sizeof(struct iphdr) - tp->tcp_header_len);
+ if(sk->user_mss)
+ sk->mss = min(sk->mss, sk->user_mss);
if (sk->mss < 1) {
printk(KERN_DEBUG "intial sk->mss below 1\n");
@@ -675,9 +680,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
&tp->rcv_wscale);
th->window = htons(tp->rcv_wnd);
- tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack,
- sysctl_tcp_timestamps,
- sysctl_tcp_window_scaling,tp->rcv_wscale);
+ tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps,
+ sysctl_tcp_window_scaling, tp->rcv_wscale);
buff->csum = 0;
th->doff = (sizeof(*th)+ tmp)>>2;
@@ -686,9 +690,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tcp_set_state(sk,TCP_SYN_SENT);
/* Socket identity change complete, no longer
- * in TCP_CLOSE, so rehash.
+ * in TCP_CLOSE, so enter ourselves into the
+ * hash tables.
*/
- tcp_v4_rehash(sk);
+ tcp_v4_hash(sk);
tp->rto = rt->u.dst.rtt;
@@ -715,6 +720,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
{
+ struct tcp_opt *tp;
int retval = -EINVAL;
/* Do sanity checking for sendmsg/sendto/send. */
@@ -740,7 +746,10 @@ static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
lock_sock(sk);
retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
msg->msg_flags);
-
+ /* Push out partial tail frames if needed. */
+ tp = &(sk->tp_pinfo.af_tcp);
+ if(tp->send_head && tcp_snd_test(sk, tp->send_head))
+ tcp_write_xmit(sk);
release_sock(sk);
out:
@@ -854,7 +863,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
th = (struct tcphdr*)(dp+(iph->ihl<<2));
sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
- if (sk == NULL) {
+ if (sk == NULL || sk->state == TCP_TIME_WAIT) {
icmp_statistics.IcmpInErrors++;
return;
}
@@ -1011,7 +1020,8 @@ static void tcp_v4_send_reset(struct sk_buff *skb)
skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0);
th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr,
skb1->nh.iph->daddr, skb1->csum);
- /* FIXME: should this carry an options packet? */
+
+ /* Do not place TCP options in a reset. */
ip_queue_xmit(skb1);
tcp_statistics.TcpOutSegs++;
tcp_statistics.TcpOutRsts++;
@@ -1063,6 +1073,14 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
if (sk->user_mss)
mss = min(mss, sk->user_mss);
+ if(req->tstamp_ok)
+ mss -= TCPOLEN_TSTAMP_ALIGNED;
+ else
+ req->mss += TCPOLEN_TSTAMP_ALIGNED;
+
+ /* tcp_syn_build_options will do an skb_put() to obtain the TCP
+ * options bytes below.
+ */
skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr));
/* Don't offer more than they did.
@@ -1081,9 +1099,8 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- th->source =
#ifdef CONFIG_IP_TRANSPARENT_PROXY
- req->lcl_port; /* LVE */
+ th->source = req->lcl_port; /* LVE */
#else
th->source = sk->dummy_th.source;
#endif
@@ -1104,16 +1121,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
req->rcv_wscale = rcv_wscale;
}
th->window = htons(req->rcv_wnd);
-
- /* XXX Partial csum of 4 byte quantity is itself! -DaveM
- * Yes, but it's a bit harder to special case now. It's
- * now computed inside the tcp_v4_send_check() to clean up
- * updating the options fields in the mainline send code.
- * If someone thinks this is really bad let me know and
- * I'll try to do it a different way. -- erics
- */
-
- tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok,
+ tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok,
req->wscale_ok,req->rcv_wscale);
skb->csum = 0;
th->doff = (sizeof(*th) + tmp)>>2;
@@ -1232,14 +1240,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->rcv_isn = skb->seq;
- tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
+ tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0;
tp.in_mss = 536;
tcp_parse_options(th,&tp,want_cookie);
- if (tp.saw_tstamp)
- req->ts_recent = tp.rcv_tsval;
req->mss = tp.in_mss;
+ if (tp.saw_tstamp) {
+ req->mss -= TCPOLEN_TSTAMP_ALIGNED;
+ req->ts_recent = tp.rcv_tsval;
+ }
req->tstamp_ok = tp.tstamp_ok;
- req->sack_ok = tp.sack_ok;
req->snd_wscale = tp.snd_wscale;
req->wscale_ok = tp.wscale_ok;
req->rmt_port = th->source;
@@ -1289,6 +1298,113 @@ error:
return 0;
}
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+{
+ struct sock *newsk = sk_alloc(AF_INET, GFP_ATOMIC, 0);
+
+ if(newsk != NULL) {
+ struct tcp_opt *newtp;
+
+ memcpy(newsk, sk, sizeof(*newsk));
+ newsk->sklist_next = NULL;
+ newsk->daddr = req->af.v4_req.rmt_addr;
+ newsk->rcv_saddr = req->af.v4_req.loc_addr;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ newsk->num = ntohs(skb->h.th->dest);
+#endif
+ newsk->state = TCP_SYN_RECV;
+
+ /* Clone the TCP header template */
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ newsk->dummy_th.source = req->lcl_port;
+#endif
+ newsk->dummy_th.dest = req->rmt_port;
+ newsk->dummy_th.ack = 1;
+ newsk->dummy_th.doff = sizeof(struct tcphdr)>>2;
+
+ newsk->sock_readers = 0;
+ atomic_set(&newsk->rmem_alloc, 0);
+ skb_queue_head_init(&newsk->receive_queue);
+ atomic_set(&newsk->wmem_alloc, 0);
+ skb_queue_head_init(&newsk->write_queue);
+ newsk->saddr = req->af.v4_req.loc_addr;
+
+ newsk->done = 0;
+ newsk->proc = 0;
+ newsk->pair = NULL;
+ skb_queue_head_init(&newsk->back_log);
+ skb_queue_head_init(&newsk->error_queue);
+
+ /* Now setup tcp_opt */
+ newtp = &(newsk->tp_pinfo.af_tcp);
+ newtp->pred_flags = 0;
+ newtp->rcv_nxt = req->rcv_isn + 1;
+ newtp->snd_nxt = req->snt_isn + 1;
+ newtp->snd_una = req->snt_isn + 1;
+ newtp->srtt = 0;
+ newtp->ato = 0;
+ newtp->snd_wl1 = req->rcv_isn;
+ newtp->snd_wl2 = req->snt_isn;
+ newtp->snd_wnd = ntohs(skb->h.th->window);
+ newtp->max_window = newtp->snd_wnd;
+ newtp->pending = 0;
+ newtp->retransmits = 0;
+ newtp->last_ack_sent = req->rcv_isn + 1;
+ newtp->backoff = 0;
+ newtp->mdev = TCP_TIMEOUT_INIT;
+ newtp->snd_cwnd = 1;
+ newtp->rto = TCP_TIMEOUT_INIT;
+ newtp->packets_out = 0;
+ newtp->high_seq = 0;
+ newtp->snd_ssthresh = 0x7fffffff;
+ newtp->snd_cwnd_cnt = 0;
+ newtp->dup_acks = 0;
+ newtp->delayed_acks = 0;
+ init_timer(&newtp->retransmit_timer);
+ newtp->retransmit_timer.function = &tcp_retransmit_timer;
+ newtp->retransmit_timer.data = (unsigned long) newsk;
+ init_timer(&newtp->delack_timer);
+ newtp->delack_timer.function = &tcp_delack_timer;
+ newtp->delack_timer.data = (unsigned long) newsk;
+ skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->send_head = newtp->retrans_head = NULL;
+ newtp->rcv_wup = req->rcv_isn + 1;
+ newtp->write_seq = req->snt_isn + 1;
+ newtp->copied_seq = req->rcv_isn + 1;
+
+ newtp->saw_tstamp = 0;
+ newtp->in_mss = 536;
+
+ init_timer(&newtp->probe_timer);
+ newtp->probe_timer.function = &tcp_probe_timer;
+ newtp->probe_timer.data = (unsigned long) newsk;
+ newtp->probes_out = 0;
+ newtp->syn_seq = req->rcv_isn;
+ newtp->fin_seq = req->rcv_isn;
+ newtp->urg_data = 0;
+ tcp_synq_init(newtp);
+ newtp->syn_backlog = 0;
+
+ /* Back to base struct sock members. */
+ newsk->err = 0;
+ newsk->ack_backlog = 0;
+ newsk->max_ack_backlog = SOMAXCONN;
+ newsk->priority = 1;
+
+ /* IP layer stuff */
+ newsk->opt = req->af.v4_req.opt;
+ newsk->timeout = 0;
+ init_timer(&newsk->timer);
+ newsk->timer.function = &net_timer;
+ newsk->timer.data = (unsigned long) newsk;
+ newsk->socket = NULL;
+ }
+ return newsk;
+}
+
struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct open_request *req,
struct dst_entry *dst)
@@ -1301,98 +1417,14 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (sk->ack_backlog > sk->max_ack_backlog)
goto exit; /* head drop */
#endif
- newsk = sk_alloc(AF_INET, GFP_ATOMIC);
+ newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
#ifdef NEW_LISTEN
sk->ack_backlog++;
#endif
- memcpy(newsk, sk, sizeof(*newsk));
-
- /* Or else we die! -DaveM */
- newsk->sklist_next = NULL;
-
- newsk->opt = req->af.v4_req.opt;
- skb_queue_head_init(&newsk->write_queue);
- skb_queue_head_init(&newsk->receive_queue);
- skb_queue_head_init(&newsk->out_of_order_queue);
- skb_queue_head_init(&newsk->error_queue);
-
- /* Unused */
newtp = &(newsk->tp_pinfo.af_tcp);
- newtp->send_head = NULL;
- newtp->retrans_head = NULL;
-
- newtp->pending = 0;
-
- skb_queue_head_init(&newsk->back_log);
-
- newsk->prot->init(newsk);
-
- newtp->snd_cwnd_cnt = 0;
- newtp->backoff = 0;
- newsk->proc = 0;
- newsk->done = 0;
- newsk->pair = NULL;
- atomic_set(&newsk->wmem_alloc, 0);
- atomic_set(&newsk->rmem_alloc, 0);
- newsk->localroute = sk->localroute;
-
- newsk->err = 0;
- newsk->shutdown = 0;
- newsk->ack_backlog = 0;
-
- newtp->fin_seq = req->rcv_isn;
- newsk->syn_seq = req->rcv_isn;
- newsk->state = TCP_SYN_RECV;
- newsk->timeout = 0;
-
- newsk->write_seq = req->snt_isn;
-
- newtp->snd_wnd = ntohs(skb->h.th->window);
- newtp->max_window = newtp->snd_wnd;
- newtp->snd_wl1 = req->rcv_isn;
- newtp->snd_wl2 = newsk->write_seq;
- newtp->snd_una = newsk->write_seq++;
- newtp->snd_nxt = newsk->write_seq;
-
- newsk->urg_data = 0;
- newtp->packets_out = 0;
- newtp->retransmits = 0;
- newsk->linger=0;
- newsk->destroy = 0;
- init_timer(&newsk->timer);
- newsk->timer.data = (unsigned long) newsk;
- newsk->timer.function = &net_timer;
-
- tcp_init_xmit_timers(newsk);
-
- newsk->dummy_th.source =
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- req->lcl_port; /* LVE */
-#else
- sk->dummy_th.source;
-#endif
- newsk->dummy_th.dest = req->rmt_port;
- newsk->sock_readers=0;
-
- newtp->last_ack_sent = newtp->rcv_nxt = req->rcv_isn + 1;
- newtp->rcv_wup = req->rcv_isn + 1;
- newsk->copied_seq = req->rcv_isn + 1;
-
- newsk->socket = NULL;
-
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /*
- * Deal with possibly redirected traffic by setting num to
- * the intended destination port of the received packet.
- */
- newsk->num = ntohs(skb->h.th->dest);
-#endif
- newsk->daddr = req->af.v4_req.rmt_addr;
- newsk->saddr = req->af.v4_req.loc_addr;
- newsk->rcv_saddr = req->af.v4_req.loc_addr;
/* options / mss / route_cache */
if (dst == NULL) {
@@ -1418,7 +1450,6 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (newsk->mtu < 64)
newsk->mtu = 64;
- newtp->sack_ok = req->sack_ok;
newtp->tstamp_ok = req->tstamp_ok;
newtp->window_clamp = req->window_clamp;
newtp->rcv_wnd = req->rcv_wnd;
@@ -1433,8 +1464,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (newtp->tstamp_ok) {
newtp->ts_recent = req->ts_recent;
newtp->ts_recent_stamp = jiffies;
- newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define constant! */
- newsk->dummy_th.doff += 3;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2);
} else {
newtp->tcp_header_len = sizeof(struct tcphdr);
}
@@ -1446,13 +1477,13 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
/* Make sure our mtu is adjusted for headers. */
newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len;
- tcp_v4_hash(newsk);
+ /* Must use the af_specific ops here for the case of IPv6 mapped. */
+ newsk->prot->hash(newsk);
add_to_prot_sklist(newsk);
return newsk;
exit:
- if (dst)
- dst_release(dst);
+ dst_release(dst);
return NULL;
}
@@ -1623,6 +1654,8 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
skb->used = 0;
+ if (sk->state == TCP_TIME_WAIT)
+ goto do_time_wait;
if (!sk->sock_readers)
return tcp_v4_do_rcv(sk, skb);
@@ -1636,6 +1669,12 @@ discard_it:
/* Discard frame. */
kfree_skb(skb);
return 0;
+
+do_time_wait:
+ if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
+ skb, th, &(IPCB(skb)->opt), skb->len))
+ goto no_tcp_socket;
+ goto discard_it;
}
int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb)
@@ -1770,33 +1809,21 @@ struct tcp_func ipv4_specific = {
sizeof(struct sockaddr_in)
};
+/* NOTE: A lot of things are set to zero explicitly by the call to
+ * sk_alloc(), so they need not be done here.
+ */
static int tcp_v4_init_sock(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- skb_queue_head_init(&sk->out_of_order_queue);
+ skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
- tp->srtt = 0;
tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
tp->mdev = TCP_TIMEOUT_INIT;
-
- tp->ato = 0;
- tp->iat = (HZ/5) << 3;
-
- /* FIXME: tie this to sk->rcvbuf? (May be unnecessary) */
- /* tp->rcv_wnd = 8192; */
- tp->tstamp_ok = 0;
- tp->sack_ok = 0;
- tp->wscale_ok = 0;
tp->in_mss = 536;
- tp->snd_wscale = 0;
- tp->sacks = 0;
- tp->saw_tstamp = 0;
- tp->syn_backlog = 0;
- /*
- * See draft-stevens-tcpca-spec-01 for discussion of the
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_cwnd = 1;
@@ -1804,9 +1831,7 @@ static int tcp_v4_init_sock(struct sock *sk)
sk->priority = 1;
sk->state = TCP_CLOSE;
-
sk->max_ack_backlog = SOMAXCONN;
-
sk->mtu = 576;
sk->mss = 536;
@@ -1824,6 +1849,7 @@ static int tcp_v4_init_sock(struct sock *sk)
static int tcp_v4_destroy_sock(struct sock *sk)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
tcp_clear_xmit_timers(sk);
@@ -1836,9 +1862,17 @@ static int tcp_v4_destroy_sock(struct sock *sk)
kfree_skb(skb);
	/* Cleans up our, hopefully empty, out_of_order_queue. */
- while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL)
+ while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL)
kfree_skb(skb);
+	/* Clean up a locked TCP bind bucket; this only happens if a
+	 * port is allocated for a socket but it never fully connects.
+	 * In that case we will find num to be non-zero and daddr to
+	 * be zero.
+ */
+ if(sk->daddr == 0 && sk->num != 0)
+ tcp_bucket_unlock(sk);
+
return 0;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fbae5cfa6..d8c3c6480 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.51 1998/01/15 22:40:39 freitag Exp $
+ * Version: $Id: tcp_output.c,v 1.65 1998/03/15 12:07:03 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -34,8 +34,6 @@
#include <net/tcp.h>
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_tsack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
@@ -45,7 +43,8 @@ static __inline__ void clear_delayed_acks(struct sock * sk)
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
tp->delayed_acks = 0;
- sk->ack_backlog = 0;
+ if(tcp_in_quickack_mode(tp))
+ tp->ato = ((HZ/100)*2);
tcp_clear_xmit_timer(sk, TIME_DACK);
}
@@ -58,69 +57,26 @@ static __inline__ void update_send_head(struct sock *sk)
tp->send_head = NULL;
}
-static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int nagle_check = 1;
- int len;
-
- /* RFC 1122 - section 4.2.3.4
- *
- * We must queue if
- *
- * a) The right edge of this frame exceeds the window
- * b) There are packets in flight and we have a small segment
- * [SWS avoidance and Nagle algorithm]
- * (part of SWS is done on packetization)
- * c) We are retransmiting [Nagle]
- * d) We have too many packets 'in flight'
- *
- * Don't use the nagle rule for urgent data.
- */
- len = skb->end_seq - skb->seq;
- if (!sk->nonagle && len < (sk->mss >> 1) && tp->packets_out &&
- !skb->h.th->urg)
- nagle_check = 0;
-
- return (nagle_check && tp->packets_out < tp->snd_cwnd &&
- !after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
- tp->retransmits == 0);
-}
-
/*
* This is the main buffer sending routine. We queue the buffer
* having checked it is sane seeming.
*/
-void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
- struct tcphdr * th = skb->h.th;
+ struct tcphdr *th = skb->h.th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int size;
/* Length of packet (not counting length of pre-tcp headers). */
size = skb->len - ((unsigned char *) th - skb->data);
- /* Sanity check it.. */
- if (size < sizeof(struct tcphdr) || size > skb->len) {
- printk(KERN_DEBUG "tcp_send_skb: bad skb "
- "(skb = %p, data = %p, th = %p, len = %u)\n",
- skb, skb->data, th, skb->len);
- kfree_skb(skb);
- return;
- }
-
- /* If we have queued a header size packet.. (these crash a few
- * tcp stacks if ack is not set)
- * FIXME: What is the equivalent below when we have options?
- */
- if (size == sizeof(struct tcphdr)) {
- /* If it's got a syn or fin discard. */
- if(!th->syn && !th->fin) {
- printk(KERN_DEBUG "tcp_send_skb: attempt to queue a bogon.\n");
- kfree_skb(skb);
- return;
- }
+ /* If there is a FIN or a SYN we add it onto the size. */
+ if (th->fin || th->syn) {
+ if(th->syn)
+ size++;
+ if(th->fin)
+ size++;
}
/* Actual processing. */
@@ -129,14 +85,14 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->write_queue, skb);
- if (tp->send_head == NULL && tcp_snd_test(sk, skb)) {
+ if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
struct sk_buff * buff;
/* This is going straight out. */
tp->last_ack_sent = tp->rcv_nxt;
th->ack_seq = htonl(tp->rcv_nxt);
th->window = htons(tcp_select_window(sk));
- tcp_update_options((__u32 *)(th+1),tp);
+ tcp_update_options((__u32 *)(th + 1),tp);
tp->af_specific->send_check(sk, th, size, skb);
@@ -165,11 +121,10 @@ queue:
/* Remember where we must start sending. */
if (tp->send_head == NULL)
tp->send_head = skb;
- if (tp->packets_out == 0 && !tp->pending) {
+ if (!force_queue && tp->packets_out == 0 && !tp->pending) {
tp->pending = TIME_PROBE0;
tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
}
- return;
}
/*
@@ -214,8 +169,6 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
buff->h.th = nth;
memcpy(nth, th, tp->tcp_header_len);
- /* FIXME: Make sure this gets tcp options right. */
-
/* Correct the new header. */
buff->seq = skb->seq + len;
buff->end_seq = skb->end_seq;
@@ -281,14 +234,6 @@ static int tcp_wrxmit_frag(struct sock *sk, struct sk_buff *skb, int size)
tp->send_head = skb;
tp->packets_out--;
return -1;
- } else {
-#if 0
- /* If tcp_fragment succeded then
-		/* If tcp_fragment succeeded then
- * fragment
- */
- tp->send_head = skb->next;
-#endif
}
return 0;
}
@@ -346,9 +291,10 @@ void tcp_write_xmit(struct sock *sk)
size = skb->len - (((unsigned char*)th) - skb->data);
}
- tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt);
+ tp->last_ack_sent = tp->rcv_nxt;
+ th->ack_seq = htonl(tp->rcv_nxt);
th->window = rcv_wnd;
- tcp_update_options((__u32 *)(th+1),tp);
+ tcp_update_options((__u32 *)(th + 1),tp);
tp->af_specific->send_check(sk, th, size, skb);
@@ -437,128 +383,44 @@ void tcp_write_xmit(struct sock *sk)
* taken by headers, and the remaining space will be available for TCP data.
* This should be accounted for correctly instead.
*/
-unsigned short tcp_select_window(struct sock *sk)
+u32 __tcp_select_window(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- int mss = sk->mss;
- long free_space = sock_rspace(sk) / 2;
- long window, cur_win;
+ unsigned int mss = sk->mss;
+ unsigned int free_space;
+ u32 window, cur_win;
+ free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
if (tp->window_clamp) {
free_space = min(tp->window_clamp, free_space);
mss = min(tp->window_clamp, mss);
- }
-#ifdef NO_ANK_FIX
- /* I am tired of this message */
- else
- printk(KERN_DEBUG "Clamp failure. Water leaking.\n");
-#endif
+ } else {
+ printk("tcp_select_window: tp->window_clamp == 0.\n");
+ }
if (mss < 1) {
mss = 1;
- printk(KERN_DEBUG "tcp_select_window: mss fell to 0.\n");
+ printk("tcp_select_window: sk->mss fell to 0.\n");
}
- /* compute the actual window i.e.
- * old_window - received_bytes_on_that_win
- */
- cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
- window = tp->rcv_wnd;
-
- if (cur_win < 0) {
- cur_win = 0;
-#ifdef NO_ANK_FIX
- /* And this too. */
- printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
- tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup);
-#endif
- }
-
- if (free_space < sk->rcvbuf/4 && free_space < mss/2)
+ cur_win = tcp_receive_window(tp);
+ if (free_space < sk->rcvbuf/4 && free_space < mss/2) {
window = 0;
-
- /* Get the largest window that is a nice multiple of mss.
- * Window clamp already applied above.
- * If our current window offering is within 1 mss of the
- * free space we just keep it. This prevents the divide
- * and multiply from happening most of the time.
- * We also don't do any window rounding when the free space
- * is too small.
- */
- if (window < free_space - mss && free_space > mss)
- window = (free_space/mss)*mss;
-
- /* Never shrink the offered window */
- if (window < cur_win)
- window = cur_win;
-
- tp->rcv_wnd = window;
- tp->rcv_wup = tp->rcv_nxt;
- return window >> tp->rcv_wscale; /* RFC1323 scaling applied */
-}
-
-#if 0
-/* Old algorithm for window selection */
-unsigned short tcp_select_window(struct sock *sk)
-{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- int mss = sk->mss;
- long free_space = sock_rspace(sk);
- long window, cur_win, usable;
-
- if (tp->window_clamp) {
- free_space = min(tp->window_clamp, free_space);
- mss = min(tp->window_clamp, mss);
- }
-
- /* compute the actual window i.e.
- * old_window - received_bytes_on_that_win
- */
- cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
- window = tp->rcv_wnd;
-
- if (cur_win < 0) {
- cur_win = 0;
- printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
- tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup);
- }
-
- /* RFC 1122:
- * "the suggested [SWS] avoidance algorithm for the receiver is to keep
- * RECV.NEXT + RCV.WIN fixed until:
- * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
- *
- * i.e. don't raise the right edge of the window until you can raise
- * it at least MSS bytes.
- */
-
- usable = free_space - cur_win;
- if (usable < 0)
- usable = 0;
-
- if (window < usable) {
- /* Window is not blocking the sender
- * and we have enough free space for it
- */
- if (cur_win > (sk->mss << 1))
- goto out;
- }
-
- if (window >= usable) {
- /* We are offering too much, cut it down...
- * but don't shrink the window
- */
- window = max(usable, cur_win);
} else {
- while ((usable - window) >= mss)
- window += mss;
+ /* Get the largest window that is a nice multiple of mss.
+ * Window clamp already applied above.
+ * If our current window offering is within 1 mss of the
+ * free space we just keep it. This prevents the divide
+ * and multiply from happening most of the time.
+ * We also don't do any window rounding when the free space
+ * is too small.
+ */
+ window = tp->rcv_wnd;
+ if ((window <= (free_space - mss)) || (window > free_space))
+ window = (free_space/mss)*mss;
}
-out:
- tp->rcv_wnd = window;
- tp->rcv_wup = tp->rcv_nxt;
return window;
}
-#endif
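
The rewritten selection reduces to three cases: advertise nothing while the receiver is badly squeezed, keep the current offer when it sits within one MSS of free space, and otherwise round the offer down to a whole number of segments. Restated as a sketch (free_space is already halved and clamped by the caller, as above; the unsigned arithmetic intentionally matches the code):

unsigned int select_window(unsigned int free_space, unsigned int rcvbuf,
			   unsigned int mss, unsigned int cur_win)
{
	if (free_space < rcvbuf / 4 && free_space < mss / 2)
		return 0;			/* squeezed: offer nothing */
	if (cur_win <= free_space - mss || cur_win > free_space)
		return (free_space / mss) * mss; /* whole segments only */
	return cur_win;				/* within 1 mss: keep it */
}
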
static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb)
{
@@ -729,84 +591,123 @@ void tcp_do_retransmit(struct sock *sk, int all)
}
}
-/*
- * Send a fin.
+/* Send a fin. The caller locks the socket for us. This cannot be
+ * allowed to fail queueing a FIN frame under any circumstances.
*/
-
void tcp_send_fin(struct sock *sk)
{
- struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct tcphdr *t1;
- struct sk_buff *buff;
- int tmp;
- buff = sock_wmalloc(sk, BASE_ACK_SIZE + tp->tcp_header_len, 1, GFP_KERNEL);
- if (buff == NULL) {
- /* FIXME: This is a disaster if it occurs. */
- printk(KERN_INFO "tcp_send_fin: Impossible malloc failure");
- return;
- }
+	/* Optimization: tack on the FIN if we have a queue of
+ * unsent frames.
+ */
+ if(tp->send_head != NULL) {
+ struct sk_buff *tail = skb_peek_tail(&sk->write_queue);
+ struct tcphdr *th = tail->h.th;
+ int data_len;
+
+ /* Unfortunately tcp_write_xmit won't check for going over
+ * the MSS due to the FIN sequence number, so we have to
+ * watch out for it here.
+ */
+ data_len = (tail->tail - (((unsigned char *)th)+tp->tcp_header_len));
+ if(data_len >= sk->mss)
+ goto build_new_frame; /* ho hum... */
- /* Administrivia. */
- buff->csum = 0;
+ /* tcp_write_xmit() will checksum the header etc. for us. */
+ th->fin = 1;
+ tail->end_seq++;
+ } else {
+ struct sk_buff *buff;
+ struct tcphdr *th;
- /* Put in the IP header and routing stuff. */
- tmp = tp->af_specific->build_net_header(sk, buff);
- if (tmp < 0) {
- int t;
+build_new_frame:
+ buff = sock_wmalloc(sk,
+ (BASE_ACK_SIZE + tp->tcp_header_len +
+ sizeof(struct sk_buff)),
+ 1, GFP_KERNEL);
+ if (buff == NULL) {
+ /* We can only fail due to low memory situations, not
+ * due to going over our sndbuf limits (due to the
+ * force flag passed to sock_wmalloc). So just keep
+		 * trying. We cannot allow this to fail. The socket is
+ * still locked, so we need not check if the connection
+ * was reset in the meantime etc.
+ */
+ goto build_new_frame;
+ }
- /* FIXME: We must not throw this out. Eventually we must
- * put a FIN into the queue, otherwise it never gets queued.
- */
- kfree_skb(buff);
- sk->write_seq++;
- t = del_timer(&sk->timer);
- if (t)
- add_timer(&sk->timer);
- else
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- return;
- }
-
- /* We ought to check if the end of the queue is a buffer and
- * if so simply add the fin to that buffer, not send it ahead.
- */
- t1 =(struct tcphdr *)skb_put(buff,tp->tcp_header_len);
- buff->h.th = t1;
- tcp_build_options((__u32 *)(t1+1),tp);
-
- memcpy(t1, th, sizeof(*t1));
- buff->seq = sk->write_seq;
- sk->write_seq++;
- buff->end_seq = sk->write_seq;
- t1->seq = htonl(buff->seq);
- t1->ack_seq = htonl(tp->rcv_nxt);
- t1->window = htons(tcp_select_window(sk));
- t1->fin = 1;
-
- tp->af_specific->send_check(sk, t1, tp->tcp_header_len, buff);
-
- /* The fin can only be transmited after the data. */
- skb_queue_tail(&sk->write_queue, buff);
- if (tp->send_head == NULL) {
- /* FIXME: BUG! we need to check if the fin fits into the window
- * here. If not we need to do window probing (sick, but true)
+ /* Administrivia. */
+ buff->csum = 0;
+
+ /* Put in the IP header and routing stuff.
+ *
+ * FIXME:
+ * We can fail if the interface for the route
+ * this socket takes goes down right before
+ * we get here. ANK is there a way to point
+ * this into a "black hole" route in such a
+ * case? Ideally, we should still be able to
+ * queue this and let the retransmit timer
+ * keep trying until the destination becomes
+ * reachable once more. -DaveM
*/
- struct sk_buff *skb1;
+ if(tp->af_specific->build_net_header(sk, buff) < 0) {
+ kfree_skb(buff);
+ goto update_write_seq;
+ }
+ th = (struct tcphdr *) skb_put(buff, tp->tcp_header_len);
+ buff->h.th = th;
- tp->packets_out++;
- tp->snd_nxt = sk->write_seq;
- buff->when = jiffies;
+ memcpy(th, (void *) &(sk->dummy_th), sizeof(*th));
+ th->seq = htonl(tp->write_seq);
+ th->fin = 1;
+ tcp_build_options((__u32 *)(th + 1), tp);
- skb1 = skb_clone(buff, GFP_KERNEL);
- if (skb1) {
- skb_set_owner_w(skb1, sk);
- tp->af_specific->queue_xmit(skb1);
- }
+ /* This makes sure we do things like abide by the congestion
+ * window and other constraints which prevent us from sending.
+ */
+ tcp_send_skb(sk, buff, 0);
+ }
+update_write_seq:
+ /* So that we recognize the ACK coming back for
+ * this FIN as being legitimate.
+ */
+ tp->write_seq++;
+}
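
A sketch of the piggyback decision tcp_send_fin() now makes, assuming a
simplified frame structure; struct frame and try_piggyback_fin() are
hypothetical, and the real code derives the payload length from the skb
tail pointer instead of storing it.

    #include <stddef.h>

    struct frame {
            int payload_len;
            int fin;
            unsigned long end_seq;
    };

    /* Tack the FIN onto the last unsent frame while it still fits under
     * the MSS; otherwise tell the caller to build a FIN-only frame.
     * Returns 1 when the FIN was piggybacked, 0 when a new frame is
     * needed.
     */
    static int try_piggyback_fin(struct frame *tail, int mss)
    {
            if (tail == NULL || tail->payload_len >= mss)
                    return 0;           /* build_new_frame path */
            tail->fin = 1;
            tail->end_seq++;            /* FIN takes one sequence number */
            return 1;
    }
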
- if (!tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue. This behavior is recommended
+ * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+ struct tcphdr *th;
+
+again:
+ /* NOTE: No TCP options attached and we never retransmit this. */
+ skb = sock_wmalloc(sk, (BASE_ACK_SIZE + sizeof(*th)), 1, GFP_KERNEL);
+ if(skb == NULL)
+ goto again;
+ skb->csum = 0;
+ if(tp->af_specific->build_net_header(sk, skb) < 0) {
+ kfree_skb(skb);
+ } else {
+ th = (struct tcphdr *) skb_put(skb, sizeof(*th));
+ memcpy(th, &(sk->dummy_th), sizeof(*th));
+ th->seq = htonl(tp->write_seq);
+ th->rst = 1;
+ th->doff = sizeof(*th) / 4;
+ tp->last_ack_sent = tp->rcv_nxt;
+ th->ack_seq = htonl(tp->rcv_nxt);
+ th->window = htons(tcp_select_window(sk));
+ tp->af_specific->send_check(sk, th, sizeof(*th), skb);
+ tp->af_specific->queue_xmit(skb);
+ tcp_statistics.TcpOutSegs++;
+ tcp_statistics.TcpOutRsts++;
}
}
@@ -814,6 +715,9 @@ void tcp_send_fin(struct sock *sk)
* a SYN packet that crossed the incoming SYN that caused this routine
* to get called. If this assumption fails then the initial rcv_wnd
* and rcv_wscale values will not be correct.
+ *
+ * XXX When you have time Dave, redo this to use tcp_send_skb() just
+ * XXX like tcp_send_fin() above now does.... -DaveM
*/
int tcp_send_synack(struct sock *sk)
{
@@ -823,7 +727,7 @@ int tcp_send_synack(struct sock *sk)
struct tcphdr *th;
int tmp;
- skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
+ skb = sock_wmalloc(sk, MAX_SYN_SIZE + sizeof(struct sk_buff), 1, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
@@ -855,8 +759,7 @@ int tcp_send_synack(struct sock *sk)
tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt);
tmp = tcp_syn_build_options(skb, sk->mss,
- tp->sack_ok, tp->tstamp_ok,
- tp->wscale_ok,tp->rcv_wscale);
+ tp->tstamp_ok, tp->wscale_ok, tp->rcv_wscale);
skb->csum = 0;
th->doff = (sizeof(*th) + tmp)>>2;
@@ -880,31 +783,24 @@ int tcp_send_synack(struct sock *sk)
}
/*
- * Set up the timers for sending a delayed ack..
- *
- * rules for delaying an ack:
- * - delay time <= 0.5 HZ
- * - must send at least every 2 full sized packets
- * - we don't have a window update to send
+ * Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
+ * for details.
*/
-void tcp_send_delayed_ack(struct sock * sk, int max_timeout)
+void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- unsigned long timeout, now;
+ unsigned long timeout;
- /* Calculate new timeout. */
- now = jiffies;
+ /* Stay within the limit we were given */
timeout = tp->ato;
-
- if (timeout > max_timeout ||
- ((tp->rcv_nxt - tp->rcv_wup) > (sk->mss << 2)))
- timeout = now;
- else
- timeout += now;
+ if (timeout > max_timeout)
+ timeout = max_timeout;
+ timeout += jiffies;
/* Use new timeout only if there wasn't an older one earlier. */
- if (!del_timer(&tp->delack_timer) || timeout < tp->delack_timer.expires)
+ if ((!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) ||
+ (timeout < tp->delack_timer.expires))
tp->delack_timer.expires = timeout;
add_timer(&tp->delack_timer);
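
The arming rule reduces to simple arithmetic; a sketch assuming plain
variables in place of the kernel's jiffies and timer fields, with
arm_delack() as a hypothetical helper.

    #include <stdio.h>

    /* Clamp the adaptive timeout (ato) to the caller's limit, convert
     * it to an absolute expiry, and only ever move a pending timer
     * earlier, never later.
     */
    static unsigned long arm_delack(unsigned long ato,
                                    unsigned long max_timeout,
                                    unsigned long jiffies,
                                    unsigned long pending, int timer_pending)
    {
            unsigned long timeout = ato;

            if (timeout > max_timeout)
                    timeout = max_timeout;
            timeout += jiffies;
            if (timer_pending && pending <= timeout)
                    return pending;     /* keep the earlier expiry */
            return timeout;
    }

    int main(void)
    {
            printf("%lu\n", arm_delack(50, 25, 1000, 0, 0)); /* 1025 */
            return 0;
    }
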
@@ -928,8 +824,6 @@ void tcp_send_ack(struct sock *sk)
/* We need to grab some memory, and put together an ack,
* and then put it into the queue to be sent.
- * FIXME: is it better to waste memory here and use a
- * constant sized ACK?
*/
buff = sock_wmalloc(sk, BASE_ACK_SIZE + tp->tcp_header_len, 1, GFP_ATOMIC);
if (buff == NULL) {
@@ -938,7 +832,7 @@ void tcp_send_ack(struct sock *sk)
* bandwidth on slow links to send a spare ack than
* resend packets.
*/
- tcp_send_delayed_ack(sk, HZ/2);
+ tcp_send_delayed_ack(tp, HZ/2);
return;
}
@@ -956,22 +850,16 @@ void tcp_send_ack(struct sock *sk)
th = (struct tcphdr *)skb_put(buff,tp->tcp_header_len);
memcpy(th, &sk->dummy_th, sizeof(struct tcphdr));
- tcp_build_options((__u32 *)(th+1),tp);
/* Swap the send and the receive. */
th->window = ntohs(tcp_select_window(sk));
th->seq = ntohl(tp->snd_nxt);
tp->last_ack_sent = tp->rcv_nxt;
th->ack_seq = htonl(tp->rcv_nxt);
+ tcp_build_and_update_options((__u32 *)(th + 1), tp);
/* Fill in the packet and send it. */
tp->af_specific->send_check(sk, th, tp->tcp_header_len, buff);
-
-#if 0
- SOCK_DEBUG(sk, "\rtcp_send_ack: seq %x ack %x\n",
- tp->snd_nxt, tp->rcv_nxt);
-#endif
-
tp->af_specific->queue_xmit(buff);
tcp_statistics.TcpOutSegs++;
}
@@ -1017,6 +905,7 @@ void tcp_write_wakeup(struct sock *sk)
}
th = skb->h.th;
+ tcp_update_options((__u32 *)(th + 1), tp);
tp->af_specific->send_check(sk, th, th->doff * 4 + win_size, skb);
buff = skb_clone(skb, GFP_ATOMIC);
if (buff == NULL)
@@ -1047,25 +936,19 @@ void tcp_write_wakeup(struct sock *sk)
return;
}
- t1 = (struct tcphdr *) skb_put(buff, sizeof(struct tcphdr));
+ t1 = (struct tcphdr *) skb_put(buff, tp->tcp_header_len);
memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
- /* FIXME: should zero window probes have SACK and/or TIMESTAMP data?
- * If so we have to tack them on here.
- */
/* Use a previous sequence.
* This should cause the other end to send an ack.
*/
t1->seq = htonl(tp->snd_nxt-1);
-/* t1->fin = 0; -- We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
t1->ack_seq = htonl(tp->rcv_nxt);
t1->window = htons(tcp_select_window(sk));
+ tcp_build_and_update_options((__u32 *)(t1 + 1), tp);
- /* Value from dummy_th may be larger. */
- t1->doff = sizeof(struct tcphdr)/4;
-
- tp->af_specific->send_check(sk, t1, sizeof(*t1), buff);
+ tp->af_specific->send_check(sk, t1, tp->tcp_header_len, buff);
}
/* Send it. */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 76ccedab2..fdf8f50ec 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_timer.c,v 1.5 1998/03/03 01:23:44 ralf Exp $
+ * Version: $Id: tcp_timer.c,v 1.6 1998/03/17 22:18:35 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -31,6 +31,7 @@ int sysctl_tcp_retries2 = TCP_RETR2;
static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
static void tcp_keepalive(unsigned long data);
+static void tcp_bucketgc(unsigned long);
struct timer_list tcp_slow_timer = {
NULL, NULL,
@@ -41,7 +42,8 @@ struct timer_list tcp_slow_timer = {
struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */
- {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive} /* KEEPALIVE */
+ {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */
+ {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc} /* BUCKETGC */
};
const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
@@ -87,20 +89,24 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
* The delayed ack timer can be set if we are changing the
* retransmit timer when removing acked frames.
*/
- del_timer(&tp->probe_timer);
- del_timer(&tp->retransmit_timer);
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
+ if(tp->retransmit_timer.prev)
+ del_timer(&tp->retransmit_timer);
tp->retransmit_timer.expires=jiffies+when;
add_timer(&tp->retransmit_timer);
break;
case TIME_DACK:
- del_timer(&tp->delack_timer);
+ if(tp->delack_timer.prev)
+ del_timer(&tp->delack_timer);
tp->delack_timer.expires=jiffies+when;
add_timer(&tp->delack_timer);
break;
case TIME_PROBE0:
- del_timer(&tp->probe_timer);
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
tp->probe_timer.expires=jiffies+when;
add_timer(&tp->probe_timer);
break;
@@ -118,9 +124,12 @@ void tcp_clear_xmit_timers(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- del_timer(&tp->retransmit_timer);
- del_timer(&tp->delack_timer);
- del_timer(&tp->probe_timer);
+ if(tp->retransmit_timer.prev)
+ del_timer(&tp->retransmit_timer);
+ if(tp->delack_timer.prev)
+ del_timer(&tp->delack_timer);
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
}
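
The repeated "if(timer.prev) del_timer(...)" guard above relies on a
detached timer having a NULL .prev link in this kernel generation; a
minimal model of the idiom, with model_del_timer() standing in for the
kernel's del_timer().

    #include <stddef.h>

    struct timer_list {
            struct timer_list *next, *prev;
            unsigned long expires;
    };

    static void model_del_timer(struct timer_list *t)
    {
            if (t->next)
                    t->next->prev = t->prev;
            if (t->prev)
                    t->prev->next = t->next;
            t->next = t->prev = NULL;
    }

    /* Only touch the list when the timer is actually pending. */
    static void maybe_del_timer(struct timer_list *t)
    {
            if (t->prev)
                    model_del_timer(t);
    }
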
static int tcp_write_err(struct sock *sk, int force)
@@ -131,9 +140,8 @@ static int tcp_write_err(struct sock *sk, int force)
tcp_clear_xmit_timers(sk);
/* Time wait the socket. */
- if (!force && (1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
- tcp_set_state(sk,TCP_TIME_WAIT);
- tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
+ tcp_time_wait(sk);
} else {
/* Clean up time. */
tcp_set_state(sk, TCP_CLOSE);
@@ -173,9 +181,8 @@ static int tcp_write_timeout(struct sock *sk)
return 1;
}
-
-void tcp_delack_timer(unsigned long data) {
-
+void tcp_delack_timer(unsigned long data)
+{
struct sock *sk = (struct sock*)data;
if(sk->zapped)
@@ -185,8 +192,8 @@ void tcp_delack_timer(unsigned long data) {
tcp_read_wakeup(sk);
}
-void tcp_probe_timer(unsigned long data) {
-
+void tcp_probe_timer(unsigned long data)
+{
struct sock *sk = (struct sock*)data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
@@ -212,10 +219,9 @@ void tcp_probe_timer(unsigned long data) {
sk->err = ETIMEDOUT;
sk->error_report(sk);
- /* Time wait the socket. */
if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
- tcp_set_state(sk, TCP_TIME_WAIT);
- tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ /* Time wait the socket. */
+ tcp_time_wait(sk);
} else {
/* Clean up time. */
tcp_set_state(sk, TCP_CLOSE);
@@ -252,6 +258,35 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk)
return res;
}
+/* Garbage collect TCP bind buckets. */
+static void tcp_bucketgc(unsigned long __unused)
+{
+ int i;
+
+ for(i = 0; i < TCP_BHTABLE_SIZE; i++) {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[i];
+
+ while(tb) {
+ struct tcp_bind_bucket *next = tb->next;
+
+ if((tb->owners == NULL) &&
+ !(tb->flags & TCPB_FLAG_LOCKED)) {
+ /* Eat timer reference. */
+ tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+
+ /* Unlink bucket. */
+ if(tb->next)
+ tb->next->pprev = tb->pprev;
+ *tb->pprev = tb->next;
+
+ /* Finally, free it up. */
+ kmem_cache_free(tcp_bucket_cachep, tb);
+ }
+ tb = next;
+ }
+ }
+}
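
The unlink in tcp_bucketgc() uses the next/pprev convention, where pprev
points at whichever "next" slot currently references the node; a generic
sketch, with struct node as a hypothetical stand-in for tcp_bind_bucket.

    #include <stddef.h>

    struct node {
            struct node *next;
            struct node **pprev;    /* points at the slot referencing us */
    };

    /* No head special case: *pprev is either the list head or the
     * previous node's next pointer.
     */
    static void unlink_node(struct node *n)
    {
            if (n->next)
                    n->next->pprev = n->pprev;
            *n->pprev = n->next;
            n->next = NULL;
            n->pprev = NULL;
    }
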
+
/*
* Check all sockets for keepalive timer
* Called every 75 seconds
diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c
index fe02b3f4c..79ae3309e 100644
--- a/net/ipv4/timer.c
+++ b/net/ipv4/timer.c
@@ -5,7 +5,7 @@
*
* TIMER - implementation of software timers for IP.
*
- * Version: $Id: timer.c,v 1.7 1997/09/17 18:50:26 freitag Exp $
+ * Version: $Id: timer.c,v 1.2 1997/12/16 05:37:48 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -52,76 +52,52 @@
void net_delete_timer (struct sock *t)
{
- unsigned long flags;
-
- save_flags (flags);
- cli();
-
+ if(t->timer.prev)
+ del_timer (&t->timer);
t->timeout = 0;
- del_timer (&t->timer);
-
- restore_flags (flags);
}
void net_reset_timer (struct sock *t, int timeout, unsigned long len)
{
net_delete_timer (t);
t->timeout = timeout;
-#if 1
- /* FIXME: ??? */
- if ((int) len < 0) /* prevent close to infinite timers. THEY _DO_ */
- len = 3; /* happen (negative values ?) - don't ask me why ! -FB */
-#endif
t->timer.expires = jiffies+len;
add_timer (&t->timer);
}
-
-/*
- * Now we will only be called whenever we need to do
- * something, but we must be sure to process all of the
- * sockets that need it.
+/* Now we will only be called whenever we need to do
+ * something, but we must be sure to process all of the
+ * sockets that need it.
*/
-
void net_timer (unsigned long data)
{
struct sock *sk = (struct sock*)data;
int why = sk->timeout;
- /*
- * only process if socket is not in use
- */
-
- if (sk->sock_readers)
- {
+ /* Only process if socket is not in use. */
+ if (sk->sock_readers) {
sk->timer.expires = jiffies+HZ;
add_timer(&sk->timer);
- sti();
return;
}
/* Always see if we need to send an ack. */
-
- if (sk->ack_backlog && !sk->zapped)
- {
+ if (sk->tp_pinfo.af_tcp.delayed_acks && !sk->zapped) {
sk->prot->read_wakeup (sk);
- if (! sk->dead)
- sk->data_ready(sk,0);
+ if (!sk->dead)
+ sk->data_ready(sk,0);
}
/* Now we need to figure out why the socket was on the timer. */
-
- switch (why)
- {
+ switch (why) {
case TIME_DONE:
- /* If the socket hasn't been closed off, re-try a bit later */
+ /* If the socket hasn't been closed off, re-try a bit later. */
if (!sk->dead) {
net_reset_timer(sk, TIME_DONE, TCP_DONE_TIME);
break;
}
- if (sk->state != TCP_CLOSE)
- {
+ if (sk->state != TCP_CLOSE) {
printk (KERN_DEBUG "non CLOSE socket in time_done\n");
break;
}
@@ -129,11 +105,9 @@ void net_timer (unsigned long data)
break;
case TIME_DESTROY:
- /*
- * We've waited for a while for all the memory associated with
- * the socket to be freed.
- */
-
+ /* We've waited for a while for all the memory associated with
+ * the socket to be freed.
+ */
destroy_sock(sk);
break;
@@ -148,7 +122,8 @@ void net_timer (unsigned long data)
break;
default:
- printk (KERN_DEBUG "net_timer: timer expired - reason %d is unknown\n", why);
+ /* I want to see these... */
+ printk ("net_timer: timer expired - reason %d is unknown\n", why);
break;
}
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f355caa85..6ba50b280 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -5,7 +5,7 @@
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.3 1998/03/03 01:23:44 ralf Exp $
+ * Version: $Id: udp.c,v 1.4 1998/03/17 22:18:36 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -828,7 +828,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
* of this packet since that is all
* that will be read.
*/
- amount = skb->tail - skb->h.raw;
+ amount = skb->len - sizeof(struct udphdr);
}
return put_user(amount, (int *)arg);
}
@@ -1033,17 +1033,18 @@ static inline void udp_deliver(struct sock *sk, struct sk_buff *skb)
/*
* Multicasts and broadcasts go to each listener.
+ *
+ * Note: called only from the BH handler context,
+ * so we don't need to lock the hashes.
*/
static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
u32 saddr, u32 daddr)
{
struct sock *sk;
- int given = 0;
- SOCKHASH_LOCK();
sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr);
- if(sk) {
+ if (sk) {
struct sock *sknext = NULL;
do {
@@ -1058,10 +1059,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
udp_deliver(sk, skb1);
sk = sknext;
} while(sknext);
- given = 1;
- }
- SOCKHASH_UNLOCK();
- if(!given)
+ } else
kfree_skb(skb);
return 0;
}
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c4faba4b7..4a4060601 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: addrconf.c,v 1.32 1997/12/27 20:41:18 kuznet Exp $
+ * $Id: addrconf.c,v 1.37 1998/03/08 20:52:46 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -1753,6 +1753,8 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf
t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0);
if (t->sysctl_header == NULL)
kfree(t);
+ else
+ p->sysctl = t;
}
static void addrconf_sysctl_unregister(struct ipv6_devconf *p)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b0a0eb702..bc5ba892a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -7,7 +7,7 @@
*
* Adapted from linux/net/ipv4/af_inet.c
*
- * $Id: af_inet6.c,v 1.24 1997/12/13 21:53:08 kuznet Exp $
+ * $Id: af_inet6.c,v 1.28 1998/03/08 05:56:49 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -71,7 +71,7 @@ static int inet6_create(struct socket *sock, int protocol)
struct sock *sk;
struct proto *prot;
- sk = sk_alloc(AF_INET6, GFP_KERNEL);
+ sk = sk_alloc(AF_INET6, GFP_KERNEL, 1);
if (sk == NULL)
goto do_oom;
@@ -139,8 +139,7 @@ static int inet6_create(struct socket *sock, int protocol)
* creation time automatically shares.
*/
sk->dummy_th.source = ntohs(sk->num);
- if(sk->prot->hash)
- sk->prot->hash(sk);
+ sk->prot->hash(sk);
add_to_prot_sklist(sk);
}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 6b7508666..af29057ec 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: exthdrs.c,v 1.4 1997/03/18 18:24:29 davem Exp $
+ * $Id: exthdrs.c,v 1.5 1998/02/12 07:43:39 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index b84dc9268..96867403b 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: icmp.c,v 1.12 1997/12/13 21:53:10 kuznet Exp $
+ * $Id: icmp.c,v 1.13 1998/02/12 07:43:41 davem Exp $
*
* Based on net/ipv4/icmp.c
*
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 15ce420ac..9fce1acca 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_fib.c,v 1.10 1997/12/13 21:53:10 kuznet Exp $
+ * $Id: ip6_fib.c,v 1.11 1998/03/08 05:56:50 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c
index 7316a30f1..3c3a0cfc5 100644
--- a/net/ipv6/ip6_fw.c
+++ b/net/ipv6/ip6_fw.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_fw.c,v 1.8 1997/12/13 21:53:11 kuznet Exp $
+ * $Id: ip6_fw.c,v 1.9 1998/02/12 07:43:42 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index ead32047a..71ad7e1a0 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -6,7 +6,7 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Ian P. Morris <I.P.Morris@soton.ac.uk>
*
- * $Id: ip6_input.c,v 1.7 1997/09/20 20:48:27 davem Exp $
+ * $Id: ip6_input.c,v 1.8 1998/02/12 07:43:43 davem Exp $
*
* Based in linux/net/ipv4/ip_input.c
*
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 67b81d041..13029e175 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_output.c,v 1.7 1997/12/29 19:52:46 kuznet Exp $
+ * $Id: ip6_output.c,v 1.9 1998/03/08 05:56:50 davem Exp $
*
* Based on linux/net/ipv4/ip_output.c
*
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f2ef3fd76..c6714eea3 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -7,7 +7,7 @@
*
* Based on linux/net/ipv4/ip_sockglue.c
*
- * $Id: ipv6_sockglue.c,v 1.16 1997/12/13 21:53:13 kuznet Exp $
+ * $Id: ipv6_sockglue.c,v 1.17 1998/03/08 05:56:51 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 3fb0680bc..ce37117a3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -132,7 +132,7 @@ struct neigh_table nd_tbl =
pndisc_destructor,
pndisc_redo,
{ NULL, NULL, &nd_tbl, 0, NULL, NULL,
- 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 0, 64 },
+ 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 0 },
30*HZ, 128, 512, 1024,
};
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index b9b811e35..b87d4696b 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -7,7 +7,7 @@
* PROC file system. This is very similar to the IPv4 version,
* except it reports the sockets in the INET6 address family.
*
- * Version: $Id: proc.c,v 1.4 1997/04/20 22:50:44 schenk Exp $
+ * Version: $Id: proc.c,v 1.6 1998/03/13 08:02:19 davem Exp $
*
* Authors: David S. Miller (davem@caip.rutgers.edu)
*
@@ -21,6 +21,7 @@
#include <linux/net.h>
#include <linux/in6.h>
#include <net/sock.h>
+#include <net/tcp.h>
#include <net/transp_v6.h>
/* This is the main implementation workhorse of all these routines. */
@@ -52,21 +53,35 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta
SOCKHASH_LOCK();
sp = pro->sklist_next;
while(sp != (struct sock *)pro) {
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp;
+ int tw_bucket = 0;
+
pos += 149;
if(pos < offset)
goto next;
tp = &(sp->tp_pinfo.af_tcp);
- dest = &sp->net_pinfo.af_inet6.daddr;
- src = &sp->net_pinfo.af_inet6.rcv_saddr;
+ if((format == 0) && (sp->state == TCP_TIME_WAIT)) {
+ tw_bucket = 1;
+ dest = &tw->v6_daddr;
+ src = &tw->v6_rcv_saddr;
+ } else {
+ dest = &sp->net_pinfo.af_inet6.daddr;
+ src = &sp->net_pinfo.af_inet6.rcv_saddr;
+ }
destp = ntohs(sp->dummy_th.dest);
srcp = ntohs(sp->dummy_th.source);
-
- timer_active1 = del_timer(&tp->retransmit_timer);
- timer_active2 = del_timer(&sp->timer);
- if(!timer_active1) tp->retransmit_timer.expires = 0;
- if(!timer_active2) sp->timer.expires = 0;
- timer_active = 0;
- timer_expires = (unsigned) -1;
+ if((format == 0) && (sp->state == TCP_TIME_WAIT)) {
+ timer_active1 = timer_active2 = 0;
+ timer_active = 3;
+ timer_expires = tw->timer.expires;
+ } else {
+ timer_active1 = del_timer(&tp->retransmit_timer);
+ timer_active2 = del_timer(&sp->timer);
+ if(!timer_active1) tp->retransmit_timer.expires = 0;
+ if(!timer_active2) sp->timer.expires = 0;
+ timer_active = 0;
+ timer_expires = (unsigned) -1;
+ }
if(timer_active1 && tp->retransmit_timer.expires < timer_expires) {
timer_active = timer_active1;
timer_expires = tp->retransmit_timer.expires;
@@ -75,6 +90,8 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta
timer_active = timer_active2;
timer_expires = sp->timer.expires;
}
+ if(timer_active == 0)
+ timer_expires = jiffies;
sprintf(tmpbuf, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
i,
@@ -83,13 +100,23 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
sp->state,
- format==0?sp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc),
- format==0?tp->rcv_nxt-sp->copied_seq:atomic_read(&sp->rmem_alloc),
+ (tw_bucket ?
+ 0 :
+ (format == 0) ?
+ tp->write_seq-tp->snd_una :
+ atomic_read(&sp->wmem_alloc)),
+ (tw_bucket ?
+ 0 :
+ (format == 0) ?
+ tp->rcv_nxt-tp->copied_seq :
+ atomic_read(&sp->rmem_alloc)),
timer_active, timer_expires-jiffies,
- tp->retransmits,
- sp->socket ? sp->socket->inode->i_uid:0,
- timer_active?sp->timeout:0,
- sp->socket ? sp->socket->inode->i_ino:0);
+ (tw_bucket ? 0 : tp->retransmits),
+ ((!tw_bucket && sp->socket) ?
+ sp->socket->inode->i_uid : 0),
+ (!tw_bucket && timer_active) ? sp->timeout : 0,
+ ((!tw_bucket && sp->socket) ?
+ sp->socket->inode->i_ino : 0));
if(timer_active1) add_timer(&tp->retransmit_timer);
if(timer_active2) add_timer(&sp->timer);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4ee1b13ad..5b182b7ef 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -7,7 +7,7 @@
*
* Adapted from linux/net/ipv4/raw.c
*
- * $Id: raw.c,v 1.16 1997/12/29 19:52:48 kuznet Exp $
+ * $Id: raw.c,v 1.18 1998/03/08 05:56:54 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index aa027da14..55fecc676 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: reassembly.c,v 1.8 1997/12/29 19:52:50 kuznet Exp $
+ * $Id: reassembly.c,v 1.9 1998/02/12 07:43:48 davem Exp $
*
* Based on: net/ipv4/ip_fragment.c
*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 28ee43e78..5188de864 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: route.c,v 1.19 1997/12/13 21:53:16 kuznet Exp $
+ * $Id: route.c,v 1.25 1998/03/15 03:31:47 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -85,18 +85,18 @@ struct dst_ops ip6_dst_ops = {
};
struct rt6_info ip6_null_entry = {
- {{NULL, ATOMIC_INIT(0), ATOMIC_INIT(0), NULL,
+ {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL,
-1, 0, 0, 0, 0, 0, 0, 0, 0,
-ENETUNREACH, NULL, NULL,
ip6_pkt_discard, ip6_pkt_discard, &ip6_dst_ops}},
NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U,
- 0, 255, {NULL}, {{{{0}}}, 128}, {{{{0}}}, 128}
+ 255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
};
struct fib6_node ip6_routing_table = {
NULL, NULL, NULL, NULL,
&ip6_null_entry,
- 0, RTN_ROOT|RTN_TL_ROOT, 0
+ 0, RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO, 0
};
#ifdef CONFIG_RT6_POLICY
@@ -709,14 +709,14 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
if (rt == NULL) {
RDBG(("dalloc fails, "));
*err = -ENOMEM;
- goto out;
+ return NULL;
}
rt->u.dst.obsolete = -1;
rt->rt6i_expires = rtmsg->rtmsg_info;
addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
-
+
if (addr_type & IPV6_ADDR_MULTICAST) {
RDBG(("MCAST, "));
rt->u.dst.input = ip6_mc_input;
@@ -743,6 +743,21 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen);
+ /* We cannot add true routes via loopback here;
+ they would result in kernel looping. Promote them to reject routes.
+ */
+ if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
+ (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
+ dev = dev_get("lo");
+ rt->u.dst.output = ip6_pkt_discard;
+ rt->u.dst.input = ip6_pkt_discard;
+ rt->u.dst.error = -ENETUNREACH;
+ rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
+ rt->rt6i_metric = rtmsg->rtmsg_metric;
+ rt->rt6i_dev = dev;
+ goto install_route;
+ }
+
if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
struct in6_addr *gw_addr;
int gwa_type;
@@ -773,7 +788,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
}
dev = grt->rt6i_dev;
}
- if (dev == NULL) {
+ if (dev == NULL || (dev->flags&IFF_LOOPBACK)) {
*err = -EINVAL;
goto out;
}
@@ -805,6 +820,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
rt->rt6i_hoplimit = ipv6_get_hoplimit(dev);
rt->rt6i_flags = rtmsg->rtmsg_flags;
+install_route:
RDBG(("rt6ins(%p) ", rt));
rt6_lock();
@@ -1421,6 +1437,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg)
int ip6_pkt_discard(struct sk_buff *skb)
{
ipv6_statistics.Ip6OutNoRoutes++;
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
kfree_skb(skb);
return 0;
}
@@ -1671,7 +1688,8 @@ static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
rtmsg->rtmsg_dst_len = r->rtm_dst_len;
rtmsg->rtmsg_src_len = r->rtm_src_len;
rtmsg->rtmsg_flags = RTF_UP;
- rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
+ if (r->rtm_type == RTN_UNREACHABLE)
+ rtmsg->rtmsg_flags |= RTF_REJECT;
if (rta[RTA_GATEWAY-1]) {
if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
@@ -1754,7 +1772,12 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
rtm->rtm_src_len = rt->rt6i_src.plen;
rtm->rtm_tos = 0;
rtm->rtm_table = RT_TABLE_MAIN;
- rtm->rtm_type = RTN_UNICAST;
+ if (rt->rt6i_flags&RTF_REJECT)
+ rtm->rtm_type = RTN_UNREACHABLE;
+ else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
+ rtm->rtm_type = RTN_LOCAL;
+ else
+ rtm->rtm_type = RTN_UNICAST;
rtm->rtm_flags = 0;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
#ifdef CONFIG_RTNL_OLD_IFINFO
@@ -1795,6 +1818,8 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
if (rt->u.dst.rtt)
RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
mx->rta_len = skb->tail - (u8*)mx;
+ if (mx->rta_len == RTA_LENGTH(0))
+ skb_trim(skb, (u8*)mx - skb->data);
#endif
if (rt->u.dst.neighbour)
RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index f029942df..577b85d0f 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -6,7 +6,7 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * $Id: sit.c,v 1.24 1997/12/13 21:53:17 kuznet Exp $
+ * $Id: sit.c,v 1.27 1998/03/08 05:56:57 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f7a080a0d..1d082c195 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: tcp_ipv6.c,v 1.44 1997/12/13 21:53:18 kuznet Exp $
+ * $Id: tcp_ipv6.c,v 1.60 1998/03/15 02:59:32 davem Exp $
*
* Based on:
* linux/net/ipv4/tcp.c
@@ -44,7 +44,6 @@
#define ICMP_PARANOIA
-extern int sysctl_tcp_sack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
@@ -86,62 +85,69 @@ static __inline__ int tcp_v6_sk_hashfn(struct sock *sk)
/* Grrr, addr_type already calculated by caller, but I don't want
* to add some silly "cookie" argument to this method just for that.
+ * But it doesn't matter; the recalculation happens only on the
+ * rarest path this function ever takes.
*/
static int tcp_v6_verify_bind(struct sock *sk, unsigned short snum)
{
- struct sock *sk2;
- int addr_type = ipv6_addr_type(&sk->net_pinfo.af_inet6.rcv_saddr);
- int retval = 0, sk_reuse = sk->reuse;
+ struct tcp_bind_bucket *tb;
+ int result = 0;
SOCKHASH_LOCK();
- sk2 = tcp_bound_hash[tcp_sk_bhashfn(sk)];
- for(; sk2 != NULL; sk2 = sk2->bind_next) {
- if((sk2->num == snum) && (sk2 != sk)) {
- unsigned char state = sk2->state;
- int sk2_reuse = sk2->reuse;
- if(addr_type == IPV6_ADDR_ANY || (!sk2->rcv_saddr)) {
- if((!sk2_reuse) ||
- (!sk_reuse) ||
- (state == TCP_LISTEN)) {
- retval = 1;
- break;
- }
- } else if(!ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr,
- &sk2->net_pinfo.af_inet6.rcv_saddr)) {
- if((!sk_reuse) ||
- (!sk2_reuse) ||
- (state == TCP_LISTEN)) {
- retval = 1;
- break;
+ for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ (tb && (tb->port != snum));
+ tb = tb->next)
+ ;
+ if(tb && tb->owners) {
+ /* Fast path for reuse ports, see include/net/tcp.h for a very
+ * detailed description of why this works, and why it is worth
+ * the effort at all. -DaveM
+ */
+ if((tb->flags & TCPB_FLAG_FASTREUSE) &&
+ (sk->reuse != 0)) {
+ goto go_like_smoke;
+ } else {
+ struct sock *sk2;
+ int sk_reuse = sk->reuse;
+ int addr_type = ipv6_addr_type(&sk->net_pinfo.af_inet6.rcv_saddr);
+
+ /* We must walk the whole port owner list in this case. -DaveM */
+ for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
+ if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
+ if(addr_type == IPV6_ADDR_ANY ||
+ !sk2->rcv_saddr ||
+ !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr,
+ &sk2->net_pinfo.af_inet6.rcv_saddr))
+ break;
}
}
+ if(sk2 != NULL)
+ result = 1;
}
}
+ if((result == 0) &&
+ (tb == NULL) &&
+ (tcp_bucket_create(snum) == NULL))
+ result = 1;
+go_like_smoke:
SOCKHASH_UNLOCK();
-
- return retval;
+ return result;
}
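
The conflict walk condenses to a small predicate; a sketch that collapses
the wildcard/equal address comparison into one flag, with bind_conflict()
and struct owner as hypothetical names.

    #include <stddef.h>

    struct owner {
            struct owner *bind_next;
            int reuse;              /* SO_REUSEADDR set */
            int listening;
            int addr_overlaps;      /* wildcard or equal local address */
    };

    /* A new binder conflicts with an existing owner unless both set
     * SO_REUSEADDR and the owner is not listening.
     */
    static int bind_conflict(const struct owner *owners, int new_reuse)
    {
            const struct owner *sk2;

            for (sk2 = owners; sk2; sk2 = sk2->bind_next)
                    if ((!new_reuse || !sk2->reuse || sk2->listening) &&
                        sk2->addr_overlaps)
                            return 1;
            return 0;
    }
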
static void tcp_v6_hash(struct sock *sk)
{
- unsigned char state;
-
- SOCKHASH_LOCK();
- state = sk->state;
- if(state != TCP_CLOSE) {
+ if(sk->state != TCP_CLOSE) {
struct sock **skp;
- if(state == TCP_LISTEN)
- skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- else
- skp = &tcp_established_hash[tcp_v6_sk_hashfn(sk)];
+ SOCKHASH_LOCK();
+ skp = &tcp_established_hash[(sk->hashent = tcp_v6_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
tcp_sk_bindify(sk);
+ SOCKHASH_UNLOCK();
}
- SOCKHASH_UNLOCK();
}
static void tcp_v6_unhash(struct sock *sk)
@@ -153,6 +159,7 @@ static void tcp_v6_unhash(struct sock *sk)
*sk->pprev = sk->next;
sk->pprev = NULL;
tcp_sk_unbindify(sk);
+ tcp_reg_zap(sk);
}
SOCKHASH_UNLOCK();
}
@@ -163,29 +170,27 @@ static void tcp_v6_rehash(struct sock *sk)
SOCKHASH_LOCK();
state = sk->state;
- if(sk->pprev) {
+ if(sk->pprev != NULL) {
if(sk->next)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
- tcp_sk_unbindify(sk);
+ tcp_reg_zap(sk);
}
if(state != TCP_CLOSE) {
struct sock **skp;
- if(state == TCP_LISTEN) {
+ if(state == TCP_LISTEN)
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- } else {
- int hash = tcp_v6_sk_hashfn(sk);
- if(state == TCP_TIME_WAIT)
- hash += (TCP_HTABLE_SIZE/2);
- skp = &tcp_established_hash[hash];
- }
+ else
+ skp = &tcp_established_hash[(sk->hashent = tcp_v6_sk_hashfn(sk))];
+
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
- tcp_sk_bindify(sk);
+ if(state == TCP_LISTEN)
+ tcp_sk_bindify(sk);
}
SOCKHASH_UNLOCK();
}
@@ -209,8 +214,12 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor
return result;
}
+/* Until this is verified... -DaveM */
+/* #define USE_QUICKSYNS */
+
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ * It is assumed that this code only gets called from within NET_BH.
*/
static inline struct sock *__tcp_v6_lookup(struct tcphdr *th,
struct in6_addr *saddr, u16 sport,
@@ -218,30 +227,53 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th,
{
unsigned short hnum = ntohs(dport);
struct sock *sk;
- int hash = tcp_v6_hashfn(daddr, hnum, saddr, sport);
+ int hash;
+
+#ifdef USE_QUICKSYNS
+ /* Incoming connection short-cut. */
+ if (th && th->syn == 1 && th->ack == 0)
+ goto listener_shortcut;
+#endif
+
+ /* Check TCP register quick cache first. */
+ sk = TCP_RHASH(sport);
+ if(sk &&
+ sk->num == hnum && /* local port */
+ sk->family == AF_INET6 && /* address family */
+ sk->dummy_th.dest == sport && /* remote port */
+ !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) &&
+ !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr))
+ goto hit;
/* Optimize here for direct hit, only listening connections can
- * have wildcards anyways. It is assumed that this code only
- * gets called from within NET_BH.
+ * have wildcards anyways.
*/
- for(sk = tcp_established_hash[hash]; sk; sk = sk->next)
+ hash = tcp_v6_hashfn(daddr, hnum, saddr, sport);
+ for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
/* For IPV6 do the cheaper port and family tests first. */
if(sk->num == hnum && /* local port */
sk->family == AF_INET6 && /* address family */
sk->dummy_th.dest == sport && /* remote port */
!ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) &&
- !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr))
+ !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) {
+ if (sk->state == TCP_ESTABLISHED)
+ TCP_RHASH(sport) = sk;
goto hit; /* You sunk my battleship! */
-
+ }
+ }
/* Must check for a TIME_WAIT'er before going to listener hash. */
for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
if(sk->num == hnum && /* local port */
sk->family == AF_INET6 && /* address family */
- sk->dummy_th.dest == sport && /* remote port */
- !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) &&
- !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr))
- goto hit;
-
+ sk->dummy_th.dest == sport) { /* remote port */
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+ if(!ipv6_addr_cmp(&tw->v6_daddr, saddr) &&
+ !ipv6_addr_cmp(&tw->v6_rcv_saddr, daddr))
+ goto hit;
+ }
+#ifdef USE_QUICKSYNS
+listener_shortcut:
+#endif
sk = tcp_v6_lookup_listener(daddr, hnum);
hit:
return sk;
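
The probe order is the point of the fast path; a sketch of the cascade
with stubbed probes, all of them hypothetical stand-ins for the actual
hash-table searches.

    #include <stddef.h>

    struct sock { int unused; };

    static struct sock *probe_quick_cache(void) { return NULL; }
    static struct sock *probe_established(void) { return NULL; }
    static struct sock *probe_time_wait(void)   { return NULL; }
    static struct sock *probe_listener(void)    { return NULL; }

    /* Cheapest first: the one-entry register cache, then the
     * established hash, then the TIME_WAIT half of the same table, and
     * only then the listener hash, which allows wildcards.
     */
    static struct sock *lookup_cascade(void)
    {
            struct sock *sk;

            if ((sk = probe_quick_cache()) != NULL)
                    return sk;
            if ((sk = probe_established()) != NULL)
                    return sk;
            if ((sk = probe_time_wait()) != NULL)
                    return sk;
            return probe_listener();
    }

    int main(void)
    {
            return lookup_cascade() != NULL;
    }
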
@@ -275,6 +307,33 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
skb->h.th->source);
}
+static int tcp_v6_unique_address(struct sock *sk)
+{
+ struct tcp_bind_bucket *tb;
+ unsigned short snum = sk->num;
+ int retval = 1;
+
+ /* Freeze the hash while we snoop around. */
+ SOCKHASH_LOCK();
+ tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ for(; tb; tb = tb->next) {
+ if(tb->port == snum && tb->owners != NULL) {
+ /* Almost certainly the re-use port case; search the real hashes
+ * so it actually scales. (We hope that all ipv6 ftp servers will
+ * use passive ftp; I just cover this case for completeness.)
+ */
+ sk = __tcp_v6_lookup(NULL, &sk->net_pinfo.af_inet6.daddr,
+ sk->dummy_th.dest,
+ &sk->net_pinfo.af_inet6.rcv_saddr, snum);
+ if((sk != NULL) && (sk->state != TCP_LISTEN))
+ retval = 0;
+ break;
+ }
+ }
+ SOCKHASH_UNLOCK();
+ return retval;
+}
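
A sketch of the uniqueness rule, reducing the four-tuple comparison to a
flag per candidate; struct cand and tuple_is_unique() are hypothetical.

    #include <stddef.h>

    struct cand {
            struct cand *next;
            int same_tuple;     /* saddr/sport/daddr/dport all match */
            int listening;
    };

    /* The tuple is usable unless a non-listening socket already owns
     * exactly the same four-tuple.
     */
    static int tuple_is_unique(const struct cand *hash_chain)
    {
            const struct cand *c;

            for (c = hash_chain; c; c = c->next)
                    if (c->same_tuple && !c->listening)
                            return 0;
            return 1;
    }
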
+
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
@@ -390,7 +449,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
ipv6_addr_copy(&np->saddr, saddr);
}
- /* FIXME: Need to do tcp_v6_unique_address() here! -DaveM */
+ sk->dummy_th.dest = usin->sin6_port;
+ if (!tcp_v6_unique_address(sk))
+ return -EADDRNOTAVAIL;
/*
* Init variables
@@ -398,16 +459,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
lock_sock(sk);
- sk->dummy_th.dest = usin->sin6_port;
- sk->write_seq = secure_tcp_sequence_number(np->saddr.s6_addr32[3],
+ tp->write_seq = secure_tcp_sequence_number(np->saddr.s6_addr32[3],
np->daddr.s6_addr32[3],
sk->dummy_th.source,
sk->dummy_th.dest);
tp->snd_wnd = 0;
tp->snd_wl1 = 0;
- tp->snd_wl2 = sk->write_seq;
- tp->snd_una = sk->write_seq;
+ tp->snd_wl2 = tp->write_seq;
+ tp->snd_una = tp->write_seq;
tp->rcv_nxt = 0;
@@ -415,30 +475,35 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
release_sock(sk);
- buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL);
-
- if (buff == NULL)
+ buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)),
+ 0, GFP_KERNEL);
+ if (buff == NULL) {
+ /* FIXME: Free route references etc??? */
return(-ENOMEM);
+ }
lock_sock(sk);
tcp_v6_build_header(sk, buff);
+ tp->tcp_header_len = sizeof(struct tcphdr) +
+ (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
/* build the tcp header */
th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
buff->h.th = th;
memcpy(th, (void *) &(sk->dummy_th), sizeof(*th));
- buff->seq = sk->write_seq++;
+ buff->seq = tp->write_seq++;
th->seq = htonl(buff->seq);
- tp->snd_nxt = sk->write_seq;
- buff->end_seq = sk->write_seq;
+ tp->snd_nxt = tp->write_seq;
+ buff->end_seq = tp->write_seq;
th->ack = 0;
th->syn = 1;
sk->mtu = dst->pmtu;
- sk->mss = sk->mtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr);
+ sk->mss = (sk->mtu - sizeof(struct ipv6hdr) - tp->tcp_header_len);
if (sk->mss < 1) {
printk(KERN_DEBUG "intial ipv6 sk->mss below 1\n");
@@ -457,8 +522,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
* Put in the TCP options to say MTU.
*/
- tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack,
- sysctl_tcp_timestamps,
+ tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps,
sysctl_tcp_window_scaling,tp->rcv_wscale);
th->doff = sizeof(*th)/4 + (tmp>>2);
buff->csum = 0;
@@ -467,9 +531,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tcp_set_state(sk, TCP_SYN_SENT);
/* Socket identity change complete, no longer
- * in TCP_CLOSE, so rehash.
+ * in TCP_CLOSE, so enter ourselves into the
+ * hash tables.
*/
- sk->prot->rehash(sk);
+ sk->prot->hash(sk);
/* FIXME: should use dcache->rtt if available */
tp->rto = TCP_TIMEOUT_INIT;
@@ -482,12 +547,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tp->packets_out++;
buff->when = jiffies;
skb1 = skb_clone(buff, GFP_KERNEL);
- skb_set_owner_w(skb1, sk);
-
- tcp_v6_xmit(skb1);
+ if(skb1 != NULL) {
+ skb_set_owner_w(skb1, sk);
+ tcp_v6_xmit(skb1);
+ }
/* Timer for repeating the SYN until an answer */
-
tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
tcp_statistics.TcpActiveOpens++;
tcp_statistics.TcpOutSegs++;
@@ -499,6 +564,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
{
+ struct tcp_opt *tp;
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
int retval = -EINVAL;
@@ -530,7 +596,10 @@ static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
lock_sock(sk);
retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
msg->msg_flags);
-
+ /* Push out partial tail frames if needed. */
+ tp = &(sk->tp_pinfo.af_tcp);
+ if(tp->send_head && tcp_snd_test(sk, tp->send_head))
+ tcp_write_xmit(sk);
release_sock(sk);
out:
@@ -555,7 +624,7 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info,
sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source);
- if (sk == NULL) {
+ if (sk == NULL || sk->state == TCP_TIME_WAIT) {
/* XXX: Update ICMP error count */
return;
}
@@ -596,11 +665,14 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info,
ip6_dst_store(sk, dst);
}
- if (sk->dst_cache->error)
+ if (sk->dst_cache->error) {
sk->err_soft = sk->dst_cache->error;
- else
+ } else {
+ /* FIXME: Reset sk->mss, taking into account TCP option
+ * bytes for timestamps. -DaveM
+ */
sk->mtu = sk->dst_cache->pmtu;
-
+ }
if (sk->sock_readers) { /* remove later */
printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n");
return;
@@ -713,11 +785,10 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
* match what happens under IPV4. Figure out the right thing to do.
*/
req->mss = min(sk->mss, req->mss);
-
- if (req->mss < 1) {
- printk(KERN_DEBUG "initial req->mss below 1\n");
- req->mss = 1;
- }
+ if(sk->user_mss)
+ req->mss = min(req->mss, sk->user_mss);
+ if(req->tstamp_ok == 0)
+ req->mss += TCPOLEN_TSTAMP_ALIGNED;
if (req->rcv_wnd == 0) {
__u8 rcv_wscale;
@@ -732,7 +803,7 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
}
th->window = htons(req->rcv_wnd);
- tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok,
+ tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok,
req->wscale_ok,req->rcv_wscale);
skb->csum = 0;
th->doff = (sizeof(*th) + tmp)>>2;
@@ -740,9 +811,13 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
&req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr,
csum_partial((char *)th, sizeof(*th)+tmp, skb->csum));
+ /* Actually we should not attach dst to a socket in state LISTEN;
+ it results in a stale destination per listen socket and
+ overflow of the routing cache.
+ (IPv4 has the same flaw with more unpleasant consequences.)
+ */
ip6_dst_store(sk, dst);
ip6_xmit(sk, skb, &fl, req->af.v6_req.opt);
- dst_release(dst);
tcp_statistics.TcpOutSegs++;
}
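
The option-byte accounting above reads best as arithmetic: once
timestamps are negotiated, every segment carries TCPOLEN_TSTAMP_ALIGNED
(12) option bytes, and they come out of the usable MSS. A minimal
sketch; effective_mss() is a hypothetical helper.

    #include <stdio.h>

    #define TCPOLEN_TSTAMP_ALIGNED 12  /* aligned timestamp option bytes */

    static int effective_mss(int mss, int tstamp_ok)
    {
            return tstamp_ok ? mss - TCPOLEN_TSTAMP_ALIGNED : mss;
    }

    int main(void)
    {
            printf("%d\n", effective_mss(1460, 1));     /* 1448 */
            return 0;
    }
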
@@ -801,14 +876,15 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
req->rcv_isn = skb->seq;
req->snt_isn = isn;
- tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
+ tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0;
tp.in_mss = 536;
tcp_parse_options(skb->h.th,&tp,0);
- if (tp.saw_tstamp)
- req->ts_recent = tp.rcv_tsval;
req->mss = tp.in_mss;
+ if (tp.saw_tstamp) {
+ req->mss -= TCPOLEN_TSTAMP_ALIGNED;
+ req->ts_recent = tp.rcv_tsval;
+ }
req->tstamp_ok = tp.tstamp_ok;
- req->sack_ok = tp.sack_ok;
req->snd_wscale = tp.snd_wscale;
req->wscale_ok = tp.wscale_ok;
req->rmt_port = skb->h.th->source;
@@ -879,92 +955,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
return newsk;
}
- newsk = sk_alloc(AF_INET6, GFP_ATOMIC);
+ newsk = tcp_create_openreq_child(sk, req, skb);
if (newsk == NULL) {
- if (dst)
- dst_release(dst);
+ dst_release(dst);
return NULL;
}
- memcpy(newsk, sk, sizeof(*newsk));
-
- /* Or else we die! -DaveM */
- newsk->sklist_next = NULL;
-
- newsk->opt = NULL;
newsk->dst_cache = NULL;
- skb_queue_head_init(&newsk->write_queue);
- skb_queue_head_init(&newsk->receive_queue);
- skb_queue_head_init(&newsk->out_of_order_queue);
- skb_queue_head_init(&newsk->error_queue);
-
- /*
- * Unused
- */
newtp = &(newsk->tp_pinfo.af_tcp);
- np = &newsk->net_pinfo.af_inet6;
-
- newtp->send_head = NULL;
- newtp->retrans_head = NULL;
-
- newtp->pending = 0;
-
- skb_queue_head_init(&newsk->back_log);
-
- newsk->prot->init(newsk);
-
- newtp->snd_cwnd_cnt = 0;
-#if 0 /* Don't mess up the initialization we did in the init routine! */
- newtp->snd_ssthresh = 0;
-#endif
- newtp->backoff = 0;
- newsk->proc = 0;
- newsk->done = 0;
- newsk->pair = NULL;
- atomic_set(&newsk->wmem_alloc, 0);
- atomic_set(&newsk->rmem_alloc, 0);
- newsk->localroute = sk->localroute;
-
- newsk->err = 0;
- newsk->shutdown = 0;
- newsk->ack_backlog = 0;
-
- newtp->fin_seq = req->rcv_isn;
- newsk->syn_seq = req->rcv_isn;
- newsk->state = TCP_SYN_RECV;
- newsk->timeout = 0;
-
- newsk->write_seq = req->snt_isn;
-
- newtp->snd_wnd = ntohs(skb->h.th->window);
- newtp->max_window = newtp->snd_wnd;
- newtp->snd_wl1 = req->rcv_isn;
- newtp->snd_wl2 = newsk->write_seq;
- newtp->snd_una = newsk->write_seq++;
- newtp->snd_nxt = newsk->write_seq;
-
- newsk->urg_data = 0;
- newtp->packets_out = 0;
- newtp->retransmits = 0;
- newsk->linger=0;
- newsk->destroy = 0;
- init_timer(&newsk->timer);
- newsk->timer.data = (unsigned long) newsk;
- newsk->timer.function = &net_timer;
-
- tcp_init_xmit_timers(newsk);
-
- newsk->dummy_th.source = sk->dummy_th.source;
- newsk->dummy_th.dest = req->rmt_port;
- newsk->sock_readers=0;
-
- newtp->rcv_nxt = req->rcv_isn + 1;
- newtp->rcv_wup = req->rcv_isn + 1;
- newsk->copied_seq = req->rcv_isn + 1;
-
- newsk->socket = NULL;
+ np = &newsk->net_pinfo.af_inet6;
ipv6_addr_copy(&np->daddr, &req->af.v6_req.rmt_addr);
ipv6_addr_copy(&np->saddr, &req->af.v6_req.loc_addr);
ipv6_addr_copy(&np->rcv_saddr, &req->af.v6_req.loc_addr);
@@ -987,14 +988,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
ip6_dst_store(newsk, dst);
- newtp->sack_ok = req->sack_ok;
newtp->tstamp_ok = req->tstamp_ok;
- newtp->snd_wscale = req->snd_wscale;
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_wnd = req->rcv_wnd;
newtp->wscale_ok = req->wscale_ok;
- newtp->ts_recent = req->ts_recent;
+ if (newtp->wscale_ok) {
+ newtp->snd_wscale = req->snd_wscale;
+ newtp->rcv_wscale = req->rcv_wscale;
+ } else {
+ newtp->snd_wscale = newtp->rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp,65535);
+ }
if (newtp->tstamp_ok) {
- newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define the contant. */
- newsk->dummy_th.doff += 3;
+ newtp->ts_recent = req->ts_recent;
+ newtp->ts_recent_stamp = jiffies;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2);
} else {
newtp->tcp_header_len = sizeof(struct tcphdr);
}
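
A sketch of the inheritance rule for the child socket, assuming a
reduced struct winconf in place of tcp_opt; without negotiated scaling
the shift counts stay zero and the clamp cannot exceed the 16-bit
window field.

    struct winconf {
            int snd_wscale, rcv_wscale;
            unsigned int window_clamp;
    };

    static void inherit_wscale(struct winconf *w, int wscale_ok,
                               int snd_wscale, int rcv_wscale)
    {
            if (wscale_ok) {
                    w->snd_wscale = snd_wscale;
                    w->rcv_wscale = rcv_wscale;
            } else {
                    w->snd_wscale = w->rcv_wscale = 0;
                    if (w->window_clamp > 65535)
                            w->window_clamp = 65535;
            }
    }
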
@@ -1006,7 +1015,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->mss = min(req->mss+sizeof(struct tcphdr)-newtp->tcp_header_len,
(newsk->mtu - sizeof(struct ipv6hdr) - newtp->tcp_header_len));
- /* XXX tp->window_clamp??? -DaveM */
newsk->daddr = LOOPBACK4_IPV6;
newsk->saddr = LOOPBACK4_IPV6;
@@ -1181,12 +1189,14 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev,
goto no_tcp_socket;
}
- skb->sk = sk;
skb->seq = ntohl(th->seq);
skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
skb->ack_seq = ntohl(th->ack_seq);
-
skb->used = 0;
+ if(sk->state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
+ skb->sk = sk;
}
/*
@@ -1249,6 +1259,12 @@ discard_it:
kfree_skb(skb);
return 0;
+
+do_time_wait:
+ if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
+ skb, th, &(IPCB(skb)->opt), skb->len))
+ goto no_tcp_socket;
+ goto discard_it;
}
static int tcp_v6_rebuild_header(struct sock *sk, struct sk_buff *skb)
@@ -1384,51 +1400,34 @@ static struct tcp_func ipv6_mapped = {
sizeof(struct sockaddr_in6)
};
+/* NOTE: A lot of things are set to zero explicitly by the call to
+ * sk_alloc(), so they need not be done here.
+ */
static int tcp_v6_init_sock(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- skb_queue_head_init(&sk->out_of_order_queue);
+ skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
- tp->srtt = 0;
tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
tp->mdev = TCP_TIMEOUT_INIT;
-
- tp->ato = 0;
- tp->iat = (HZ/5) << 3;
-
- /* FIXME: right thing? */
- tp->rcv_wnd = 0;
tp->in_mss = 536;
- /* tp->rcv_wnd = 8192; */
- tp->tstamp_ok = 0;
- tp->sack_ok = 0;
- tp->wscale_ok = 0;
- tp->snd_wscale = 0;
- tp->sacks = 0;
- tp->saw_tstamp = 0;
- tp->syn_backlog = 0;
-
- /* start with only sending one packet at a time. */
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
tp->snd_cwnd = 1;
tp->snd_ssthresh = 0x7fffffff;
-
-
sk->priority = 1;
sk->state = TCP_CLOSE;
-
sk->max_ack_backlog = SOMAXCONN;
-
sk->mtu = 576;
sk->mss = 536;
-
sk->dummy_th.doff = sizeof(sk->dummy_th)/4;
- /*
- * Speed up by setting some standard state for the dummy_th.
- */
+ /* Speed up by setting some standard state for the dummy_th. */
sk->dummy_th.ack=1;
sk->dummy_th.doff=sizeof(struct tcphdr)>>2;
@@ -1442,6 +1441,7 @@ static int tcp_v6_init_sock(struct sock *sk)
static int tcp_v6_destroy_sock(struct sock *sk)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
tcp_clear_xmit_timers(sk);
@@ -1460,15 +1460,22 @@ static int tcp_v6_destroy_sock(struct sock *sk)
* Cleans up our, hopefully empty, out_of_order_queue
*/
- while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL)
+ while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL)
kfree_skb(skb);
/*
* Release destination entry
*/
- dst_release(sk->dst_cache);
- sk->dst_cache = NULL;
+ dst_release(xchg(&sk->dst_cache,NULL));
+
+ /* Clean up a locked TCP bind bucket; this only happens if a
+ * port is allocated for a socket but it never fully connects,
+ * in which case num will be non-zero and daddr will be zero.
+ */
+ if(ipv6_addr_any(&(sk->net_pinfo.af_inet6.daddr)) && sk->num != 0)
+ tcp_bucket_unlock(sk);
return 0;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b99dc19e3..40e9b0233 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -7,7 +7,7 @@
*
* Based on linux/ipv4/udp.c
*
- * $Id: udp.c,v 1.21 1997/12/29 19:52:52 kuznet Exp $
+ * $Id: udp.c,v 1.24 1998/03/12 03:20:21 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -448,32 +448,43 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
return NULL;
}
+/*
+ * Note: called only from the BH handler context,
+ * so we don't need to lock the hashes.
+ */
static void udpv6_mcast_deliver(struct udphdr *uh,
struct in6_addr *saddr, struct in6_addr *daddr,
struct sk_buff *skb)
{
struct sock *sk, *sk2;
+ struct sk_buff *buff;
- SOCKHASH_LOCK();
sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr);
- if(sk) {
- sk2 = sk;
- while((sk2 = udp_v6_mcast_next(sk2->next,
- uh->dest, saddr,
- uh->source, daddr))) {
- struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
- if (buff && sock_queue_rcv_skb(sk2, buff) < 0) {
- buff->sk = NULL;
- kfree_skb(buff);
- }
+ if (!sk)
+ goto free_skb;
+
+ buff = NULL;
+ sk2 = sk;
+ while((sk2 = udp_v6_mcast_next(sk2->next, uh->dest, saddr,
+ uh->source, daddr))) {
+ if (!buff) {
+ buff = skb_clone(skb, GFP_ATOMIC);
+ if (!buff)
+ continue;
}
+ if (sock_queue_rcv_skb(sk2, buff) >= 0)
+ buff = NULL;
+ }
+ if (buff) {
+ buff->sk = NULL;
+ kfree_skb(buff);
}
- if(!sk || sock_queue_rcv_skb(sk, skb) < 0) {
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ free_skb:
skb->sk = NULL;
kfree_skb(skb);
}
- SOCKHASH_UNLOCK();
}
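
A userspace model of the lazy-clone loop above; struct packet,
clone_packet() and deliver() are hypothetical stand-ins for the skb,
skb_clone() and sock_queue_rcv_skb(), and the packet is assumed
heap-allocated.

    #include <stdlib.h>
    #include <string.h>

    struct packet { char payload[64]; };

    static struct packet *clone_packet(const struct packet *p)
    {
            struct packet *c = malloc(sizeof(*c));

            if (c)
                    memcpy(c, p, sizeof(*c));
            return c;
    }

    /* Returns 0 when the receiver took ownership, -1 when it refused. */
    static int deliver(int rcv, struct packet *p)
    {
            (void)rcv;
            (void)p;
            return 0;
    }

    /* Clone at most once per consumed copy: a clone rejected by one
     * receiver is offered to the next, and the original goes to the
     * first matching receiver at the end.
     */
    static void mcast_deliver(struct packet *skb, int first,
                              const int *rest, int n)
    {
            struct packet *buff = NULL;
            int i;

            for (i = 0; i < n; i++) {
                    if (buff == NULL) {
                            buff = clone_packet(skb);
                            if (buff == NULL)
                                    continue;  /* this receiver misses out */
                    }
                    if (deliver(rest[i], buff) == 0)
                            buff = NULL;       /* consumed, clone afresh */
            }
            free(buff);                        /* leftover rejected clone */
            if (deliver(first, skb) != 0)
                    free(skb);
    }
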
int udpv6_rcv(struct sk_buff *skb, struct device *dev,
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index cf56df492..904fa1174 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1713,7 +1713,7 @@ static int ipx_getsockopt(struct socket *sock, int level, int optname,
static int ipx_create(struct socket *sock, int protocol)
{
struct sock *sk;
- sk=sk_alloc(AF_IPX, GFP_KERNEL);
+ sk=sk_alloc(AF_IPX, GFP_KERNEL, 1);
if(sk==NULL)
return(-ENOMEM);
switch(sock->type)
diff --git a/net/netbeui/af_netbeui.c b/net/netbeui/af_netbeui.c
index 85bd8f4d1..6769edde5 100644
--- a/net/netbeui/af_netbeui.c
+++ b/net/netbeui/af_netbeui.c
@@ -150,7 +150,7 @@ static int netbeui_listen(struct socket *sock, int backlog)
static int netbeui_create(struct socket *sock, int protocol)
{
netbeui_socket *sk;
- sk=(netbeui_socket *)sk_alloc(GFP_KERNEL);
+ sk=(netbeui_socket *)sk_alloc(GFP_KERNEL, 1);
if(sk==NULL)
return(-ENOBUFS);
switch(sock->type)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 3f02f4c3c..8b8e5a4b8 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -147,7 +147,7 @@ static int netlink_create(struct socket *sock, int protocol)
sock->ops = &netlink_ops;
- sk = sk_alloc(AF_NETLINK, GFP_KERNEL);
+ sk = sk_alloc(AF_NETLINK, GFP_KERNEL, 1);
if (!sk)
return -ENOMEM;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index a84d1fd53..9d8a206da 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -98,7 +98,7 @@ static struct sock *nr_alloc_sock(void)
struct sock *sk;
nr_cb *nr;
- if ((sk = sk_alloc(AF_NETROM, GFP_ATOMIC)) == NULL)
+ if ((sk = sk_alloc(AF_NETROM, GFP_ATOMIC, 1)) == NULL)
return NULL;
if ((nr = kmalloc(sizeof(*nr), GFP_ATOMIC)) == NULL) {
@@ -759,6 +759,8 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags)
newsk = skb->sk;
newsk->pair = NULL;
+ newsk->socket = newsock;
+ newsk->sleep = &newsock->wait;
sti();
/* Now attach up the new socket */
diff --git a/net/netsyms.c b/net/netsyms.c
index b7809863b..ad51e9a3e 100644
--- a/net/netsyms.c
+++ b/net/netsyms.c
@@ -244,7 +244,6 @@ EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
EXPORT_SYMBOL(__release_sock);
EXPORT_SYMBOL(net_timer);
/* UDP/TCP exported functions for TCPv6 */
-EXPORT_SYMBOL(sysctl_tcp_sack);
EXPORT_SYMBOL(sysctl_tcp_timestamps);
EXPORT_SYMBOL(sysctl_tcp_window_scaling);
EXPORT_SYMBOL(sock_rspace);
@@ -272,11 +271,15 @@ EXPORT_SYMBOL(tcp_slt_array);
EXPORT_SYMBOL(__tcp_inc_slow_timer);
EXPORT_SYMBOL(tcp_statistics);
EXPORT_SYMBOL(tcp_rcv_state_process);
+EXPORT_SYMBOL(tcp_timewait_state_process);
EXPORT_SYMBOL(tcp_do_sendmsg);
EXPORT_SYMBOL(tcp_v4_build_header);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_conn_request);
+EXPORT_SYMBOL(tcp_create_openreq_child);
+EXPORT_SYMBOL(tcp_bucket_create);
+EXPORT_SYMBOL(tcp_bucket_unlock);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_connect);
@@ -290,6 +293,11 @@ EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(xrlim_allow);
+
+EXPORT_SYMBOL(tcp_write_xmit);
+EXPORT_SYMBOL(dev_loopback_xmit);
+EXPORT_SYMBOL(tcp_regs);
+
#endif
#ifdef CONFIG_NETLINK
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a098f59b9..74fc7af82 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -35,6 +35,7 @@
* Alan Cox : sendmsg/recvmsg support.
* Alan Cox : Protocol setting support
* Alexey Kuznetsov : Untied from IPv4 stack.
+ * Cyrus Durgin : Fixed kerneld for kmod.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -54,7 +55,7 @@
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
-#include <linux/kerneld.h>
+#include <linux/kmod.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -710,7 +711,7 @@ static int packet_create(struct socket *sock, int protocol)
sock->state = SS_UNCONNECTED;
MOD_INC_USE_COUNT;
- sk = sk_alloc(AF_PACKET, GFP_KERNEL);
+ sk = sk_alloc(AF_PACKET, GFP_KERNEL, 1);
if (sk == NULL) {
MOD_DEC_USE_COUNT;
return -ENOBUFS;
@@ -831,9 +832,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len,
/* We can't use skb_copy_datagram here */
err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
- if (err)
+ if (err) {
+ err = -EFAULT;
goto out_free;
-
+ }
sk->stamp=skb->stamp;
if (msg->msg_name)
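[Note: the packet_recvmsg() fix reflects a change in memcpy_toiovec()'s contract: a non-zero return is no longer an errno but (per the af_unix comment later in this patch) a count of bytes left uncopied, so the caller must translate it to -EFAULT itself. The shape of the fix:

    err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
    if (err) {
            err = -EFAULT;  /* non-zero residue => user copy faulted */
            goto out_free;
    }
]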
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index eeb396350..a575402c7 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -149,7 +149,7 @@ static struct sock *rose_alloc_sock(void)
struct sock *sk;
rose_cb *rose;
- if ((sk = sk_alloc(AF_ROSE, GFP_ATOMIC)) == NULL)
+ if ((sk = sk_alloc(AF_ROSE, GFP_ATOMIC, 1)) == NULL)
return NULL;
if ((rose = kmalloc(sizeof(*rose), GFP_ATOMIC)) == NULL) {
@@ -847,6 +847,8 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags)
newsk = skb->sk;
newsk->pair = NULL;
+ newsk->socket = newsock;
+ newsk->sleep = &newsock->wait;
sti();
/* Now attach up the new socket */
diff --git a/net/socket.c b/net/socket.c
index 5c9534031..dc77ef3e8 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -76,8 +76,8 @@
#include <linux/init.h>
#include <linux/poll.h>
-#if defined(CONFIG_KERNELD) && defined(CONFIG_NET)
-#include <linux/kerneld.h>
+#if defined(CONFIG_KMOD) && defined(CONFIG_NET)
+#include <linux/kmod.h>
#endif
#include <asm/system.h>
@@ -577,7 +577,7 @@ int sock_create(int family, int type, int protocol, struct socket **res)
if(family<0||family>=NPROTO)
return -EINVAL;
-#if defined(CONFIG_KERNELD) && defined(CONFIG_NET)
+#if defined(CONFIG_KMOD) && defined(CONFIG_NET)
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
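[Note: both here and in af_packet.c the old kerneld message interface gives way to kmod's in-kernel request_module(); the call-site idiom is unchanged. A hedged sketch of demand-loading a protocol family, assuming the conventional "net-pf-%d" module alias; the helper name is hypothetical:

    #include <linux/kernel.h>
    #include <linux/kmod.h>

    /* Hypothetical helper: best-effort load of the module that
     * implements protocol family 'family'.
     */
    static void try_load_family(int family)
    {
    #if defined(CONFIG_KMOD) && defined(CONFIG_NET)
            char module_name[30];

            sprintf(module_name, "net-pf-%d", family);
            request_module(module_name);  /* result deliberately ignored */
    #endif
    }
]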
@@ -814,7 +814,7 @@ restart:
newsock = socki_lookup(inode);
if ((err = get_fd(inode)) < 0)
- goto out_inval;
+ goto out_release;
newsock->file = current->files->fd[err];
if (upeer_sockaddr)
@@ -835,8 +835,6 @@ out:
unlock_kernel();
return err;
-out_inval:
- err = -EINVAL;
out_release:
sock_release(newsock);
goto out_put;
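[Note: the accept() hunk above is an error-code fix rather than a leak fix: out_inval fell through into out_release, so the new socket was already being freed, but get_fd()'s real error (for example -EMFILE when the fd table is full) was clobbered with -EINVAL. After the change the path looks roughly like:

    if ((err = get_fd(inode)) < 0)
            goto out_release;       /* propagate get_fd()'s own error */
    /* ... */
    out_release:
            sock_release(newsock);  /* drop the half-constructed socket */
            goto out_put;
]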
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2fbce16fe..b04072d80 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -364,7 +364,7 @@ static int unix_create(struct socket *sock, int protocol)
default:
return -ESOCKTNOSUPPORT;
}
- sk = sk_alloc(AF_UNIX, GFP_KERNEL);
+ sk = sk_alloc(AF_UNIX, GFP_KERNEL, 1);
if (!sk)
return -ENOMEM;
@@ -1265,7 +1265,9 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size
}
chunk = min(skb->len, size);
- /* N.B. This could fail with -EFAULT */
+	/* N.B. This could fail with a non-zero return value; treat that
+	 * as -EFAULT (the value itself is the number of bytes not copied).
+ */
memcpy_toiovec(msg->msg_iov, skb->data, chunk);
copied += chunk;
size -= chunk;
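[Note: the reworded comment documents behaviour the stream receive path deliberately tolerates: memcpy_toiovec() may fail partway through a chunk, yet copied and size are advanced regardless. A sketch of what explicit handling would look like; the error plumbing here is illustrative only, not what af_unix.c actually does:

    chunk = min(skb->len, size);
    if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
            /* Report what was copied so far, else signal the fault. */
            err = copied ? copied : -EFAULT;
            break;
    }
    copied += chunk;
    size -= chunk;
]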
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 7e3c9cae2..a85aeea5f 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -423,7 +423,7 @@ static struct sock *x25_alloc_socket(void)
struct sock *sk;
x25_cb *x25;
- if ((sk = sk_alloc(AF_X25, GFP_ATOMIC)) == NULL)
+ if ((sk = sk_alloc(AF_X25, GFP_ATOMIC, 1)) == NULL)
return NULL;
if ((x25 = kmalloc(sizeof(*x25), GFP_ATOMIC)) == NULL) {
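[Note: x25_alloc_socket() shows the two-stage allocation all these *_alloc_sock() helpers share: the generic sock first, the protocol control block second, with the second failure branch rolling back the first. Sketched in full, assuming sk_free() is the matching release for sk_alloc() in this tree:

    static struct sock *x25_alloc_socket(void)
    {
            struct sock *sk;
            x25_cb *x25;

            if ((sk = sk_alloc(AF_X25, GFP_ATOMIC, 1)) == NULL)
                    return NULL;

            if ((x25 = kmalloc(sizeof(*x25), GFP_ATOMIC)) == NULL) {
                    sk_free(sk);    /* undo the first allocation */
                    return NULL;
            }
            /* ... link x25 into sk and finish initialisation ... */
            return sk;
    }
]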