author     Ralf Baechle <ralf@linux-mips.org>  2000-02-18 00:24:27 +0000
committer  Ralf Baechle <ralf@linux-mips.org>  2000-02-18 00:24:27 +0000
commit     b9558d5f86c471a125abf1fb3a3882fb053b1f8c
tree       707b53ec64e740a7da87d5f36485e3cd9b1c794e /net/ipv4
parent     b3ac367c7a3e6047abe74817db27e34e759f279f
Merge with Linux 2.3.41.
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c           261
-rw-r--r--  net/ipv4/arp.c                10
-rw-r--r--  net/ipv4/ip_input.c           13
-rw-r--r--  net/ipv4/ip_output.c          12
-rw-r--r--  net/ipv4/ip_sockglue.c         4
-rw-r--r--  net/ipv4/ipconfig.c            9
-rw-r--r--  net/ipv4/proc.c               16
-rw-r--r--  net/ipv4/raw.c                 8
-rw-r--r--  net/ipv4/route.c              49
-rw-r--r--  net/ipv4/syncookies.c         34
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c    34
-rw-r--r--  net/ipv4/tcp.c              1048
-rw-r--r--  net/ipv4/tcp_input.c        1370
-rw-r--r--  net/ipv4/tcp_ipv4.c          951
-rw-r--r--  net/ipv4/tcp_output.c        495
-rw-r--r--  net/ipv4/tcp_timer.c         648
-rw-r--r--  net/ipv4/udp.c               182
17 files changed, 2977 insertions, 2167 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 94fb19f92..bc2c97779 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -5,7 +5,7 @@
*
* PF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.101 2000/01/09 02:19:38 davem Exp $
+ * Version: $Id: af_inet.c,v 1.104 2000/01/18 08:24:14 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -117,7 +117,9 @@
struct linux_mib net_statistics[NR_CPUS*2];
+#ifdef INET_REFCNT_DEBUG
atomic_t inet_sock_nr;
+#endif
extern int raw_get_info(char *, char **, off_t, int);
extern int snmp_get_info(char *, char **, off_t, int);
@@ -159,8 +161,8 @@ void inet_sock_destruct(struct sock *sk)
if (sk->protinfo.af_inet.opt)
kfree(sk->protinfo.af_inet.opt);
dst_release(sk->dst_cache);
- atomic_dec(&inet_sock_nr);
#ifdef INET_REFCNT_DEBUG
+ atomic_dec(&inet_sock_nr);
printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", sk, atomic_read(&inet_sock_nr));
#endif
}
@@ -171,32 +173,28 @@ void inet_sock_release(struct sock *sk)
sk->prot->destroy(sk);
/* Observation: when inet_sock_release is called, processes have
- no access to socket. But net still has.
- Step one, detach it from networking:
-
- A. Remove from hash tables.
+ * no access to socket. But net still has.
+ * Step one, detach it from networking:
+ *
+ * A. Remove from hash tables.
*/
sk->prot->unhash(sk);
/* In this point socket cannot receive new packets,
- but it is possible that some packets are in flight
- because some CPU runs receiver and did hash table lookup
- before we unhashed socket. They will achieve receive queue
- and will be purged by socket destructor.
-
- Also we still have packets pending on receive
- queue and probably, our own packets waiting in device queues.
- sock_destroy will drain receive queue, but transmitted
- packets will delay socket destruction until the last reference
- will be released.
+ * but it is possible that some packets are in flight
+ * because some CPU runs receiver and did hash table lookup
+ * before we unhashed socket. They will achieve receive queue
+ * and will be purged by socket destructor.
+ *
+ * Also we still have packets pending on receive
+ * queue and probably, our own packets waiting in device queues.
+ * sock_destroy will drain receive queue, but transmitted
+ * packets will delay socket destruction until the last reference
+ * will be released.
*/
- write_lock_irq(&sk->callback_lock);
- sk->dead=1;
- sk->socket = NULL;
- sk->sleep = NULL;
- write_unlock_irq(&sk->callback_lock);
+ sock_orphan(sk);
#ifdef INET_REFCNT_DEBUG
if (atomic_read(&sk->refcnt) != 1) {
@@ -222,8 +220,7 @@ int inet_setsockopt(struct socket *sock, int level, int optname,
char *optval, int optlen)
{
struct sock *sk=sock->sk;
- if (sk->prot->setsockopt==NULL)
- return -EOPNOTSUPP;
+
return sk->prot->setsockopt(sk,level,optname,optval,optlen);
}
@@ -239,8 +236,7 @@ int inet_getsockopt(struct socket *sock, int level, int optname,
char *optval, int *optlen)
{
struct sock *sk=sock->sk;
- if (sk->prot->getsockopt==NULL)
- return -EOPNOTSUPP;
+
return sk->prot->getsockopt(sk,level,optname,optval,optlen);
}
@@ -264,14 +260,6 @@ static int inet_autobind(struct sock *sk)
return 0;
}
-/* Listening INET sockets never sleep to wait for memory, so
- * it is completely silly to wake them up on queue space
- * available events. So we hook them up to this dummy callback.
- */
-static void inet_listen_write_space(struct sock *sk)
-{
-}
-
/*
* Move a socket into listening state.
*/
@@ -282,12 +270,13 @@ int inet_listen(struct socket *sock, int backlog)
unsigned char old_state;
int err;
+ lock_sock(sk);
+
+ err = -EINVAL;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
- return -EINVAL;
+ goto out;
- lock_sock(sk);
old_state = sk->state;
- err = -EINVAL;
if (!((1<<old_state)&(TCPF_CLOSE|TCPF_LISTEN)))
goto out;
@@ -295,25 +284,9 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
- sk->state = TCP_LISTEN;
- sk->ack_backlog = 0;
- if (sk->num == 0) {
- if (sk->prot->get_port(sk, 0) != 0) {
- sk->state = old_state;
- err = -EAGAIN;
- goto out;
- }
- sk->sport = htons(sk->num);
- } else {
- /* Not nice, but the simplest solution however */
- if (sk->prev)
- ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0;
- }
-
- sk_dst_reset(sk);
- sk->prot->hash(sk);
- sk->socket->flags |= SO_ACCEPTCON;
- sk->write_space = inet_listen_write_space;
+ err = tcp_listen_start(sk);
+ if (err)
+ goto out;
}
sk->max_ack_backlog = backlog;
err = 0;
@@ -345,10 +318,6 @@ static int inet_create(struct socket *sock, int protocol)
if (protocol && protocol != IPPROTO_TCP)
goto free_and_noproto;
protocol = IPPROTO_TCP;
- if (ipv4_config.no_pmtu_disc)
- sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
- else
- sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT;
prot = &tcp_prot;
sock->ops = &inet_stream_ops;
break;
@@ -359,7 +328,6 @@ static int inet_create(struct socket *sock, int protocol)
goto free_and_noproto;
protocol = IPPROTO_UDP;
sk->no_check = UDP_CSUM_DEFAULT;
- sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
prot=&udp_prot;
sock->ops = &inet_dgram_ops;
break;
@@ -370,7 +338,6 @@ static int inet_create(struct socket *sock, int protocol)
goto free_and_noproto;
prot = &raw_prot;
sk->reuse = 1;
- sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
sk->num = protocol;
sock->ops = &inet_dgram_ops;
if (protocol == IPPROTO_RAW)
@@ -380,23 +347,22 @@ static int inet_create(struct socket *sock, int protocol)
goto free_and_badtype;
}
+ if (ipv4_config.no_pmtu_disc)
+ sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
+ else
+ sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT;
+
sock_init_data(sock,sk);
sk->destruct = inet_sock_destruct;
- sk->zapped=0;
-#ifdef CONFIG_TCP_NAGLE_OFF
- sk->nonagle = 1;
-#endif
+ sk->zapped = 0;
sk->family = PF_INET;
sk->protocol = protocol;
sk->prot = prot;
sk->backlog_rcv = prot->backlog_rcv;
- sk->timer.data = (unsigned long)sk;
- sk->timer.function = &tcp_keepalive_timer;
-
sk->protinfo.af_inet.ttl=sysctl_ip_default_ttl;
sk->protinfo.af_inet.mc_loop=1;
@@ -404,7 +370,9 @@ static int inet_create(struct socket *sock, int protocol)
sk->protinfo.af_inet.mc_index=0;
sk->protinfo.af_inet.mc_list=NULL;
+#ifdef INET_REFCNT_DEBUG
atomic_inc(&inet_sock_nr);
+#endif
if (sk->num) {
/* It assumes that any protocol which allows
@@ -469,11 +437,8 @@ int inet_release(struct socket *sock)
* linger..
*/
timeout = 0;
- if (sk->linger && !(current->flags & PF_EXITING)) {
- timeout = HZ * sk->lingertime;
- if (!timeout)
- timeout = MAX_SCHEDULE_TIMEOUT;
- }
+ if (sk->linger && !(current->flags & PF_EXITING))
+ timeout = sk->lingertime;
sock->sk = NULL;
sk->prot->close(sk, timeout);
}
@@ -496,10 +461,6 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return -EINVAL;
chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
- if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) {
- return -EADDRNOTAVAIL; /* Source address MUST be ours! */
- }
snum = ntohs(addr->sin_port);
if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
@@ -555,25 +516,29 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
return sk->prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
}
-static void inet_wait_for_connect(struct sock *sk)
+static long inet_wait_for_connect(struct sock *sk, long timeo)
{
DECLARE_WAITQUEUE(wait, current);
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(sk->sleep, &wait);
+ /* Basic assumption: if someone sets sk->err, he _must_
+ * change state of the socket from TCP_SYN_*.
+ * Connect() does not allow to get error notifications
+ * without closing the socket.
+ */
while ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
- if (signal_pending(current))
- break;
- if (sk->err)
- break;
release_sock(sk);
- schedule();
+ timeo = schedule_timeout(timeo);
lock_sock(sk);
+ if (signal_pending(current) || !timeo)
+ break;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
remove_wait_queue(sk->sleep, &wait);
+ return timeo;
}
/*
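The wait loop above now carries a timeout obtained from sock_sndtimeo() instead of sleeping indefinitely. From user space that timeout comes from the SO_SNDTIMEO/SO_RCVTIMEO socket options; a minimal illustrative program, not part of this patch, with an arbitrary 5-second value:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
        struct timeval tv;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;

        memset(&tv, 0, sizeof(tv));
        tv.tv_sec = 5;  /* illustrative timeout */

        /* Consumed via sock_sndtimeo(): bounds connect() and blocking send(). */
        setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
        /* Consumed via sock_rcvtimeo(): bounds blocking recv(). */
        setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));

        printf("timeouts set on fd %d\n", fd);
        close(fd);
        return 0;
}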
@@ -586,16 +551,16 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
{
struct sock *sk=sock->sk;
int err;
+ long timeo;
+
+ lock_sock(sk);
if (uaddr->sa_family == AF_UNSPEC) {
- lock_sock(sk);
err = sk->prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- release_sock(sk);
- return err;
+ goto out;
}
- lock_sock(sk);
switch (sock->state) {
default:
err = -EINVAL;
@@ -604,40 +569,58 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- if (tcp_established(sk->state)) {
- sock->state = SS_CONNECTED;
- err = 0;
- goto out;
- }
- if (sk->err)
- goto sock_error;
err = -EALREADY;
- if (flags & O_NONBLOCK)
- goto out;
+ /* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED:
+ err = -EISCONN;
+ if (sk->state != TCP_CLOSE)
+ goto out;
+
+ err = -EAGAIN;
+ if (sk->num == 0) {
+ if (sk->prot->get_port(sk, 0) != 0)
+ goto out;
+ sk->sport = htons(sk->num);
+ }
+
err = sk->prot->connect(sk, uaddr, addr_len);
if (err < 0)
goto out;
+
sock->state = SS_CONNECTING;
- }
- if (sk->state > TCP_FIN_WAIT2)
- goto sock_error;
+ /* Just entered SS_CONNECTING state; the only
+ * difference is that return value in non-blocking
+ * case is EINPROGRESS, rather than EALREADY.
+ */
+ err = -EINPROGRESS;
+ break;
+ }
- err = -EINPROGRESS;
- if (!tcp_established(sk->state) && (flags & O_NONBLOCK))
- goto out;
+ timeo = sock_sndtimeo(sk, flags&O_NONBLOCK);
if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
- inet_wait_for_connect(sk);
+ /* Error code is set above */
+ if (!timeo || !inet_wait_for_connect(sk, timeo))
+ goto out;
+
err = -ERESTARTSYS;
if (signal_pending(current))
goto out;
}
- if (sk->err && !tcp_established(sk->state))
- goto sock_error;
+ /* Connection was closed by RST, timeout, ICMP error
+ * or another process disconnected us.
+ */
+ if (sk->state == TCP_CLOSE)
+ goto sock_error;
+
+ /* sk->err may be not zero now, if RECVERR was ordered by user
+ * and error was received after socket entered established state.
+ * Hence, it is handled normally after connect() return successfully.
+ */
+
sock->state = SS_CONNECTED;
err = 0;
out:
@@ -647,11 +630,9 @@ out:
sock_error:
err = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
- if (sk->prot->disconnect(sk, O_NONBLOCK))
+ if (sk->prot->disconnect(sk, flags))
sock->state = SS_DISCONNECTING;
- release_sock(sk);
-
- return err;
+ goto out;
}
/*
@@ -671,11 +652,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
BUG_TRAP((1<<sk2->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE));
- write_lock_irq(&sk2->callback_lock);
- sk2->sleep = &newsock->wait;
- newsock->sk = sk2;
- sk2->socket = newsock;
- write_unlock_irq(&sk2->callback_lock);
+ sock_graft(sk2, newsock);
newsock->state = SS_CONNECTED;
release_sock(sk2);
@@ -749,7 +726,7 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size,
int inet_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
- int err;
+ int err = 0;
/* This should really check to make sure
* the socket is a TCP socket. (WHY AC...)
@@ -759,35 +736,45 @@ int inet_shutdown(struct socket *sock, int how)
2->3 */
if ((how & ~SHUTDOWN_MASK) || how==0) /* MAXINT->0 */
return -EINVAL;
- if (!sk)
- return -ENOTCONN;
lock_sock(sk);
- if (sock->state == SS_CONNECTING && tcp_established(sk->state))
- sock->state = SS_CONNECTED;
- err = -ENOTCONN;
- if (!tcp_connected(sk->state))
- goto out;
- sk->shutdown |= how;
- if (sk->prot->shutdown)
- sk->prot->shutdown(sk, how);
+ if (sock->state == SS_CONNECTING) {
+ if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE))
+ sock->state = SS_DISCONNECTING;
+ else
+ sock->state = SS_CONNECTED;
+ }
+
+ switch (sk->state) {
+ default:
+ sk->shutdown |= how;
+ if (sk->prot->shutdown)
+ sk->prot->shutdown(sk, how);
+ break;
+ case TCP_CLOSE:
+ err = -ENOTCONN;
+ break;
+
+ /* Remaining two branches are temporary solution for missing
+ * close() in multithreaded environment. It is _not_ a good idea,
+ * but we have no choice until close() is repaired at VFS level.
+ */
+ case TCP_LISTEN:
+ if (!(how & RCV_SHUTDOWN))
+ break;
+ /* Fall through */
+ case TCP_SYN_SENT:
+ err = sk->prot->disconnect(sk, O_NONBLOCK);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ break;
+ }
+
/* Wake up anyone sleeping in poll. */
sk->state_change(sk);
- err = 0;
-out:
release_sock(sk);
return err;
}
-unsigned int inet_poll(struct file * file, struct socket *sock, poll_table *wait)
-{
- struct sock *sk = sock->sk;
-
- if (sk->prot->poll == NULL)
- return(0);
- return sk->prot->poll(file, sock, wait);
-}
-
/*
* ioctl() calls you can issue on an INET socket. Most of these are
* device configuration and stuff and very rarely used. Some ioctls
@@ -909,7 +896,7 @@ struct proto_ops inet_stream_ops = {
sock_no_socketpair,
inet_accept,
inet_getname,
- inet_poll,
+ tcp_poll,
inet_ioctl,
inet_listen,
inet_shutdown,
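The reworked inet_stream_connect() above pins down the non-blocking semantics: the first connect() on an unconnected socket returns EINPROGRESS, a repeated call returns EALREADY, and the final status is read back once the socket becomes writable. A hedged userspace sketch of that sequence, not part of the patch; the address, port and poll timeout are placeholders:

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in sa;
        struct pollfd pfd;
        int fd, err = 0;
        socklen_t len = sizeof(err);

        fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0)
                return 1;
        fcntl(fd, F_SETFL, O_NONBLOCK);

        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;
        sa.sin_port = htons(80);                        /* example port */
        inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);  /* example address */

        if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0 &&
            errno != EINPROGRESS) {
                perror("connect");
                return 1;
        }

        /* A second connect() while the handshake is pending returns EALREADY. */
        pfd.fd = fd;
        pfd.events = POLLOUT;
        pfd.revents = 0;
        if (poll(&pfd, 1, 5000) == 1 &&
            getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0)
                printf("connect finished: %s\n", err ? strerror(err) : "ok");

        close(fd);
        return 0;
}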
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 591f3cceb..588cdf030 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,6 +1,6 @@
/* linux/net/inet/arp.c
*
- * Version: $Id: arp.c,v 1.83 1999/12/15 22:39:03 davem Exp $
+ * Version: $Id: arp.c,v 1.84 2000/01/18 08:24:14 davem Exp $
*
* Copyright (C) 1994 by Florian La Roche
*
@@ -487,7 +487,9 @@ void arp_send(int type, int ptype, u32 dest_ip,
/*
* Fill the device header for the ARP frame
*/
- dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len);
+ if (dev->hard_header &&
+ dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
+ goto out;
/*
* Fill out the arp protocol part.
@@ -552,6 +554,10 @@ void arp_send(int type, int ptype, u32 dest_ip,
skb->dev = dev;
dev_queue_xmit(skb);
+ return;
+
+out:
+ kfree_skb(skb);
}
static void parp_redo(struct sk_buff *skb)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 11a8c319b..23389d249 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) module.
*
- * Version: $Id: ip_input.c,v 1.44 2000/01/09 02:19:30 davem Exp $
+ * Version: $Id: ip_input.c,v 1.45 2000/01/16 05:11:22 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -317,13 +317,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
#ifdef CONFIG_NET_CLS_ROUTE
if (skb->dst->tclassid) {
+ struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
u32 idx = skb->dst->tclassid;
- write_lock(&ip_rt_acct_lock);
- ip_rt_acct[idx&0xFF].o_packets++;
- ip_rt_acct[idx&0xFF].o_bytes+=skb->len;
- ip_rt_acct[(idx>>16)&0xFF].i_packets++;
- ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len;
- write_unlock(&ip_rt_acct_lock);
+ st[idx&0xFF].o_packets++;
+ st[idx&0xFF].o_bytes+=skb->len;
+ st[(idx>>16)&0xFF].i_packets++;
+ st[(idx>>16)&0xFF].i_bytes+=skb->len;
}
#endif
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 59e6ff865..2a4e3cf41 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) output module.
*
- * Version: $Id: ip_output.c,v 1.77 2000/01/09 02:19:31 davem Exp $
+ * Version: $Id: ip_output.c,v 1.78 2000/01/16 05:11:22 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -149,8 +149,8 @@ output_maybe_reroute(struct sk_buff *skb)
/*
* Add an ip header to a skbuff and send it out.
*/
-void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
- u32 saddr, u32 daddr, struct ip_options *opt)
+int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+ u32 saddr, u32 daddr, struct ip_options *opt)
{
struct rtable *rt = (struct rtable *)skb->dst;
struct iphdr *iph;
@@ -182,8 +182,8 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
ip_send_check(iph);
/* Send it out. */
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- output_maybe_reroute);
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ output_maybe_reroute);
}
static inline int ip_finish_output2(struct sk_buff *skb)
@@ -257,7 +257,7 @@ int ip_mc_output(struct sk_buff *skb)
{
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, newskb, NULL,
+ NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
newskb->dev,
ip_dev_loopback_xmit);
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c618689b2..90b74447f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -5,7 +5,7 @@
*
* The IP to API glue.
*
- * Version: $Id: ip_sockglue.c,v 1.46 2000/01/09 02:19:32 davem Exp $
+ * Version: $Id: ip_sockglue.c,v 1.47 2000/01/16 05:11:23 davem Exp $
*
* Authors: see ip.c
*
@@ -415,7 +415,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (sk->family == PF_INET ||
- ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT)
+ (!((1<<sk->state)&(TCPF_LISTEN|TCPF_CLOSE))
&& sk->daddr != LOOPBACK4_IPV6)) {
#endif
if (opt)
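The ip_setsockopt() hunk above replaces a helper with the "(1<<sk->state) & mask" test used throughout this merge: each TCPF_* flag is the corresponding TCP_* state number turned into a single bit, so one AND checks membership in a whole set of states. A tiny standalone sketch of the idiom; the enum values mirror the kernel's numbering for illustration only:

#include <stdio.h>

enum { TCP_ESTABLISHED = 1, TCP_SYN_SENT, TCP_SYN_RECV, TCP_FIN_WAIT1,
       TCP_FIN_WAIT2, TCP_TIME_WAIT, TCP_CLOSE, TCP_CLOSE_WAIT,
       TCP_LAST_ACK, TCP_LISTEN, TCP_CLOSING };

#define TCPF_LISTEN     (1 << TCP_LISTEN)
#define TCPF_CLOSE      (1 << TCP_CLOSE)

int main(void)
{
        int state = TCP_ESTABLISHED;

        /* "Neither LISTEN nor CLOSE", as in the ip_setsockopt() hunk. */
        if (!((1 << state) & (TCPF_LISTEN | TCPF_CLOSE)))
                printf("state %d may carry data\n", state);
        return 0;
}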
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 4d2195312..d4d556cb5 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -534,7 +534,14 @@ static void __init ic_bootp_send_if(struct ic_device *d, u32 jiffies)
/* Construct BOOTP header */
b->op = BOOTP_REQUEST;
- b->htype = dev->type;
+ if (dev->type < 256) /* check for false types */
+ b->htype = dev->type;
+ else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
+ b->htype = ARPHRD_IEEE802;
+ else {
+ printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+ b->htype = dev->type; /* can cause undefined behavior */
+ }
b->hlen = dev->addr_len;
memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
b->secs = htons(jiffies / HZ);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b3e86f58c..d6a7c57f5 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -7,7 +7,7 @@
* PROC file system. It is mainly used for debugging and
* statistics.
*
- * Version: $Id: proc.c,v 1.38 2000/01/09 02:19:30 davem Exp $
+ * Version: $Id: proc.c,v 1.41 2000/01/21 23:45:57 davem Exp $
*
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
@@ -71,8 +71,9 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length)
int len = socket_get_info(buffer,start,offset,length);
- len += sprintf(buffer+len,"TCP: inuse %d\n",
- fold_prot_inuse(&tcp_prot));
+ len += sprintf(buffer+len,"TCP: inuse %d orphan %d tw %d\n",
+ fold_prot_inuse(&tcp_prot),
+ atomic_read(&tcp_orphan_count), tcp_tw_count);
len += sprintf(buffer+len,"UDP: inuse %d\n",
fold_prot_inuse(&udp_prot));
len += sprintf(buffer+len,"RAW: inuse %d\n",
@@ -163,7 +164,14 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length)
len = sprintf(buffer,
"TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed"
" EmbryonicRsts PruneCalled RcvPruned OfoPruned"
- " OutOfWindowIcmps LockDroppedIcmps\n"
+ " OutOfWindowIcmps LockDroppedIcmps"
+ " TW TWRecycled TWKilled"
+ " PAWSPassive PAWSActive PAWSEstab"
+ " DelayedACKs DelayedACKLocked DelayedACKLost"
+ " ListenOverflows ListenDrops"
+ " TCPPrequeued TCPDirectCopyFromBacklog"
+ " TCPDirectCopyFromPrequeue TCPPrequeueDropped"
+ " TCPHPHits TCPHPHitsToUser\n"
"TcpExt:");
for (i=0; i<offsetof(struct linux_mib, __pad)/sizeof(unsigned long); i++)
len += sprintf(buffer+len, " %lu", fold_field((unsigned long*)net_statistics, sizeof(struct linux_mib), i));
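afinet_get_info() now reports orphaned sockets and TIME-WAIT buckets next to the in-use count. A small sketch that prints that line from procfs; the file name /proc/net/sockstat is an assumption based on how this routine is exposed in kernels of this vintage:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/sockstat", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (strncmp(line, "TCP:", 4) == 0)
                        fputs(line, stdout);    /* e.g. "TCP: inuse 12 orphan 0 tw 3" */
        fclose(f);
        return 0;
}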
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6fc5e59c5..e9aa1952a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -5,7 +5,7 @@
*
* RAW - implementation of IP "raw" sockets.
*
- * Version: $Id: raw.c,v 1.46 2000/01/09 02:19:30 davem Exp $
+ * Version: $Id: raw.c,v 1.48 2000/01/18 08:24:15 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -648,10 +648,6 @@ struct proto raw_prot = {
udp_connect, /* connect */
udp_disconnect, /* disconnect */
NULL, /* accept */
- NULL, /* retransmit */
- NULL, /* write_wakeup */
- NULL, /* read_wakeup */
- datagram_poll, /* poll */
#ifdef CONFIG_IP_MROUTE
ipmr_ioctl, /* ioctl */
#else
@@ -669,7 +665,5 @@ struct proto raw_prot = {
raw_v4_hash, /* hash */
raw_v4_unhash, /* unhash */
NULL, /* get_port */
- 128, /* max_header */
- 0, /* retransmits */
"RAW", /* name */
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index add42730d..bbc6ec111 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.78 2000/01/13 00:06:58 davem Exp $
+ * Version: $Id: route.c,v 1.80 2000/01/21 06:37:27 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -1178,6 +1178,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->u.dst.output= ip_rt_bug;
atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
@@ -1385,6 +1386,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
goto e_nobufs;
atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
@@ -1462,6 +1464,7 @@ local_input:
rth->u.dst.output= ip_rt_bug;
atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
@@ -1815,6 +1818,7 @@ make_route:
goto e_nobufs;
atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
rth->key.dst = daddr;
rth->key.tos = tos;
rth->key.src = saddr;
@@ -2208,8 +2212,7 @@ ctl_table ipv4_route_table[] = {
#endif
#ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct ip_rt_acct[256];
-rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED;
+struct ip_rt_acct *ip_rt_acct;
#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
@@ -2217,14 +2220,34 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
{
*start=buffer;
- if (offset + length > sizeof(ip_rt_acct)) {
- length = sizeof(ip_rt_acct) - offset;
+ if ((offset&3) || (length&3))
+ return -EIO;
+
+ if (offset + length >= sizeof(struct ip_rt_acct)*256) {
+ length = sizeof(struct ip_rt_acct)*256 - offset;
*eof = 1;
}
if (length > 0) {
- read_lock_bh(&ip_rt_acct_lock);
- memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
- read_unlock_bh(&ip_rt_acct_lock);
+ u32 *dst = (u32*)buffer;
+ u32 *src = (u32*)(((u8*)ip_rt_acct) + offset);
+
+ memcpy(dst, src, length);
+
+#ifdef __SMP__
+ if (smp_num_cpus > 1) {
+ int i;
+ int cnt = length/4;
+
+ for (i=1; i<smp_num_cpus; i++) {
+ int k;
+
+ src += (256/4)*sizeof(struct ip_rt_acct);
+
+ for (k=0; k<cnt; k++)
+ dst[k] += src[k];
+ }
+ }
+#endif
return length;
}
return 0;
@@ -2236,6 +2259,16 @@ void __init ip_rt_init(void)
{
int i, order, goal;
+#ifdef CONFIG_NET_CLS_ROUTE
+ for (order=0;
+ (PAGE_SIZE<<order) < 256*sizeof(ip_rt_acct)*smp_num_cpus; order++)
+ /* NOTHING */;
+ ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
+ if (!ip_rt_acct)
+ panic("IP: failed to allocate ip_rt_acct\n");
+ memset(ip_rt_acct, 0, PAGE_SIZE<<order);
+#endif
+
ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
sizeof(struct rtable),
0, SLAB_HWCACHE_ALIGN,
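Together with the ip_rcv_finish() hunk earlier, this drops the global ip_rt_acct_lock in favour of one accounting slice per CPU: each writer bumps only its own slice and the proc reader folds the slices at read time. A userspace sketch of the same layout, not the kernel code itself; NCPU, the slot count and the field set are illustrative:

#include <stdio.h>

#define NCPU    4
#define SLOTS   256

struct acct {
        unsigned long o_packets;
        unsigned long o_bytes;
};

static struct acct acct[NCPU][SLOTS];

/* Writer side: like ip_rcv_finish(), index by CPU first, no lock taken. */
static void account(int cpu, unsigned int idx, unsigned long bytes)
{
        acct[cpu][idx & 0xFF].o_packets++;
        acct[cpu][idx & 0xFF].o_bytes += bytes;
}

/* Reader side: like ip_rt_acct_read(), sum the per-CPU slices. */
static struct acct read_slot(unsigned int idx)
{
        struct acct sum = { 0, 0 };
        int cpu;

        for (cpu = 0; cpu < NCPU; cpu++) {
                sum.o_packets += acct[cpu][idx & 0xFF].o_packets;
                sum.o_bytes   += acct[cpu][idx & 0xFF].o_bytes;
        }
        return sum;
}

int main(void)
{
        account(0, 7, 1500);
        account(1, 7, 40);
        printf("slot 7: %lu packets, %lu bytes\n",
               read_slot(7).o_packets, read_slot(7).o_bytes);
        return 0;
}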
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e82233cfd..d218c3bdb 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * $Id: syncookies.c,v 1.10 2000/01/09 02:19:35 davem Exp $
+ * $Id: syncookies.c,v 1.11 2000/01/16 05:11:27 davem Exp $
*
* Missing: IPv6 support.
*/
@@ -102,23 +102,16 @@ static inline struct sock *
get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req,
struct dst_entry *dst)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sock *child;
- /* Oops! It was missing, syn_recv_sock decreases it. */
- tp->syn_backlog++;
+ child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
+ if (child)
+ tcp_acceptq_queue(sk, req, child);
+ else
+ tcp_openreq_free(req);
- sk = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
- if (sk) {
- req->sk = sk;
-
- /* Queue up for accept() */
- tcp_synq_queue(tp, req);
- } else {
- tp->syn_backlog--;
- req->class->destructor(req);
- tcp_openreq_free(req);
- }
- return sk;
+ return child;
}
struct sock *
@@ -171,9 +164,9 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt)
}
}
}
-
+
req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
- req->wscale_ok = 0;
+ req->wscale_ok = req->sack_ok = 0;
req->expires = 0UL;
req->retrans = 0;
@@ -189,8 +182,8 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt)
req->af.v4_req.loc_addr,
sk->protinfo.af_inet.tos | RTO_CONN,
0)) {
- tcp_openreq_free(req);
- return NULL;
+ tcp_openreq_free(req);
+ return NULL;
}
/* Try to redo what tcp_v4_send_synack did. */
@@ -198,6 +191,7 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt)
tcp_select_initial_window(tcp_full_space(sk),req->mss,
&req->rcv_wnd, &req->window_clamp,
0, &rcv_wscale);
+ /* BTW win scale with syncookies is 0 by definition */
req->rcv_wscale = rcv_wscale;
return get_cookie_sock(sk, skb, req, &rt->u.dst);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 9465e4021..d9416525b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1,7 +1,7 @@
/*
* sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
*
- * $Id: sysctl_net_ipv4.c,v 1.42 2000/01/09 02:19:37 davem Exp $
+ * $Id: sysctl_net_ipv4.c,v 1.43 2000/01/16 05:11:27 davem Exp $
*
* Begun April 1, 1996, Mike Shaver.
* Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
@@ -41,26 +41,6 @@ extern int sysctl_ipfrag_time;
/* From ip_output.c */
extern int sysctl_ip_dynaddr;
-/* From ip_masq.c */
-extern int sysctl_ip_masq_debug;
-
-extern int sysctl_tcp_timestamps;
-extern int sysctl_tcp_window_scaling;
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_retrans_collapse;
-extern int sysctl_tcp_keepalive_time;
-extern int sysctl_tcp_keepalive_probes;
-extern int sysctl_tcp_retries1;
-extern int sysctl_tcp_retries2;
-extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_syncookies;
-extern int sysctl_tcp_syn_retries;
-extern int sysctl_tcp_stdurg;
-extern int sysctl_tcp_rfc1337;
-extern int sysctl_tcp_syn_taildrop;
-extern int sysctl_max_syn_backlog;
-extern int sysctl_tcp_tw_recycle;
-
/* From icmp.c */
extern int sysctl_icmp_destunreach_time;
extern int sysctl_icmp_timeexceed_time;
@@ -142,6 +122,12 @@ ctl_table ipv4_table[] = {
&proc_dointvec},
{NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries",
&sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_TCP_SYNACK_RETRIES, "tcp_synack_retries",
+ &sysctl_tcp_synack_retries, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_TCP_MAX_ORPHANS, "tcp_max_orphans",
+ &sysctl_tcp_max_orphans, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets",
+ &sysctl_tcp_max_tw_buckets, sizeof(int), 0644, NULL, &proc_dointvec},
{NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh",
&sysctl_ipfrag_high_thresh, sizeof(int), 0644, NULL, &proc_dointvec},
{NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh",
@@ -172,10 +158,10 @@ ctl_table ipv4_table[] = {
{NET_TCP_SYNCOOKIES, "tcp_syncookies",
&sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec},
#endif
-#ifdef CONFIG_TCP_TW_RECYCLE
{NET_TCP_TW_RECYCLE, "tcp_tw_recycle",
&sysctl_tcp_tw_recycle, sizeof(int), 0644, NULL, &proc_dointvec},
-#endif
+ {NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow",
+ &sysctl_tcp_abort_on_overflow, sizeof(int), 0644, NULL, &proc_dointvec},
{NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg,
sizeof(int), 0644, NULL, &proc_dointvec},
{NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337,
@@ -221,6 +207,8 @@ ctl_table ipv4_table[] = {
{NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime",
&inet_peer_gc_maxtime, sizeof(int), 0644, NULL,
&proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries",
+ &sysctl_tcp_orphan_retries, sizeof(int), 0644, NULL, &proc_dointvec},
{0}
};
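The new tunables (tcp_synack_retries, tcp_max_orphans, tcp_max_tw_buckets, tcp_abort_on_overflow, tcp_orphan_retries) are plain integer sysctls. A minimal reader for one of them; the /proc/sys/net/ipv4/ path is inferred from the table names above and is an assumption:

#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_max_orphans", "r");

        if (!f) {
                perror("tcp_max_orphans");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("tcp_max_orphans = %s", buf);
        fclose(f);
        return 0;
}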
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e24e19a4..e01892326 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.153 2000/01/09 02:19:33 davem Exp $
+ * Version: $Id: tcp.c,v 1.160 2000/01/24 18:40:32 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -202,6 +202,8 @@
* Eric Schenk : Fix fast close down bug with
* shutdown() followed by close().
* Andi Kleen : Make poll agree with SIGIO
+ * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
+ * lingertime == 0 (RFC 793 ABORT Call)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
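The new changelog entry above documents SO_LINGER with l_onoff == 1 and l_linger == 0 as the RFC 793 ABORT call: close() discards queued data and sends a reset instead of the normal FIN handshake. A hedged userspace helper showing how an application requests that behaviour; call it on a connected TCP socket:

#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static void close_with_reset(int fd)
{
        struct linger lin;

        memset(&lin, 0, sizeof(lin));
        lin.l_onoff = 1;        /* linger enabled ...             */
        lin.l_linger = 0;       /* ... with a zero timeout: ABORT */
        setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
        close(fd);              /* sends RST rather than FIN */
}

After this, the peer sees ECONNRESET rather than a clean end-of-file.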
@@ -432,113 +434,14 @@ kmem_cache_t *tcp_openreq_cachep;
kmem_cache_t *tcp_bucket_cachep;
kmem_cache_t *tcp_timewait_cachep;
-/*
- * Find someone to 'accept'. Must be called with
- * the listening socket locked.
- */
-
-static struct open_request *tcp_find_established(struct tcp_opt *tp,
- struct open_request **prevp)
-{
- struct open_request *req = tp->syn_wait_queue;
- struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
- while(req) {
- if (req->sk) {
- if((1 << req->sk->state) &
- ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
- break;
- }
- prev = req;
- req = req->dl_next;
- }
- *prevp = prev;
- return req;
-}
-
-/*
- * Walk down the receive queue counting readable data.
- *
- * Must be called with the socket lock held.
- */
-
-static int tcp_readable(struct sock *sk)
-{
- unsigned long counted;
- unsigned long amount;
- struct sk_buff *skb;
- int sum;
-
- SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
-
- skb = skb_peek(&sk->receive_queue);
- if (skb == NULL) {
- SOCK_DEBUG(sk, "empty\n");
- return(0);
- }
-
- counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */
- amount = 0;
-
- /* Do until a push or until we are out of data. */
- do {
- /* Found a hole so stops here. */
- if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */
- break;
-
- /* Length - header but start from where we are up to
- * avoid overlaps.
- */
- sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
- if (sum >= 0) {
- /* Add it up, move on. */
- amount += sum;
- counted += sum;
- if (skb->h.th->syn)
- counted++;
- }
-
- /* Don't count urg data ... but do it in the right place!
- * Consider: "old_data (ptr is here) URG PUSH data"
- * The old code would stop at the first push because
- * it counted the urg (amount==1) and then does amount--
- * *after* the loop. This means tcp_readable() always
- * returned zero if any URG PUSH was in the queue, even
- * though there was normal data available. If we subtract
- * the urg data right here, we even get it to work for more
- * than one URG PUSH skb without normal data.
- * This means that poll() finally works now with urg data
- * in the queue. Note that rlogin was never affected
- * because it doesn't use poll(); it uses two processes
- * and a blocking read(). And the queue scan in tcp_read()
- * was correct. Mike <pall@rz.uni-karlsruhe.de>
- */
-
- /* Don't count urg data. */
- if (skb->h.th->urg)
- amount--;
-#if 0
- if (amount && skb->h.th->psh) break;
-#endif
- skb = skb->next;
- } while(skb != (struct sk_buff *)&sk->receive_queue);
-
- SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
- return(amount);
-}
+atomic_t tcp_orphan_count = ATOMIC_INIT(0);
/*
* LISTEN is a special case for poll..
*/
-static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
+static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
{
- struct open_request *req, *dummy;
-
- lock_sock(sk);
- req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
- release_sock(sk);
- if (req)
- return POLLIN | POLLRDNORM;
- return 0;
+ return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
}
/*
@@ -585,9 +488,25 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
* if you don't tell them that something has hung up!
*
* Check-me.
+ *
+ * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
+ * our fs/select.c). It means that after we received EOF,
+ * poll always returns immediately, making impossible poll() on write()
+ * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+ * if and only if shutdown has been made in both directions.
+ * Actually, it is interesting to look how Solaris and DUX
+ * solve this dilemma. I would prefer, if PULLHUP were maskable,
+ * then we could set it on SND_SHUTDOWN. BTW examples given
+ * in Stevens' books assume exactly this behaviour, it explains
+ * why PULLHUP is incompatible with POLLOUT. --ANK
+ *
+ * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
+ * blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->shutdown & RCV_SHUTDOWN)
+ if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
mask |= POLLHUP;
+ if (sk->shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM;
/* Connected? */
if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
@@ -605,7 +524,7 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
}
}
- if (tp->urg_data & URG_VALID)
+ if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
}
return mask;
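The expanded tcp_poll() comment spells out the reporting rules: a peer FIN shows up as readable (POLLIN|POLLRDNORM, followed by a zero-byte read), while POLLHUP is raised only once both directions are shut down or the socket has reached CLOSE. A small helper sketching the userspace view, to be called on an existing TCP socket:

#include <poll.h>
#include <stdio.h>

static void report(int fd)
{
        struct pollfd pfd;

        pfd.fd = fd;
        pfd.events = POLLIN | POLLOUT;
        pfd.revents = 0;

        if (poll(&pfd, 1, 0) < 0)
                return;
        if (pfd.revents & POLLHUP)
                printf("fd %d: fully shut down or closed\n", fd);
        else if (pfd.revents & POLLIN)
                printf("fd %d: data or EOF to read\n", fd);
        else if (pfd.revents & POLLOUT)
                printf("fd %d: writable\n", fd);
}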
@@ -631,32 +550,48 @@ void tcp_write_space(struct sock *sk)
read_unlock(&sk->callback_lock);
}
+/* Listening TCP sockets never sleep to wait for memory, so
+ * it is completely silly to wake them up on queue space
+ * available events. So we hook them up to this dummy callback.
+ */
+static void tcp_listen_write_space(struct sock *sk)
+{
+}
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int answ;
switch(cmd) {
- case TIOCINQ:
-#ifdef FIXME /* FIXME: */
- case FIONREAD:
-#endif
+ case SIOCINQ:
if (sk->state == TCP_LISTEN)
return(-EINVAL);
+
lock_sock(sk);
- answ = tcp_readable(sk);
+ if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
+ answ = 0;
+ else if (sk->urginline || !tp->urg_data ||
+ before(tp->urg_seq,tp->copied_seq) ||
+ !before(tp->urg_seq,tp->rcv_nxt))
+ answ = tp->rcv_nxt - tp->copied_seq;
+ else
+ answ = tp->urg_seq - tp->copied_seq;
release_sock(sk);
break;
case SIOCATMARK:
{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
break;
}
- case TIOCOUTQ:
+ case SIOCOUTQ:
if (sk->state == TCP_LISTEN)
return(-EINVAL);
- answ = sock_wspace(sk);
+
+ if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = tp->write_seq - tp->snd_una;
break;
default:
return(-ENOIOCTLCMD);
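tcp_ioctl() now answers SIOCINQ from rcv_nxt - copied_seq and SIOCOUTQ from write_seq - snd_una, replacing the old tcp_readable() queue walk. From user space the two ioctls report unread bytes and bytes not yet acknowledged by the peer; an illustrative helper for a connected socket:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>      /* SIOCINQ, SIOCOUTQ */

static void queue_sizes(int fd)
{
        int inq = 0, outq = 0;

        if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
                printf("fd %d: %d bytes unread, %d bytes unacked\n",
                       fd, inq, outq);
}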
@@ -665,12 +600,131 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return put_user(answ, (int *)arg);
}
+
+int tcp_listen_start(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct tcp_listen_opt *lopt;
+
+ sk->max_ack_backlog = 0;
+ sk->ack_backlog = 0;
+ tp->accept_queue = NULL;
+ tp->syn_wait_lock = RW_LOCK_UNLOCKED;
+
+ lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
+ if (!lopt)
+ return -ENOMEM;
+
+ memset(lopt, 0, sizeof(struct tcp_listen_opt));
+ for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
+ if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
+ break;
+
+ write_lock_bh(&tp->syn_wait_lock);
+ tp->listen_opt = lopt;
+ write_unlock_bh(&tp->syn_wait_lock);
+
+ sk->state = TCP_LISTEN;
+ if (sk->num == 0) {
+ if (sk->prot->get_port(sk, 0) != 0) {
+ sk->state = TCP_CLOSE;
+ write_lock_bh(&tp->syn_wait_lock);
+ tp->listen_opt = NULL;
+ write_unlock_bh(&tp->syn_wait_lock);
+ kfree(lopt);
+ return -EAGAIN;
+ }
+ sk->sport = htons(sk->num);
+ } else {
+ if (sk->prev)
+ ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0;
+ }
+
+ sk_dst_reset(sk);
+ sk->prot->hash(sk);
+ sk->socket->flags |= SO_ACCEPTCON;
+ sk->write_space = tcp_listen_write_space;
+
+ return 0;
+}
+
+/*
+ * This routine closes sockets which have been at least partially
+ * opened, but not yet accepted.
+ */
+
+static void tcp_listen_stop (struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ struct open_request *acc_req = tp->accept_queue;
+ struct open_request *req;
+ int i;
+
+ tcp_delete_keepalive_timer(sk);
+
+ /* make all the listen_opt local to us */
+ write_lock_bh(&tp->syn_wait_lock);
+ tp->listen_opt =NULL;
+ write_unlock_bh(&tp->syn_wait_lock);
+ tp->accept_queue = NULL;
+
+ if (lopt->qlen) {
+ for (i=0; i<TCP_SYNQ_HSIZE; i++) {
+ while ((req = lopt->syn_table[i]) != NULL) {
+ lopt->syn_table[i] = req->dl_next;
+ lopt->qlen--;
+ tcp_openreq_free(req);
+
+ /* Following specs, it would be better either to send FIN
+ * (and enter FIN-WAIT-1, it is normal close)
+ * or to send active reset (abort).
+ * Certainly, it is pretty dangerous while synflood, but it is
+ * bad justification for our negligence 8)
+ * To be honest, we are not able to make either
+ * of the variants now. --ANK
+ */
+ }
+ }
+ }
+ BUG_TRAP(lopt->qlen == 0);
+
+ kfree(lopt);
+
+ while ((req=acc_req) != NULL) {
+ struct sock *child = req->sk;
+
+ acc_req = req->dl_next;
+
+ local_bh_disable();
+ bh_lock_sock(child);
+ BUG_TRAP(child->lock.users==0);
+ sock_hold(child);
+
+ tcp_disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ atomic_inc(&tcp_orphan_count);
+
+ tcp_destroy_sock(child);
+
+ bh_unlock_sock(child);
+ local_bh_enable();
+ sock_put(child);
+
+ tcp_acceptq_removed(sk);
+ tcp_openreq_fastfree(req);
+ }
+ BUG_TRAP(sk->ack_backlog == 0);
+}
+
/*
* Wait for a socket to get into the connected state
*
* Note: Must be called with the socket locked.
*/
-static int wait_for_tcp_connect(struct sock * sk, int flags)
+static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
@@ -684,7 +738,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags)
send_sig(SIGPIPE, tsk, 0);
return -EPIPE;
}
- if(flags & MSG_DONTWAIT)
+ if(!*timeo_p)
return -EAGAIN;
if(signal_pending(tsk))
return -ERESTARTSYS;
@@ -694,7 +748,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags)
sk->tp_pinfo.af_tcp.write_pending++;
release_sock(sk);
- schedule();
+ *timeo_p = schedule_timeout(*timeo_p);
lock_sock(sk);
__set_task_state(tsk, TASK_RUNNING);
@@ -712,7 +766,7 @@ static inline int tcp_memory_free(struct sock *sk)
/*
* Wait for more memory for a socket
*/
-static void wait_for_tcp_memory(struct sock * sk)
+static long wait_for_tcp_memory(struct sock * sk, long timeo)
{
if (!tcp_memory_free(sk)) {
DECLARE_WAITQUEUE(wait, current);
@@ -732,12 +786,13 @@ static void wait_for_tcp_memory(struct sock * sk)
break;
release_sock(sk);
if (!tcp_memory_free(sk))
- schedule();
+ timeo = schedule_timeout(timeo);
lock_sock(sk);
}
current->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
}
+ return timeo;
}
/* When all user supplied data has been queued set the PSH bit */
@@ -746,11 +801,9 @@ static void wait_for_tcp_memory(struct sock * sk)
/*
* This routine copies from a user buffer into a socket,
* and starts the transmit system.
- *
- * Note: must be called with the socket locked.
*/
-int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
{
struct iovec *iov;
struct tcp_opt *tp;
@@ -758,15 +811,22 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
int iovlen, flags;
int mss_now;
int err, copied;
+ long timeo;
err = 0;
tp = &(sk->tp_pinfo.af_tcp);
- /* Wait for a connection to finish. */
+ lock_sock(sk);
+ TCP_CHECK_TIMER(sk);
+
flags = msg->msg_flags;
+
+ timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
+
+ /* Wait for a connection to finish. */
if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
- if((err = wait_for_tcp_connect(sk, flags)) != 0)
- goto out;
+ if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
+ goto out_unlock;
/* This should be in poll */
sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
@@ -777,7 +837,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
iovlen = msg->msg_iovlen;
iov = msg->msg_iov;
copied = 0;
-
+
while(--iovlen >= 0) {
int seglen=iov->iov_len;
unsigned char * from=iov->iov_base;
@@ -785,7 +845,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
iov++;
while(seglen > 0) {
- int copy, tmp, queue_it, psh;
+ int copy, tmp, queue_it;
if (err)
goto do_fault2;
@@ -811,8 +871,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
* welcome.
*/
if (skb_tailroom(skb) > 0 &&
- (mss_now - copy) > 0 &&
- tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
+ (mss_now - copy) > 0) {
int last_byte_was_odd = (copy % 4);
copy = mss_now - copy;
@@ -855,34 +914,17 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
}
}
- /* We also need to worry about the window. If
- * window < 1/2 the maximum window we've seen
- * from this host, don't use it. This is
- * sender side silly window prevention, as
- * specified in RFC1122. (Note that this is
- * different than earlier versions of SWS
- * prevention, e.g. RFC813.). What we
- * actually do is use the whole MSS. Since
- * the results in the right edge of the packet
- * being outside the window, it will be queued
- * for later rather than sent.
+ /* A chunk was here doing something strange
+ * with psh etc. It is deleted, because it was
+ * evident non-sense. --ANK
*/
- psh = 0;
- copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
- if(copy > (tp->max_window >> 1)) {
- copy = min(copy, mss_now);
- psh = 1;
- } else {
- copy = mss_now;
- }
- if(copy > seglen)
- copy = seglen;
+
+ copy = min(seglen, mss_now);
/* Determine how large of a buffer to allocate. */
- tmp = MAX_HEADER + sk->prot->max_header;
- if (copy < min(mss_now, tp->max_window >> 1) &&
- !(flags & MSG_OOB)) {
- tmp += min(mss_now, tp->max_window);
+ tmp = MAX_TCP_HEADER + 15;
+ if (copy < mss_now && !(flags & MSG_OOB)) {
+ tmp += mss_now;
/* What is happening here is that we want to
* tack on later members of the users iovec
@@ -901,7 +943,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
/* If we didn't get any memory, we need to sleep. */
if (skb == NULL) {
sk->socket->flags |= SO_NOSPACE;
- if (flags&MSG_DONTWAIT) {
+ if (!timeo) {
err = -EAGAIN;
goto do_interrupted;
}
@@ -909,8 +951,8 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
err = -ERESTARTSYS;
goto do_interrupted;
}
- tcp_push_pending_frames(sk, tp);
- wait_for_tcp_memory(sk);
+ __tcp_push_pending_frames(sk, tp, mss_now);
+ timeo = wait_for_tcp_memory(sk, timeo);
/* If SACK's were formed or PMTU events happened,
* we must find out about it.
@@ -923,7 +965,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
/* Prepare control bits for TCP header creation engine. */
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
- ((PSH_NEEDED || psh) ?
+ ((PSH_NEEDED) ?
TCPCB_FLAG_PSH : 0));
TCP_SKB_CB(skb)->sacked = 0;
if (flags & MSG_OOB) {
@@ -936,7 +978,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
* TCP+IP+DEV headers are SKB_PUSH()'d beneath.
* Reserve header space and checksum the data.
*/
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = csum_and_copy_from_user(from,
skb_put(skb, copy), copy, 0, &err);
@@ -950,7 +992,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
/* This advances tp->write_seq for us. */
- tcp_send_skb(sk, skb, queue_it);
+ tcp_send_skb(sk, skb, queue_it, mss_now);
}
}
sk->err = 0;
@@ -981,63 +1023,39 @@ do_fault:
do_fault2:
err = -EFAULT;
out:
- tcp_push_pending_frames(sk, tp);
+ __tcp_push_pending_frames(sk, tp, mss_now);
+ TCP_CHECK_TIMER(sk);
+out_unlock:
+ release_sock(sk);
return err;
}
#undef PSH_NEEDED
/*
- * Send an ack if one is backlogged at this point. Ought to merge
- * this with tcp_send_ack().
- * This is called for delayed acks also.
- */
-
-void tcp_read_wakeup(struct sock *sk)
-{
- /* If we're closed, don't send an ack, or we'll get a RST
- * from the closed destination.
- */
- if (sk->state != TCP_CLOSE)
- tcp_send_ack(sk);
-}
-
-/*
* Handle reading urgent data. BSD has very simple semantics for
* this, no blocking and very strange errors 8)
*/
-static int tcp_recv_urg(struct sock * sk, int nonblock,
+static int tcp_recv_urg(struct sock * sk, long timeo,
struct msghdr *msg, int len, int flags,
int *addr_len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* No URG data to read. */
- if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
+ if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
return -EINVAL; /* Yes this is right ! */
if (sk->done)
return -ENOTCONN;
- if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
- sk->done = 1;
- return 0;
- }
-
- if (tp->urg_data & URG_VALID) {
+ if (tp->urg_data & TCP_URG_VALID) {
int err = 0;
char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- tp->urg_data = URG_READ;
-
- if(msg->msg_name)
- tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
- msg->msg_name);
-
- if(addr_len)
- *addr_len = tp->af_specific->sockaddr_len;
+ tp->urg_data = TCP_URG_READ;
/* Read urgent data. */
msg->msg_flags|=MSG_OOB;
@@ -1051,6 +1069,10 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
return err ? -EFAULT : len;
}
+ /* Do not set sk->done, it is set only by normal data receive */
+ if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
+ return 0;
+
/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
* the available implementations agree in this case:
* this call should never block, independent of the
@@ -1069,6 +1091,8 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
__skb_unlink(skb, &sk->receive_queue);
+ BUG_TRAP(atomic_read(&skb->users) == 1);
+ /* Well, if I missed something then punishment will be terrible oops. */
__kfree_skb(skb);
}
@@ -1080,22 +1104,34 @@ static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
*/
static void cleanup_rbuf(struct sock *sk, int copied)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
+ int time_to_ack;
/* NOTE! The socket must be locked, so that we don't get
* a messed-up receive queue.
*/
while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
- if (!skb->used || atomic_read(&skb->users) > 1)
+ if (!skb->used)
break;
tcp_eat_skb(sk, skb);
}
+ /* Delayed ACKs frequently hit locked sockets during bulk receive. */
+ time_to_ack = tp->ack.blocked && tp->ack.pending;
+#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
+ if (tp->ack.pending &&
+ (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss)
+ time_to_ack = 1;
+#endif
+
/* We send an ACK if we can now advertise a non-zero window
* which has been raised "significantly".
+ *
+ * Even if window raised up to infinity, do not send window open ACK
+ * in states, where we will not receive more. It is useless.
*/
- if(copied > 0) {
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
__u32 rcv_window_now = tcp_receive_window(tp);
__u32 new_window = __tcp_select_window(sk);
@@ -1106,16 +1142,20 @@ static void cleanup_rbuf(struct sock *sk, int copied)
* which don't advertize a larger window.
*/
if((new_window && (new_window >= rcv_window_now * 2)) &&
- ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
- tcp_read_wakeup(sk);
+ ((rcv_window_now + tp->ack.rcv_mss) <= tp->window_clamp))
+ time_to_ack = 1;
}
+ if (time_to_ack)
+ tcp_send_ack(sk);
}
/* Now socket state including sk->err is changed only under lock,
- hence we should check only pending signals.
+ * hence we may omit checks after joining wait queue.
+ * We check receive queue before schedule() only as optimization;
+ * it is very likely that release_sock() added new data.
*/
-static void tcp_data_wait(struct sock *sk)
+static long tcp_data_wait(struct sock *sk, long timeo)
{
DECLARE_WAITQUEUE(wait, current);
@@ -1127,17 +1167,39 @@ static void tcp_data_wait(struct sock *sk)
release_sock(sk);
if (skb_queue_empty(&sk->receive_queue))
- schedule();
+ timeo = schedule_timeout(timeo);
lock_sock(sk);
sk->socket->flags &= ~SO_WAITDATA;
remove_wait_queue(sk->sleep, &wait);
__set_current_state(TASK_RUNNING);
+ return timeo;
+}
+
+static void tcp_prequeue_process(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
+
+ /* RX process wants to run with disabled BHs, though it is not necessary */
+ local_bh_disable();
+ while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+ sk->backlog_rcv(sk, skb);
+ local_bh_enable();
+
+ /* Clear memory counter. */
+ tp->ucopy.memory = 0;
}
/*
* This routine copies from a sock struct into the user buffer.
+ *
+ * Technical note: in 2.3 we work on _locked_ socket, so that
+ * tricks with *seq access order and skb->users are not required.
+ * Probably, code can be easily improved even more.
*/
int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
@@ -1146,13 +1208,18 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int copied = 0;
u32 peek_seq;
- volatile u32 *seq; /* So gcc doesn't overoptimise */
+ u32 *seq;
unsigned long used;
int err;
- int target = 1; /* Read at least this many bytes */
+ int target; /* Read at least this many bytes */
+ long timeo;
+ struct task_struct *user_recv = NULL;
lock_sock(sk);
+ TCP_CHECK_TIMER(sk);
+
+
if (sk->err)
goto out_err;
@@ -1160,24 +1227,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (sk->state == TCP_LISTEN)
goto out;
+ timeo = sock_rcvtimeo(sk, nonblock);
+
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
goto recv_urg;
- /* Copying sequence to update. This is volatile to handle
- * the multi-reader case neatly (memcpy_to/fromfs might be
- * inline and thus not flush cached variables otherwise).
- */
- peek_seq = tp->copied_seq;
seq = &tp->copied_seq;
- if (flags & MSG_PEEK)
+ if (flags & MSG_PEEK) {
+ peek_seq = tp->copied_seq;
seq = &peek_seq;
+ }
- /* Handle the POSIX bogosity MSG_WAITALL. */
- if (flags & MSG_WAITALL)
- target=len;
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
-
/*
* BUG BUG BUG
* This violates 1003.1g compliance. We must wait for
@@ -1200,7 +1263,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (copied)
break;
copied = -ERESTARTSYS;
- if (nonblock)
+ if (!timeo)
copied = -EAGAIN;
break;
}
@@ -1232,47 +1295,128 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
skb = skb->next;
} while (skb != (struct sk_buff *)&sk->receive_queue);
- if (copied >= target)
+ /* Well, if we have backlog, try to process it now yet. */
+
+ if (copied >= target && sk->backlog.tail == NULL)
break;
- if (sk->err && !(flags&MSG_PEEK)) {
- if (!copied)
+ if (copied) {
+ if (sk->err ||
+ sk->state == TCP_CLOSE ||
+ (sk->shutdown & RCV_SHUTDOWN) ||
+ !timeo)
+ break;
+ } else {
+ if (sk->err) {
copied = sock_error(sk);
- break;
- }
+ break;
+ }
- if (sk->shutdown & RCV_SHUTDOWN) {
- sk->done = 1;
- break;
- }
+ if (sk->done) {
+ copied = -ENOTCONN;
+ break;
+ }
- if (sk->state == TCP_CLOSE) {
- if (!sk->done) {
- sk->done = 1;
+ if (sk->state == TCP_CLOSE) {
+ if (!(flags&MSG_PEEK))
+ sk->done = 1;
break;
}
- if (!copied)
- copied = -ENOTCONN;
- break;
- }
- if (nonblock) {
- copied = -EAGAIN;
- break;
+ if (sk->shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
}
cleanup_rbuf(sk, copied);
- tcp_data_wait(sk);
+
+ if (tp->ucopy.task == user_recv) {
+ /* Install new reader */
+ if (user_recv == NULL && !(flags&MSG_PEEK)) {
+ user_recv = current;
+ tp->ucopy.task = user_recv;
+ tp->ucopy.iov = msg->msg_iov;
+ }
+
+ tp->ucopy.len = len;
+
+ BUG_TRAP(tp->copied_seq == tp->rcv_nxt);
+
+ /* Ugly... If prequeue is not empty, we have to
+ * process it before releasing socket, otherwise
+ * order will be broken at second iteration.
+ * More elegant solution is required!!!
+ *
+ * Look: we have the following (pseudo)queues:
+ *
+ * 1. packets in flight
+ * 2. backlog
+ * 3. prequeue
+ * 4. receive_queue
+ *
+ * Each queue can be processed only if the next ones
+ * are empty. At this point we have empty receive_queue.
+ * But prequeue _can_ be not empty after second iteration,
+ * when we jumped to start of loop because backlog
+ * processing added something to receive_queue.
+ * We cannot release_sock(), because backlog contains
+ * packets arrived _after_ prequeued ones.
+ *
+ * Shortly, algorithm is clear --- to process all
+ * the queues in order. We could make it more directly,
+ * requeueing packets from backlog to prequeue, if
+ * is not empty. It is more elegant, but eats cycles,
+ * unfortunately.
+ */
+ if (skb_queue_len(&tp->ucopy.prequeue))
+ goto do_prequeue;
+
+ /* __ Set realtime policy in scheduler __ */
+ }
+
+ if (copied >= target) {
+ /* Do not sleep, just process backlog. */
+ release_sock(sk);
+ lock_sock(sk);
+ } else {
+ timeo = tcp_data_wait(sk, timeo);
+ }
+
+ if (user_recv) {
+ int chunk;
+
+ /* __ Restore normal policy in scheduler __ */
+
+ if ((chunk = len - tp->ucopy.len) != 0) {
+ net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
+ len -= chunk;
+ copied += chunk;
+ }
+
+ if (tp->rcv_nxt == tp->copied_seq &&
+ skb_queue_len(&tp->ucopy.prequeue)) {
+do_prequeue:
+ tcp_prequeue_process(sk);
+
+ if ((chunk = len - tp->ucopy.len) != 0) {
+ net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
+ len -= chunk;
+ copied += chunk;
+ }
+ }
+#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
+ if (tp->ack.pending &&
+ (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss)
+ tcp_send_ack(sk);
+#endif
+ }
continue;
found_ok_skb:
- /* Lock the buffer. We can be fairly relaxed as
- * an interrupt will never steal a buffer we are
- * using unless I've missed something serious in
- * tcp_data.
- */
- atomic_inc(&skb->users);
-
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
@@ -1293,36 +1437,28 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
}
}
- /* Copy it - We _MUST_ update *seq first so that we
- * don't ever double read when we have dual readers
- */
- *seq += used;
-
- /* This memcpy_toiovec can sleep. If it sleeps and we
- * do a second read it relies on the skb->users to avoid
- * a crash when cleanup_rbuf() gets called.
- */
err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
if (err) {
/* Exception. Bailout! */
- atomic_dec(&skb->users);
- copied = -EFAULT;
+ if (!copied)
+ copied = -EFAULT;
break;
}
+ *seq += used;
copied += used;
len -= used;
- /* We now will not sleep again until we are finished
- * with skb. Sorry if you are doing the SMP port
- * but you'll just have to fix it neatly ;)
- *
- * Very funny Alan... -DaveM
- */
- atomic_dec(&skb->users);
-
- if (after(tp->copied_seq,tp->urg_seq))
+ if (after(tp->copied_seq,tp->urg_seq)) {
tp->urg_data = 0;
+ if (skb_queue_len(&tp->out_of_order_queue) == 0
+#ifdef TCP_FORMAL_WINDOW
+ && tcp_receive_window(tp)
+#endif
+ ) {
+ tcp_fast_path_on(tp);
+ }
+ }
if (used + offset < skb->len)
continue;
@@ -1334,8 +1470,30 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (flags & MSG_PEEK)
continue;
skb->used = 1;
- if (atomic_read(&skb->users) == 1)
- tcp_eat_skb(sk, skb);
+ tcp_eat_skb(sk, skb);
+
+#ifdef CONFIG_TCP_LESS_COARSE_ACKS
+ /* Possible improvement. When sender is faster than receiver,
+ * traffic looks like: fill window ... wait for window open ...
+	 * fill window. We lose at least one rtt, because we call
+	 * cleanup_rbuf() only once. Probably, if "len" is large,
+	 * we should insert several intermediate cleanup_rbuf()s.
+ *
+ * F.e.:
+ */
+ do {
+ u32 full_space = min(tp->window_clamp, tcp_full_space(sk));
+
+			/* Try to ACK if the total buffer length is larger
+			   than the maximal window and if rcv_window has
+			   a chance to at least double. This results in
+			   exponentially decreasing ACKing during a
+			   read into a huge (usually, mmapped) buffer.
+ */
+ if (len >= full_space && tp->rcv_wnd <= full_space/2)
+ cleanup_rbuf(sk, copied);
+ } while (0);
+#endif
continue;
found_fin_ok:
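The CONFIG_TCP_LESS_COARSE_ACKS block above only issues an intermediate ACK when the remaining read covers the whole receive space and the advertised window still has room to at least double. A hedged standalone restatement of that test, with full_space and rcv_wnd passed in directly rather than derived from a socket:

#include <stdio.h>

/* Sketch of the intermediate-ACK test from the hunk above:
 * ACK mid-read only if the remaining read length covers the whole
 * receive space and the current window could at least double.
 */
static int want_intermediate_ack(unsigned int len,
                                 unsigned int full_space,
                                 unsigned int rcv_wnd)
{
    return len >= full_space && rcv_wnd <= full_space / 2;
}

int main(void)
{
    printf("%d\n", want_intermediate_ack(65536, 32768, 8192)); /* 1 */
    printf("%d\n", want_intermediate_ack(4096, 32768, 8192));  /* 0 */
    return 0;
}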
@@ -1345,19 +1503,36 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
/* All is done. */
skb->used = 1;
- sk->shutdown |= RCV_SHUTDOWN;
break;
}
- if (copied >= 0 && msg->msg_name)
- tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
- msg->msg_name);
+ if (user_recv) {
+ if (skb_queue_len(&tp->ucopy.prequeue)) {
+ int chunk;
+
+ tp->ucopy.len = copied > 0 ? len : 0;
+
+ tcp_prequeue_process(sk);
+
+ if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
+ net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
+ len -= chunk;
+ copied += chunk;
+ }
+ }
- if(addr_len)
- *addr_len = tp->af_specific->sockaddr_len;
+ tp->ucopy.task = NULL;
+ tp->ucopy.len = 0;
+ }
+
+	/* According to UNIX98, msg_name/msg_namelen are ignored
+	 * on a connected socket. I was just happy when I found this 8) --ANK
+ */
/* Clean up data we have read: This will do ACK frames. */
cleanup_rbuf(sk, copied);
+
+ TCP_CHECK_TIMER(sk);
release_sock(sk);
return copied;
@@ -1365,24 +1540,16 @@ out_err:
err = sock_error(sk);
out:
+ TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
recv_urg:
- err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
+ err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
goto out;
}
/*
- * Check whether to renew the timer.
- */
-static inline void tcp_check_fin_timer(struct sock *sk)
-{
- if (sk->state == TCP_FIN_WAIT2)
- tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
-}
-
-/*
* State processing on a close. This implements the state shift for
* sending our FIN frame. Note that we only send a FIN for some
* states. A shutdown() may have already sent the FIN, or we may be
@@ -1405,24 +1572,13 @@ static unsigned char new_state[16] = {
/* TCP_CLOSING */ TCP_CLOSING,
};
-static int tcp_close_state(struct sock *sk, int dead)
+static int tcp_close_state(struct sock *sk)
{
int next = (int) new_state[sk->state];
int ns = (next & TCP_STATE_MASK);
tcp_set_state(sk, ns);
- /* This is a (useful) BSD violating of the RFC. There is a
- * problem with TCP as specified in that the other end could
- * keep a socket open forever with no application left this end.
- * We use a 3 minute timeout (about the same as BSD) then kill
- * our end. If they send after that then tough - BUT: long enough
- * that we won't make the old 4*rto = almost no time - whoops
- * reset mistake.
- */
- if (dead)
- tcp_check_fin_timer(sk);
-
return (next & TCP_ACTION_FIN);
}
@@ -1443,9 +1599,8 @@ void tcp_shutdown(struct sock *sk, int how)
/* If we've already sent a FIN, or it's a closed state, skip this. */
if ((1 << sk->state) &
(TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
-
/* Clear out any half completed packets. FIN if needed. */
- if (tcp_close_state(sk,0))
+ if (tcp_close_state(sk))
tcp_send_fin(sk);
}
}
@@ -1460,40 +1615,6 @@ static inline int closing(struct sock * sk)
return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
}
-/*
- * This routine closes sockets which have been at least partially
- * opened, but not yet accepted. Currently it is only called by
- * tcp_close.
- */
-
-static void tcp_close_pending (struct sock *sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct open_request *req = tp->syn_wait_queue;
-
- while(req) {
- struct open_request *iter;
-
- if (req->sk)
- tcp_close(req->sk, 0);
-
- iter = req;
- req = req->dl_next;
-
- if (iter->sk) {
- sk->ack_backlog--;
- } else {
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- tp->syn_backlog--;
- }
- (*iter->class->destructor)(iter);
- tcp_openreq_free(iter);
- }
- BUG_TRAP(tp->syn_backlog == 0);
- BUG_TRAP(sk->ack_backlog == 0);
- tcp_synq_init(tp);
-}
-
static __inline__ void tcp_kill_sk_queues(struct sock *sk)
{
/* First the read buffer. */
@@ -1528,6 +1649,14 @@ void tcp_destroy_sock(struct sock *sk)
/* It it has not 0 sk->num, it must be bound */
BUG_TRAP(!sk->num || sk->prev!=NULL);
+#ifdef TCP_DEBUG
+ if (sk->zapped) {
+ printk("TCP: double destroy sk=%p\n", sk);
+ sock_hold(sk);
+ }
+ sk->zapped = 1;
+#endif
+
sk->prot->destroy(sk);
tcp_kill_sk_queues(sk);
@@ -1538,6 +1667,7 @@ void tcp_destroy_sock(struct sock *sk)
}
#endif
+ atomic_dec(&tcp_orphan_count);
sock_put(sk);
}
@@ -1547,17 +1677,17 @@ void tcp_close(struct sock *sk, long timeout)
int data_was_unread = 0;
lock_sock(sk);
+ sk->shutdown = SHUTDOWN_MASK;
+
if(sk->state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
/* Special case. */
- tcp_close_pending(sk);
+ tcp_listen_stop(sk);
goto adjudge_to_death;
}
- sk->shutdown = SHUTDOWN_MASK;
-
/* We need to flush the recv. buffs. We do this only on the
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
@@ -1581,10 +1711,35 @@ void tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_KERNEL);
- } else if (tcp_close_state(sk,1)) {
+ } else if (sk->linger && sk->lingertime==0) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->prot->disconnect(sk, 0);
+ } else if (tcp_close_state(sk)) {
/* We FIN if the application ate all the data before
* zapping the connection.
*/
+
+ /* RED-PEN. Formally speaking, we have broken TCP state
+ * machine. State transitions:
+ *
+ * TCP_ESTABLISHED -> TCP_FIN_WAIT1
+ * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
+ * TCP_CLOSE_WAIT -> TCP_LAST_ACK
+ *
+ * are legal only when FIN has been sent (i.e. in window),
+ * rather than queued out of window. Purists blame.
+ *
+ * F.e. "RFC state" is ESTABLISHED,
+ * if Linux state is FIN-WAIT-1, but FIN is still not sent.
+ *
+	 * The visible deviations are that sometimes
+	 * we enter the time-wait state when it is not really required
+	 * (harmless), and do not send active resets when they are
+	 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
+	 * they look like CLOSING or LAST_ACK to Linux).
+	 * Probably, I have missed some more small holes.
+ * --ANK
+ */
tcp_send_fin(sk);
}
@@ -1594,26 +1749,19 @@ void tcp_close(struct sock *sk, long timeout)
add_wait_queue(sk->sleep, &wait);
- while (1) {
+ do {
set_current_state(TASK_INTERRUPTIBLE);
if (!closing(sk))
break;
release_sock(sk);
timeout = schedule_timeout(timeout);
lock_sock(sk);
- if (!signal_pending(tsk) || timeout)
- break;
- }
+ } while (!signal_pending(tsk) && timeout);
tsk->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
}
- /* Now that the socket is dead, if we are in the FIN_WAIT2 state
- * we may need to set up a timer.
- */
- tcp_check_fin_timer(sk);
-
adjudge_to_death:
/* It is the last release_sock in its life. It will remove backlog. */
release_sock(sk);
@@ -1627,23 +1775,67 @@ adjudge_to_death:
BUG_TRAP(sk->lock.users==0);
sock_hold(sk);
+ sock_orphan(sk);
+
+	/* This is a (useful) BSD violation of the RFC. There is a
+	 * problem with TCP as specified in that the other end could
+	 * keep a socket open forever with no application left at this end.
+ * We use a 3 minute timeout (about the same as BSD) then kill
+ * our end. If they send after that then tough - BUT: long enough
+ * that we won't make the old 4*rto = almost no time - whoops
+ * reset mistake.
+ *
+	 * Nope, it was not a mistake. It is really the desired behaviour,
+	 * e.g. on HTTP servers, where such sockets are useless but
+	 * consume significant resources. Let's do it with a special
+ * linger2 option. --ANK
+ */
+
+ if (sk->state == TCP_FIN_WAIT2) {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ if (tp->linger2 < 0) {
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ } else {
+ int tmo = tcp_fin_time(tp);
- /* Announce socket dead, detach it from wait queue and inode. */
- write_lock_irq(&sk->callback_lock);
- sk->dead = 1;
- sk->socket = NULL;
- sk->sleep = NULL;
- write_unlock_irq(&sk->callback_lock);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+ } else {
+ atomic_inc(&tcp_orphan_count);
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ }
+ if (sk->state != TCP_CLOSE &&
+ atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) {
+ if (net_ratelimit())
+ printk(KERN_INFO "TCP: too many of orphaned sockets\n");
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ }
+ atomic_inc(&tcp_orphan_count);
if (sk->state == TCP_CLOSE)
tcp_destroy_sock(sk);
/* Otherwise, socket is reprieved until protocol close. */
+out:
bh_unlock_sock(sk);
local_bh_enable();
sock_put(sk);
}
+/* These states need RST on ABORT according to RFC793 */
+
+extern __inline__ int tcp_need_reset(int state)
+{
+ return ((1 << state) &
+ (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
+ TCPF_FIN_WAIT2|TCPF_SYN_RECV));
+}
+
int tcp_disconnect(struct sock *sk, int flags)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
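The FIN-WAIT-2 branch added to tcp_close() above chooses between an immediate reset (linger2 < 0), keeping the socket on the keepalive timer (FIN timeout longer than the time-wait period), and moving it into a FIN-WAIT-2 flavoured time-wait bucket; the orphan limit is applied afterwards. A rough decision-table sketch; TCP_TIMEWAIT_LEN_SECS and the fin-time argument are stand-ins, not the kernel's values:

#include <stdio.h>

#define TCP_TIMEWAIT_LEN_SECS 60   /* stand-in for the kernel constant */

enum fw2_action { FW2_RESET, FW2_KEEPALIVE_TIMER, FW2_TIME_WAIT };

/* Sketch of the FIN-WAIT-2 decision in tcp_close() above:
 * linger2 < 0            -> close and send an active reset
 * fin timeout > TW len   -> keep the socket, arm the keepalive timer
 * otherwise              -> move to a FIN-WAIT-2 flavoured time-wait bucket
 */
static enum fw2_action fin_wait2_action(int linger2, int fin_time_secs)
{
    if (linger2 < 0)
        return FW2_RESET;
    if (fin_time_secs > TCP_TIMEWAIT_LEN_SECS)
        return FW2_KEEPALIVE_TIMER;
    return FW2_TIME_WAIT;
}

int main(void)
{
    printf("%d %d %d\n",
           fin_wait2_action(-1, 10),   /* FW2_RESET */
           fin_wait2_action(120, 90),  /* FW2_KEEPALIVE_TIMER */
           fin_wait2_action(30, 30));  /* FW2_TIME_WAIT */
    return 0;
}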
@@ -1656,9 +1848,14 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
- tcp_close_pending(sk);
- } else if (tcp_connected(old_state)) {
- tcp_send_active_reset(sk, GFP_KERNEL);
+ tcp_listen_stop(sk);
+ } else if (tcp_need_reset(old_state) ||
+ (tp->snd_nxt != tp->write_seq &&
+ (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
+		/* The last check adjusts for the discrepancy between Linux and the RFC
+		 * states.
+ */
+ tcp_send_active_reset(sk, gfp_any());
sk->err = ECONNRESET;
} else if (old_state == TCP_SYN_SENT)
sk->err = ECONNRESET;
@@ -1677,26 +1874,25 @@ int tcp_disconnect(struct sock *sk, int flags)
memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
#endif
- sk->zapped = 0;
sk->shutdown = 0;
sk->done = 0;
sk->write_space = tcp_write_space;
tp->srtt = 0;
-#ifdef CONFIG_TCP_TW_RECYCLE
- if ((tp->write_seq += 2) == 0)
- tp->write_seq = 1;
-#else
- tp->write_seq = 0;
-#endif
- tp->ato = 0;
+ if (sysctl_tcp_tw_recycle) {
+ if ((tp->write_seq += 2) == 0)
+ tp->write_seq = 1;
+ } else {
+ tp->write_seq = 0;
+ }
tp->backoff = 0;
tp->snd_cwnd = 2;
tp->probes_out = 0;
+ tp->packets_out = 0;
tp->high_seq = 0;
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0;
tp->dup_acks = 0;
- tp->delayed_acks = 0;
+ tcp_delack_init(tp);
tp->send_head = tp->retrans_head = NULL;
tp->saw_tstamp = 0;
__sk_dst_reset(sk);
@@ -1712,11 +1908,10 @@ int tcp_disconnect(struct sock *sk, int flags)
* conditions. This must be called with the socket locked,
* and without the kernel lock held.
*/
-static struct open_request * wait_for_connect(struct sock * sk,
- struct open_request **pprev)
+static int wait_for_connect(struct sock * sk, long timeo)
{
DECLARE_WAITQUEUE(wait, current);
- struct open_request *req;
+ int err;
/*
* True wake-one mechanism for incoming connections: only
@@ -1736,17 +1931,25 @@ static struct open_request * wait_for_connect(struct sock * sk,
for (;;) {
current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
release_sock(sk);
- schedule();
+ if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
+ timeo = schedule_timeout(timeo);
lock_sock(sk);
- req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
- if (req)
+ err = 0;
+ if (sk->tp_pinfo.af_tcp.accept_queue)
+ break;
+ err = -EINVAL;
+ if (sk->state != TCP_LISTEN)
break;
+ err = -ERESTARTSYS;
if (signal_pending(current))
break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
}
current->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
- return req;
+ return err;
}
/*
@@ -1758,9 +1961,10 @@ static struct open_request * wait_for_connect(struct sock * sk,
struct sock *tcp_accept(struct sock *sk, int flags, int *err)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct open_request *req, *prev;
+ struct open_request *req;
struct sock *newsk;
int error;
+ long timeo;
lock_sock(sk);
@@ -1771,25 +1975,27 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err)
if (sk->state != TCP_LISTEN)
goto out;
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
/* Find already established connection */
- req = tcp_find_established(tp, &prev);
- if (!req) {
+ if (!tp->accept_queue) {
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
- if (flags & O_NONBLOCK)
+ if (!timeo)
goto out;
- error = -ERESTARTSYS;
- req = wait_for_connect(sk, &prev);
- if (!req)
+ error = wait_for_connect(sk, timeo);
+ if (error)
goto out;
}
- tcp_synq_unlink(tp, req, prev);
- newsk = req->sk;
- req->class->destructor(req);
- tcp_openreq_free(req);
- sk->ack_backlog--;
+ req = tp->accept_queue;
+ tp->accept_queue = req->dl_next;
+
+ newsk = req->sk;
+ tcp_acceptq_removed(sk);
+ tcp_openreq_fastfree(req);
+ BUG_TRAP(newsk->state != TCP_SYN_RECV);
release_sock(sk);
return newsk;
@@ -1828,7 +2034,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
* the point when this call is done we typically don't yet know
* which interface is going to be used
*/
- if(val < 1 || val > MAX_WINDOW) {
+ if(val < 8 || val > MAX_TCP_WINDOW) {
err = -EINVAL;
break;
}
@@ -1839,11 +2045,11 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
/* You cannot try to use this and TCP_CORK in
* tandem, so let the user know.
*/
- if (sk->nonagle == 2) {
+ if (tp->nonagle == 2) {
err = -EINVAL;
break;
}
- sk->nonagle = (val == 0) ? 0 : 1;
+ tp->nonagle = (val == 0) ? 0 : 1;
break;
case TCP_CORK:
@@ -1858,14 +2064,14 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
* You cannot try to use TCP_NODELAY and this mechanism
* at the same time, so let the user know.
*/
- if (sk->nonagle == 1) {
+ if (tp->nonagle == 1) {
err = -EINVAL;
break;
}
if (val != 0) {
- sk->nonagle = 2;
+ tp->nonagle = 2;
} else {
- sk->nonagle = 0;
+ tp->nonagle = 0;
tcp_push_pending_frames(sk, tp);
}
@@ -1905,6 +2111,38 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
tp->syn_retries = val;
break;
+ case TCP_LINGER2:
+ if (val < 0)
+ tp->linger2 = -1;
+ else if (val > sysctl_tcp_fin_timeout/HZ)
+ tp->linger2 = 0;
+ else
+ tp->linger2 = val*HZ;
+ break;
+
+ case TCP_DEFER_ACCEPT:
+ tp->defer_accept = 0;
+ if (val > 0) {
+ /* Translate value in seconds to number of retransmits */
+ while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
+ tp->defer_accept++;
+ tp->defer_accept++;
+ }
+ break;
+
+ case TCP_WINDOW_CLAMP:
+ if (val==0) {
+ if (sk->state != TCP_CLOSE) {
+ err = -EINVAL;
+ break;
+ }
+ tp->window_clamp = 0;
+ } else {
+ tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
+ SOCK_MIN_SNDBUF : val;
+ }
+ break;
+
default:
err = -ENOPROTOOPT;
break;
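The TCP_DEFER_ACCEPT case above converts a time in seconds into a count of exponentially backed-off SYN-ACK retransmissions, starting from TCP_TIMEOUT_INIT. A standalone sketch of that translation; the 3-second initial timeout is only an assumed illustrative value:

#include <stdio.h>

#define TIMEOUT_INIT_SECS 3   /* assumed initial retransmission timeout */

/* Sketch of the TCP_DEFER_ACCEPT translation above: find how many
 * exponentially backed-off retransmit periods are needed to cover
 * "secs", then add one more.
 */
static int defer_accept_retrans(int secs)
{
    int retrans = 0;

    if (secs <= 0)
        return 0;
    while (secs > (TIMEOUT_INIT_SECS << retrans))
        retrans++;
    return retrans + 1;
}

int main(void)
{
    printf("1s  -> %d retrans\n", defer_accept_retrans(1));   /* 1 */
    printf("5s  -> %d retrans\n", defer_accept_retrans(5));   /* 2 */
    printf("20s -> %d retrans\n", defer_accept_retrans(20));  /* 4 */
    return 0;
}

TCP_LINGER2 in the same hunk is simpler by comparison: negative values disable the limit (stored as -1), values beyond sysctl_tcp_fin_timeout clamp to 0, and anything else is stored converted to jiffies.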
@@ -1930,37 +2168,38 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
switch(optname) {
case TCP_MAXSEG:
- val = tp->user_mss;
+ val = tp->mss_cache;
+ if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
+ val = tp->user_mss;
break;
case TCP_NODELAY:
- val = (sk->nonagle == 1);
+ val = (tp->nonagle == 1);
break;
case TCP_CORK:
- val = (sk->nonagle == 2);
+ val = (tp->nonagle == 2);
break;
case TCP_KEEPIDLE:
- if (tp->keepalive_time)
- val = tp->keepalive_time / HZ;
- else
- val = sysctl_tcp_keepalive_time / HZ;
+ val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
break;
case TCP_KEEPINTVL:
- if (tp->keepalive_intvl)
- val = tp->keepalive_intvl / HZ;
- else
- val = sysctl_tcp_keepalive_intvl / HZ;
+ val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
break;
case TCP_KEEPCNT:
- if (tp->keepalive_probes)
- val = tp->keepalive_probes;
- else
- val = sysctl_tcp_keepalive_probes;
+ val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
break;
case TCP_SYNCNT:
- if (tp->syn_retries)
- val = tp->syn_retries;
- else
- val = sysctl_tcp_syn_retries;
+ val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ break;
+ case TCP_LINGER2:
+ val = tp->linger2;
+ if (val > 0)
+ val = (val ? : sysctl_tcp_fin_timeout)/HZ;
+ break;
+ case TCP_DEFER_ACCEPT:
+ val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1));
+ break;
+ case TCP_WINDOW_CLAMP:
+ val = tp->window_clamp;
break;
default:
return -ENOPROTOOPT;
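The keepalive and SYN-retry getsockopt cases above use GCC's `x ? : y` extension, which yields x when it is non-zero and y otherwise. A portable restatement for readers unfamiliar with the extension; the only difference is that the first operand is written twice here, which is harmless for these side-effect-free reads:

#include <stdio.h>

/* Portable equivalent of GCC's "a ? : b" as used in the getsockopt
 * cases above: return the per-socket value if set, else the sysctl.
 */
static int value_or_default(int per_socket, int sysctl_default)
{
    return per_socket ? per_socket : sysctl_default;
}

int main(void)
{
    printf("%d\n", value_or_default(0, 7200));   /* 7200: fall back to sysctl */
    printf("%d\n", value_or_default(600, 7200)); /* 600: per-socket value wins */
    return 0;
}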
@@ -2049,11 +2288,20 @@ void __init tcp_init(void)
tcp_bhash[i].chain = NULL;
}
+ /* Try to be a bit smarter and adjust defaults depending
+ * on available memory.
+ */
if (order > 4) {
sysctl_local_port_range[0] = 32768;
sysctl_local_port_range[1] = 61000;
+ sysctl_tcp_max_tw_buckets = 180000;
+ sysctl_tcp_max_orphans = 4096<<(order-4);
+ sysctl_max_syn_backlog = 1024;
} else if (order < 3) {
sysctl_local_port_range[0] = 1024*(3-order);
+ sysctl_tcp_max_tw_buckets >>= (3-order);
+ sysctl_tcp_max_orphans >>= (3-order);
+ sysctl_max_syn_backlog = 128;
}
tcp_port_rover = sysctl_local_port_range[0] - 1;
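The tcp_init() hunk above scales several sysctl defaults by the established-hash allocation order, which in turn tracks available memory. A sketch of the resulting values for a given order; the base numbers in the initializer stand in for the NR_FILE-derived compile-time defaults and are not the kernel's exact figures:

#include <stdio.h>

struct tcp_defaults {
    int port_lo, port_hi;
    int max_tw_buckets;
    int max_orphans;
    int max_syn_backlog;
};

/* Sketch of the order-based scaling in tcp_init() above. */
static struct tcp_defaults tcp_defaults_for_order(int order)
{
    /* Illustrative stand-ins for the compiled-in defaults. */
    struct tcp_defaults d = { 1024, 4999, 16384, 8192, 256 };

    if (order > 4) {
        d.port_lo = 32768;
        d.port_hi = 61000;
        d.max_tw_buckets = 180000;
        d.max_orphans = 4096 << (order - 4);
        d.max_syn_backlog = 1024;
    } else if (order < 3) {
        d.port_lo = 1024 * (3 - order);
        d.max_tw_buckets >>= (3 - order);
        d.max_orphans >>= (3 - order);
        d.max_syn_backlog = 128;
    }
    return d;
}

int main(void)
{
    struct tcp_defaults d = tcp_defaults_for_order(5);

    printf("ports %d-%d, tw %d, orphans %d, synq %d\n",
           d.port_lo, d.port_hi, d.max_tw_buckets,
           d.max_orphans, d.max_syn_backlog);
    return 0;
}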
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3b4ae64a2..d61a5df02 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.177 2000/01/09 02:19:39 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.183 2000/01/24 18:40:33 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -70,9 +70,6 @@
#define SYNC_INIT 1
#endif
-extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_keepalive_time;
-
/* These are on by default so the code paths get tested.
* For the final 2.2 this may be undone at our discretion. -DaveM
*/
@@ -83,10 +80,108 @@ int sysctl_tcp_sack = 1;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
int sysctl_tcp_rfc1337;
-int sysctl_tcp_tw_recycle;
+int sysctl_tcp_tw_recycle = 1;
+int sysctl_tcp_abort_on_overflow = 0;
+int sysctl_tcp_max_orphans = NR_FILE;
+int sysctl_tcp_max_tw_buckets = NR_FILE*2;
static int prune_queue(struct sock *sk);
+/*
+ * Adapt the MSS value used to make delayed ack decision to the
+ * real world.
+ *
+ * The constant 536 has no particularly good meaning. In the IPv4 world
+ * the MTU may be smaller, though that contradicts RFC1122, which
+ * states that the MSS must be at least 536.
+ * We use the constant so that we do not ACK every second
+ * packet in a stream of tiny packets.
+ * It means that super-low-MTU links will be aggressively delacked.
+ * That seems even good: if they have such a low MTU, they are weirdly
+ * slow anyway.
+ *
+ * AK: BTW it may be useful to add an option to lock the rcv_mss.
+ * this way the beowulf people wouldn't need ugly patches to get the
+ * ack frequencies they want and it would be an elegant way to tune delack.
+ */
+static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ unsigned int len, lss;
+
+ lss = tp->ack.last_seg_size;
+ tp->ack.last_seg_size = 0;
+
+ /* skb->len may jitter because of SACKs, even if peer
+ * sends good full-sized frames.
+ */
+ len = skb->len;
+ if (len >= tp->ack.rcv_mss) {
+ tp->ack.rcv_mss = len;
+ } else {
+		/* Otherwise, we make a more careful check, taking into account
+		 * that the SACK block is variable.
+		 *
+		 * "len" is the invariant segment length, including the TCP header.
+ */
+ len = skb->tail - skb->h.raw;
+ if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr)) {
+ /* Subtract also invariant (if peer is RFC compliant),
+ * tcp header plus fixed timestamp option length.
+ * Resulting "len" is MSS free of SACK jitter.
+ */
+ len -= tp->tcp_header_len;
+ if (len == lss)
+ tp->ack.rcv_mss = len;
+ tp->ack.last_seg_size = len;
+ }
+
+#if 0
+		/* Tiny-grams with PSH set artificially deflate our
+		 * ato measurement.
+		 *
+		 * Mmm... I copied this test from tcp_remember_ack(), but
+		 * I did not understand it. Is it meant to speed up a nagling sender?
+		 * It does not, because a classic (non-Minshall) sender nagles
+		 * guided by unacked frames, not depending on size.
+		 * And it does not help a NODELAY sender, because latency
+		 * is too high in any case. The only result is timer thrashing
+		 * and redundant ACKs. Grr... Seems I missed something. --ANK
+		 *
+		 * Let me comment this out for now... TCP should work
+		 * perfectly without it. --ANK
+ */
+ if (len < (tp->ack.rcv_mss >> 1) && skb->h.th->psh)
+ tp->ack.ato = TCP_ATO_MIN;
+#endif
+ }
+}
+
+
+static __inline__ void tcp_enter_quickack_mode(struct tcp_opt *tp)
+{
+ unsigned quickacks = tcp_receive_window(tp)/(2*tp->ack.rcv_mss);
+
+ tp->ack.quick = max(min(quickacks, 127), 1);
+
+ if (!tp->tstamp_ok && tp->ack.quick>2) {
+ /* Quick ACKs are _dangerous_, if RTTM is not used.
+ * See comment in tcp_init_metrics(). We still help
+ * them to overcome the most difficult, initial
+ * phase of slow start.
+ */
+ tp->ack.quick = 2;
+ }
+}
+
+/* Send ACKs quickly, if the "quick" count is not exhausted
+ * and the session is not interactive.
+ */
+
+static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp)
+{
+ return (tp->ack.quick && !tp->ack.pingpong);
+}
+
/* There is something which you must keep in mind when you analyze the
* behavior of the tp->ato delayed ack timeout interval. When a
* connection starts up, we want to ack as quickly as possible. The
@@ -97,53 +192,52 @@ static int prune_queue(struct sock *sk);
* each ACK we send, he increments snd_cwnd and transmits more of his
* queue. -DaveM
*/
-static void tcp_delack_estimator(struct tcp_opt *tp)
+static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb)
{
- if(tp->ato == 0) {
- tp->lrcvtime = tcp_time_stamp;
+ u32 now;
- /* Help sender leave slow start quickly,
- * and also makes sure we do not take this
- * branch ever again for this connection.
+ tcp_measure_rcv_mss(tp, skb);
+
+ tp->ack.pending = 1;
+
+ now = tcp_time_stamp;
+
+ if (!tp->ack.ato) {
+ /* The _first_ data packet received, initialize
+ * delayed ACK engine.
*/
- tp->ato = 1;
+
+ /* Help sender leave slow start quickly. */
tcp_enter_quickack_mode(tp);
+
+ /* Pingpong is off, session is not interactive by default */
+ tp->ack.pingpong = 0;
+
+ /* ATO is minimal */
+ tp->ack.ato = TCP_ATO_MIN;
} else {
- int m = tcp_time_stamp - tp->lrcvtime;
-
- tp->lrcvtime = tcp_time_stamp;
- if(m <= 0)
- m = 1;
- if(m > tp->rto)
- tp->ato = tp->rto;
- else {
- /* This funny shift makes sure we
- * clear the "quick ack mode" bit.
+ int m = now - tp->ack.lrcvtime;
+
+ if (m > TCP_ATO_MAX/2) {
+ /* Do not touch ATO, if interval is out of bounds.
+ * It will be deflated by delack timer, if our peer
+ * really sends too rarely.
*/
- tp->ato = ((tp->ato << 1) >> 2) + m;
+ if (m > tp->rto) {
+			/* Too long a gap. Apparently the sender failed to
+			 * restart its window, so we send ACKs quickly.
+ */
+ tcp_enter_quickack_mode(tp);
+ }
+ } else {
+ if (m <= 0)
+ m = TCP_ATO_MIN/2;
+ tp->ack.ato = (tp->ack.ato >> 1) + m;
}
}
+ tp->ack.lrcvtime = now;
}
-/*
- * Remember to send an ACK later.
- */
-static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
- struct sk_buff *skb)
-{
- tp->delayed_acks++;
-
- /* Tiny-grams with PSH set artifically deflate our
- * ato measurement, but with a lower bound.
- */
- if(th->psh && (skb->len < (tp->rcv_mss >> 1))) {
- /* Preserve the quickack state. */
- if((tp->ato & 0x7fffffff) > HZ/50)
- tp->ato = ((tp->ato & 0x80000000) |
- (HZ/50));
- }
-}
-
/* Called to compute a smoothed rtt estimate. The data fed to this
* routine either comes from timestamps, or from segments that were
* known _not_ to have been retransmitted [see Karn/Partridge
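The hunks above replace the old delayed-ACK estimator with per-socket tp->ack state: tcp_enter_quickack_mode() computes a quick-ACK budget from the receive window and receiver MSS, and tcp_event_data_recv() keeps the ACK timeout (ATO) as a coarse average of data inter-arrival gaps. A userland sketch of both calculations, in milliseconds with assumed stand-in constants:

#include <stdio.h>

#define ATO_MIN_MS 40    /* stand-ins for TCP_ATO_MIN / TCP_ATO_MAX */
#define ATO_MAX_MS 500

/* Sketch of the quick-ACK budget from tcp_enter_quickack_mode():
 * receiver-MSS segments per half receive window, clamped to [1, 127],
 * and capped at 2 when timestamps (and hence RTTM) are unavailable.
 */
static unsigned int quickack_budget(unsigned int rcv_wnd,
                                    unsigned int rcv_mss, int tstamp_ok)
{
    unsigned int quick = rcv_wnd / (2 * rcv_mss);

    if (quick > 127)
        quick = 127;
    if (quick < 1)
        quick = 1;
    if (!tstamp_ok && quick > 2)
        quick = 2;
    return quick;
}

/* Sketch of the ATO update in tcp_event_data_recv(): a coarse EWMA of
 * inter-arrival gaps.  Gaps larger than half of ATO_MAX leave the
 * estimate alone; gaps beyond the RTO also re-enter quick-ACK mode.
 */
static int ato_update(int ato, int gap_ms, int rto_ms, int *quickack)
{
    *quickack = 0;
    if (ato == 0)
        return ATO_MIN_MS;              /* first data segment seen */
    if (gap_ms > ATO_MAX_MS / 2) {
        if (gap_ms > rto_ms)
            *quickack = 1;              /* help a restarted sender */
        return ato;                     /* do not touch the estimate */
    }
    if (gap_ms <= 0)
        gap_ms = ATO_MIN_MS / 2;
    return (ato >> 1) + gap_ms;         /* EWMA with weight 1/2 */
}

int main(void)
{
    int qa, ato = 0;

    printf("quickacks: %u (ts) %u (no ts)\n",
           quickack_budget(65535, 1460, 1), quickack_budget(65535, 1460, 0));

    ato = ato_update(ato, 0, 1000, &qa);     /* -> 40 */
    ato = ato_update(ato, 10, 1000, &qa);    /* -> 30 */
    ato = ato_update(ato, 2000, 1000, &qa);  /* -> 30, quickack set */
    printf("ato=%d quickack=%d\n", ato, qa);
    return 0;
}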
@@ -209,10 +303,10 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp)
*/
static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
{
- if (tp->rto > 120*HZ)
- tp->rto = 120*HZ;
- if (tp->rto < HZ/5)
- tp->rto = HZ/5;
+ if (tp->rto < TCP_RTO_MIN)
+ tp->rto = TCP_RTO_MIN;
+ else if (tp->rto > TCP_RTO_MAX)
+ tp->rto = TCP_RTO_MAX;
}
/* Save metrics learned by this TCP session.
@@ -224,7 +318,9 @@ static void tcp_update_metrics(struct sock *sk)
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct dst_entry *dst = __sk_dst_get(sk);
- if (dst) {
+ dst_confirm(dst);
+
+ if (dst && (dst->flags&DST_HOST)) {
int m;
if (tp->backoff || !tp->srtt) {
@@ -237,8 +333,6 @@ static void tcp_update_metrics(struct sock *sk)
return;
}
- dst_confirm(dst);
-
m = dst->rtt - tp->srtt;
/* If newly calculated rtt larger than stored one,
@@ -308,10 +402,18 @@ static void tcp_init_metrics(struct sock *sk)
dst_confirm(dst);
+ if (dst->mxlock&(1<<RTAX_CWND))
+ tp->snd_cwnd_clamp = dst->cwnd;
+ if (dst->ssthresh) {
+ tp->snd_ssthresh = dst->ssthresh;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ }
+
if (dst->rtt == 0)
goto reset;
- if (!tp->srtt || !tp->saw_tstamp)
+ if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3))
goto reset;
/* Initial rtt is determined from SYN,SYN-ACK.
@@ -334,14 +436,9 @@ static void tcp_init_metrics(struct sock *sk)
tp->mdev = dst->rttvar;
tcp_set_rto(tp);
tcp_bound_rto(tp);
-
- if (dst->mxlock&(1<<RTAX_CWND))
- tp->snd_cwnd_clamp = dst->cwnd;
- if (dst->ssthresh) {
- tp->snd_ssthresh = dst->ssthresh;
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- }
+ if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp)
+ goto reset;
+ tp->snd_cwnd = tcp_init_cwnd(tp);
return;
@@ -357,9 +454,6 @@ reset:
}
}
-#define PAWS_24DAYS (60 * 60 * 24 * 24)
-
-
/* WARNING: this must not be called if tp->saw_tstamp was false. */
extern __inline__ void
tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
@@ -374,7 +468,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
*/
if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 ||
- xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) {
+ xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) {
tp->ts_recent = tp->rcv_tsval;
tp->ts_recent_stamp = xtime.tv_sec;
}
@@ -384,7 +478,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
{
return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
- xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS
+ xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
@@ -411,8 +505,13 @@ extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
{
u32 end_window = tp->rcv_wup + tp->rcv_wnd;
+#ifdef TCP_FORMAL_WINDOW
+ u32 rcv_wnd = tcp_receive_window(tp);
+#else
+ u32 rcv_wnd = tp->rcv_wnd;
+#endif
- if (tp->rcv_wnd &&
+ if (rcv_wnd &&
after(end_seq, tp->rcv_nxt) &&
before(seq, end_window))
return 1;
@@ -424,8 +523,13 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
/* This functions checks to see if the tcp header is actually acceptable. */
extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
{
+#ifdef TCP_FORMAL_WINDOW
+ u32 rcv_wnd = tcp_receive_window(tp);
+#else
+ u32 rcv_wnd = tp->rcv_wnd;
+#endif
if (seq == tp->rcv_nxt)
- return (tp->rcv_wnd || (end_seq == seq));
+ return (rcv_wnd || (end_seq == seq));
return __tcp_sequence(tp, seq, end_seq);
}
@@ -433,8 +537,6 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
/* When we get a reset we do this. */
static void tcp_reset(struct sock *sk)
{
- sk->zapped = 1;
-
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->state) {
case TCP_SYN_SENT:
@@ -447,9 +549,8 @@ static void tcp_reset(struct sock *sk)
return;
default:
sk->err = ECONNRESET;
- };
- tcp_set_state(sk, TCP_CLOSE);
- tcp_clear_xmit_timers(sk);
+ }
+
tcp_done(sk);
}
@@ -658,17 +759,18 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
tp->dup_acks++;
if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
- tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- tp->snd_cwnd = (tp->snd_ssthresh + 3);
- tp->high_seq = tp->snd_nxt;
+ __tcp_enter_cong_avoid(tp);
+ /* ... and account for 3 ACKs, which are
+ * already received to this time.
+ */
+ tp->snd_cwnd += 3;
+
if(!tp->fackets_out)
tcp_retransmit_skb(sk,
skb_peek(&sk->write_queue));
else
tcp_fack_retransmit(sk);
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
} else if (++tp->dup_acks > 3) {
/* 2. Each time another duplicate ACK arrives, increment
@@ -733,7 +835,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
if (ack != tp->snd_una && before(ack, tp->high_seq)) {
tcp_retransmit_skb(sk,
skb_peek(&sk->write_queue));
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
} else {
/* FACK style, fill any remaining holes in
@@ -752,7 +854,8 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
{
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* In "safe" area, increase. */
- tp->snd_cwnd++;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
} else {
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
@@ -826,23 +929,23 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- /* Our probe was answered. */
- tp->probes_out = 0;
-
/* Was it a usable window open? */
- /* should always be non-null */
- if (tp->send_head != NULL &&
- !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
- tp->backoff = 0;
- tp->pending = 0;
- tcp_clear_xmit_timer(sk, TIME_PROBE0);
- } else {
- tcp_reset_xmit_timer(sk, TIME_PROBE0,
- min(tp->rto << tp->backoff, 120*HZ));
+ if (tp->send_head != NULL) {
+ if (!after(TCP_SKB_CB(tp->send_head)->end_seq, ack + tp->snd_wnd)) {
+ tp->backoff = 0;
+ tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
+			/* If packets_out == 0, the socket must be woken up by
+			 * a subsequent tcp_data_snd_check(). This function is
+			 * not for random use!
+ */
+ } else if (!tp->packets_out) {
+ tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
+ min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ }
}
}
-
+
/* Should we open up the congestion window? */
static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
{
@@ -914,18 +1017,30 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
{
struct sk_buff *skb = skb_peek(&sk->write_queue);
+#ifdef TCP_DEBUG
+	/* It occurred in 2.3 because of racy timers. Namely,
+	 * the retransmit timer did not check packets_out and sometimes
+	 * retransmitted send_head and, hence, messed up the whole write_queue.
+	 * Now it is impossible, I bet. --ANK
+ */
+ if (skb == NULL) {
+ printk("Sucks! packets_out=%d, sk=%p, %d\n", tp->packets_out, sk, sk->state);
+ return;
+ }
+#endif
+
/* Some data was ACK'd, if still retransmitting (due to a
* timeout), resend more of the retransmit queue. The
* congestion window is handled properly by that code.
*/
if (tp->retransmits) {
tcp_xmit_retransmit_queue(sk);
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
} else {
__u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
if ((__s32)when < 0)
when = 1;
- tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when);
}
}
@@ -938,13 +1053,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
u32 seq = 0;
u32 seq_rtt = 0;
- if(sk->zapped)
- return(1); /* Dead, can't ack any more so why bother */
-
- if (tp->pending == TIME_KEEPOPEN)
- tp->probes_out = 0;
-
- tp->rcv_tstamp = tcp_time_stamp;
+ if(sk->state == TCP_CLOSE)
+ return 1; /* Dead, can't ack any more so why bother */
/* If the ack is newer than sent or older than previous acks
* then we can probably ignore it.
@@ -953,10 +1063,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
goto uninteresting_ack;
/* If there is data set flag 1 */
- if (len != th->doff*4) {
+ if (len != th->doff*4)
flag |= FLAG_DATA;
- tcp_delack_estimator(tp);
- }
/* Update our send window. */
@@ -970,31 +1078,53 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
flag |= FLAG_WIN_UPDATE;
- tp->snd_wnd = nwin;
+ if (tp->snd_wnd != nwin) {
+ tp->snd_wnd = nwin;
+
+ /* Note, it is the only place, where
+ * fast path is recovered for sending TCP.
+ */
+ if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
+#ifdef TCP_FORMAL_WINDOW
+ tcp_receive_window(tp) &&
+#endif
+ !tp->urg_data)
+ tcp_fast_path_on(tp);
+
+ if (nwin > tp->max_window) {
+ tp->max_window = nwin;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ }
+ }
tp->snd_wl1 = ack_seq;
tp->snd_wl2 = ack;
-
- if (nwin > tp->max_window)
- tp->max_window = nwin;
}
}
+	/* BEWARE! From this point until we return from this function,
+	 * snd_nxt and snd_wnd are out of sync. All the routines called
+	 * from here must take "ack" as an argument, or they must not depend
+	 * on the right edge of the window. It is _UGLY_. It cries out to be fixed. --ANK
+ */
+
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
sk->err_soft = 0;
+ tp->probes_out = 0;
+ tp->rcv_tstamp = tcp_time_stamp;
+
+ /* See if we can take anything off of the retransmit queue. */
+ flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
- if (tp->pending == TIME_PROBE0)
+ if (tcp_timer_is_set(sk, TCP_TIME_PROBE0))
tcp_ack_probe(sk, ack);
- /* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
-
/* We must do this here, before code below clears out important
* state contained in tp->fackets_out and tp->retransmits. -DaveM
*/
@@ -1036,7 +1166,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
if (flag & FLAG_DATA_ACKED)
tcp_ack_packets_out(sk, tp);
} else {
- tcp_clear_xmit_timer(sk, TIME_RETRANS);
+ tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
}
flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
@@ -1074,9 +1204,42 @@ uninteresting_ack:
return 0;
}
+int tcp_paws_check(struct tcp_opt *tp, int rst)
+{
+ if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0)
+ return 0;
+ if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS)
+ return 0;
+
+ /* RST segments are not recommended to carry timestamp,
+ and, if they do, it is recommended to ignore PAWS because
+ "their cleanup function should take precedence over timestamps."
+	   Certainly, it is a mistake. It is necessary to understand the reasons
+	   for this constraint in order to relax it: if the peer reboots, its clock may go
+	   out of sync and half-open connections will not be reset.
+	   Actually, the problem would not exist if all
+	   the implementations followed the draft about maintaining clocks
+	   across reboots. Linux-2.2 DOES NOT!
+
+ However, we can relax time bounds for RST segments to MSL.
+ */
+ if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL)
+ return 0;
+ return 1;
+}
+
+static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+ if (seq == s_win)
+ return 1;
+ if (after(end_seq, s_win) && before(seq, e_win))
+ return 1;
+ return (seq == e_win && seq == end_seq);
+}
+
/* New-style handling of TIME_WAIT sockets. */
-/* Must be called only from BH context. */
+/* Must be called with locally disabled BHs. */
void tcp_timewait_kill(struct tcp_tw_bucket *tw)
{
struct tcp_ehash_bucket *ehead;
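tcp_paws_check() above rejects a segment whose timestamp went backwards, unless the stored timestamp itself is stale: older than the 24-day PAWS window, or, for RST segments, older than the MSL, per the relaxation discussed in the comment. A standalone sketch using the same wrap-safe signed comparison; PAWS_MSL here is an assumed value, not the kernel's:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define PAWS_24DAYS (60 * 60 * 24 * 24)   /* seconds */
#define PAWS_MSL    60                    /* assumed MSL in seconds */

/* Sketch of tcp_paws_check() above: returns 1 if the segment should
 * be rejected by PAWS.  The (int32_t) cast makes the timestamp
 * comparison wrap-safe, just like the kernel's (s32) cast.
 */
static int paws_reject(uint32_t rcv_tsval, uint32_t ts_recent,
                       time_t ts_recent_stamp, time_t now, int rst)
{
    if ((int32_t)(rcv_tsval - ts_recent) >= 0)
        return 0;                       /* timestamp did not go back */
    if (now >= ts_recent_stamp + PAWS_24DAYS)
        return 0;                       /* stored stamp too old to trust */
    if (rst && now >= ts_recent_stamp + PAWS_MSL)
        return 0;                       /* relaxed bound for RST segments */
    return 1;
}

int main(void)
{
    time_t now = time(NULL);

    printf("%d\n", paws_reject(100, 200, now - 10, now, 0));  /* 1: reject */
    printf("%d\n", paws_reject(100, 200, now - 10, now, 1));  /* 1: recent RST */
    printf("%d\n", paws_reject(100, 200, now - 120, now, 1)); /* 0: old enough RST */
    return 0;
}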
@@ -1121,13 +1284,6 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw)
tcp_tw_put(tw);
}
-/* We come here as a special case from the AF specific TCP input processing,
- * and the SKB has no owner. Essentially handling this is very simple,
- * we just keep silently eating rx'd packets until none show up for the
- * entire timeout period. The only special cases are for BSD TIME_WAIT
- * reconnects and SYN/RST bits being set in the TCP header.
- */
-
/*
* * Main purpose of TIME-WAIT state is to close connection gracefully,
* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -1149,6 +1305,12 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw)
* The algorithm below is based on FORMAL INTERPRETATION of RFCs.
* When you compare it to RFCs, please, read section SEGMENT ARRIVES
* from the very beginning.
+ *
+ * NOTE. With recycling (and later with fin-wait-2) the TW bucket
+ * is _not_ stateless. It means that, strictly speaking, we must
+ * spinlock it. I do not want to! Well, the probability of misbehaviour
+ * is ridiculously low and, it seems, we could use some mb() tricks
+ * to avoid misreading sequence numbers, states etc. --ANK
*/
enum tcp_tw_status
tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
@@ -1157,7 +1319,75 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
struct tcp_opt tp;
int paws_reject = 0;
- /* RFC 1122:
+ tp.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
+ tcp_parse_options(NULL, th, &tp, 0);
+
+ if (tp.saw_tstamp) {
+ tp.ts_recent = tw->ts_recent;
+ tp.ts_recent_stamp = tw->ts_recent_stamp;
+ paws_reject = tcp_paws_check(&tp, th->rst);
+ }
+ }
+
+ if (tw->substate == TCP_FIN_WAIT2) {
+ /* Just repeat all the checks of tcp_rcv_state_process() */
+
+ /* Out of window, send ACK */
+ if (paws_reject ||
+ !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd))
+ return TCP_TW_ACK;
+
+ if (th->rst)
+ goto kill;
+
+ if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq)
+ goto kill_with_rst;
+
+ /* Dup ACK? */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt)) {
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+		/* New data or FIN. If new data arrives after a half-duplex close,
+		 * reset.
+ */
+ if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) {
+kill_with_rst:
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
+ return TCP_TW_RST;
+ }
+
+ /* FIN arrived, enter true time-wait state. */
+ tw->substate = TCP_TIME_WAIT;
+ tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (tp.saw_tstamp) {
+ tw->ts_recent_stamp = xtime.tv_sec;
+ tw->ts_recent = tp.rcv_tsval;
+ }
+
+		/* I am ashamed, but I failed to make it more elegant.
+		 * Yes, it is a direct reference to IP, which is impossible
+		 * to generalize to IPv6. Taking into account that IPv6
+		 * does not understand recycling in any case, it is not
+		 * a big problem in practice. --ANK */
+ if (tw->family == AF_INET &&
+ sysctl_tcp_tw_recycle && tw->ts_recent_stamp &&
+ tcp_v4_tw_remember_stamp(tw))
+ tcp_tw_schedule(tw, tw->timeout);
+ else
+ tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ return TCP_TW_ACK;
+ }
+
+ /*
+ * Now real TIME-WAIT state.
+ *
+ * RFC 1122:
* "When a connection is [...] on TIME-WAIT state [...]
* [a TCP] MAY accept a new SYN from the remote TCP to
* reopen the connection directly, if it:
@@ -1171,47 +1401,31 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* to be an old duplicate".
*/
- tp.saw_tstamp = 0;
- if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
- tcp_parse_options(NULL, th, &tp, 0);
-
- paws_reject = tp.saw_tstamp &&
- ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 &&
- xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS);
- }
-
if (!paws_reject &&
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
/* In window segment, it may be only reset or bare ack. */
if (th->rst) {
-#ifdef CONFIG_TCP_TW_RECYCLE
- /* When recycling, always follow rfc1337,
- * but mark bucket as ready to recycling immediately.
- */
- if (sysctl_tcp_tw_recycle) {
- /* May kill it now. */
- tw->rto = 0;
- tw->ttd = jiffies;
- } else
-#endif
/* This is TIME_WAIT assasination, in two flavors.
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
- if(sysctl_tcp_rfc1337 == 0) {
+ if (sysctl_tcp_rfc1337 == 0) {
+kill:
tcp_tw_deschedule(tw);
tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
}
- } else {
- tcp_tw_reschedule(tw);
}
+ tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
if (tp.saw_tstamp) {
tw->ts_recent = tp.rcv_tsval;
tw->ts_recent_stamp = xtime.tv_sec;
}
+
tcp_tw_put(tw);
return TCP_TW_SUCCESS;
}
@@ -1235,7 +1449,7 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
if (th->syn && !th->rst && !th->ack && !paws_reject &&
(after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
- (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) {
+ (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) {
u32 isn = tw->snd_nxt + 2;
if (isn == 0)
isn++;
@@ -1243,20 +1457,18 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
return TCP_TW_SYN;
}
+ if (paws_reject)
+ NET_INC_STATS_BH(PAWSEstabRejected);
+
if(!th->rst) {
/* In this case we must reset the TIMEWAIT timer.
-
- If it is ACKless SYN it may be both old duplicate
- and new good SYN with random sequence number <rcv_nxt.
- Do not reschedule in the last case.
+ *
+ * If it is ACKless SYN it may be both old duplicate
+ * and new good SYN with random sequence number <rcv_nxt.
+ * Do not reschedule in the last case.
*/
- if (paws_reject || th->ack) {
- tcp_tw_reschedule(tw);
-#ifdef CONFIG_TCP_TW_RECYCLE
- tw->rto = min(120*HZ, tw->rto<<1);
- tw->ttd = jiffies + tw->rto;
-#endif
- }
+ if (paws_reject || th->ack)
+ tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
/* Send ACK. Note, we do not put the bucket,
* it will be released by caller.
@@ -1267,8 +1479,8 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
return TCP_TW_SUCCESS;
}
-/* Enter the time wait state. This is always called from BH
- * context. Essentially we whip up a timewait bucket, copy the
+/* Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the
* relevant info into it from the SK, and mess with hash chains
* and list linkage.
*/
@@ -1286,6 +1498,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
+ sock_prot_dec_use(sk->prot);
}
/* Step 2: Hash TW into TIMEWAIT half of established hash table. */
@@ -1312,41 +1525,49 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
tw->tb->owners = (struct sock*)tw;
tw->bind_pprev = &tw->tb->owners;
spin_unlock(&bhead->lock);
-
- /* Step 4: Un-charge protocol socket in-use count. */
- sock_prot_dec_use(sk->prot);
}
/*
- * Move a socket to time-wait.
+ * Move a socket to time-wait or dead fin-wait-2 state.
*/
-void tcp_time_wait(struct sock *sk)
+void tcp_time_wait(struct sock *sk, int state, int timeo)
{
- struct tcp_tw_bucket *tw;
+ struct tcp_tw_bucket *tw = NULL;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int recycle_ok = 0;
+
+ if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
+ recycle_ok = tp->af_specific->remember_stamp(sk);
+
+ if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
+ tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
- tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
if(tw != NULL) {
+ int rto = (tp->rto<<2) - (tp->rto>>1);
+
/* Give us an identity. */
tw->daddr = sk->daddr;
tw->rcv_saddr = sk->rcv_saddr;
tw->bound_dev_if= sk->bound_dev_if;
tw->num = sk->num;
tw->state = TCP_TIME_WAIT;
+ tw->substate = state;
tw->sport = sk->sport;
tw->dport = sk->dport;
tw->family = sk->family;
tw->reuse = sk->reuse;
- tw->hashent = sk->hashent;
- tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
- tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt;
- tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent;
- tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp;
-#ifdef CONFIG_TCP_TW_RECYCLE
- tw->rto = sk->tp_pinfo.af_tcp.rto;
- tw->ttd = jiffies + 2*tw->rto;
-#endif
+ tw->rcv_wscale = tp->rcv_wscale;
atomic_set(&tw->refcnt, 0);
+ tw->hashent = sk->hashent;
+ tw->rcv_nxt = tp->rcv_nxt;
+ tw->snd_nxt = tp->snd_nxt;
+ tw->rcv_wnd = tcp_receive_window(tp);
+ tw->syn_seq = tp->syn_seq;
+ tw->ts_recent = tp->ts_recent;
+ tw->ts_recent_stamp= tp->ts_recent_stamp;
+ tw->pprev_death = NULL;
+
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if(tw->family == PF_INET6) {
memcpy(&tw->v6_daddr,
@@ -1361,22 +1582,28 @@ void tcp_time_wait(struct sock *sk)
__tcp_tw_hashdance(sk, tw);
/* Get the TIME_WAIT timeout firing. */
- tcp_tw_schedule(tw);
+ if (timeo < rto)
+ timeo = rto;
- /* CLOSE the SK. */
- if(sk->state == TCP_ESTABLISHED)
- tcp_statistics[smp_processor_id()*2].TcpCurrEstab--;
- sk->state = TCP_CLOSE;
+ if (recycle_ok) {
+ tw->timeout = rto;
+ } else {
+ tw->timeout = TCP_TIMEWAIT_LEN;
+ if (state == TCP_TIME_WAIT)
+ timeo = TCP_TIMEWAIT_LEN;
+ }
+
+ tcp_tw_schedule(tw, timeo);
} else {
- /* Sorry, we're out of memory, just CLOSE this
+ /* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- tcp_set_state(sk, TCP_CLOSE);
+ if (net_ratelimit())
+ printk(KERN_INFO "TCP: time wait bucket table overflow\n");
}
tcp_update_metrics(sk);
- tcp_clear_xmit_timers(sk);
tcp_done(sk);
}
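tcp_time_wait() above derives its scheduling delay from the connection's RTO: (rto<<2)-(rto>>1) is 3.5*RTO, used directly when timestamps allow recycling, while the classical TCP_TIMEWAIT_LEN is used otherwise. A sketch of that selection in milliseconds; the 60-second constant is a stand-in, and only the initial schedule delay is modelled (the bucket's rearm period follows the same recycle split):

#include <stdio.h>

#define TIMEWAIT_LEN_MS 60000   /* stand-in for TCP_TIMEWAIT_LEN */

/* Sketch of the timeout selection in tcp_time_wait() above. */
static int tw_timeout(int rto_ms, int timeo_ms, int recycle_ok,
                      int true_time_wait)
{
    int rto = (rto_ms << 2) - (rto_ms >> 1);   /* 3.5 * RTO */

    if (timeo_ms < rto)
        timeo_ms = rto;
    if (!recycle_ok && true_time_wait)
        timeo_ms = TIMEWAIT_LEN_MS;
    return timeo_ms;
}

int main(void)
{
    printf("%d\n", tw_timeout(200, 0, 1, 1));  /* 700: recycled, 3.5*RTO */
    printf("%d\n", tw_timeout(200, 0, 0, 1));  /* 60000: classical TIME-WAIT */
    return 0;
}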
@@ -1397,10 +1624,13 @@ void tcp_time_wait(struct sock *sk)
static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
- sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ tp->fin_seq = TCP_SKB_CB(skb)->end_seq;
tcp_send_ack(sk);
+ sk->shutdown |= RCV_SHUTDOWN;
+
switch(sk->state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
@@ -1427,7 +1657,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
- tcp_time_wait(sk);
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
break;
default:
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
@@ -1435,9 +1665,17 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
*/
printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
break;
- }
+ };
+
+ /* It _is_ possible, that we have something out-of-order _after_ FIN.
+ * Probably, we should reset in this case. For now drop them.
+ */
+ __skb_queue_purge(&tp->out_of_order_queue);
+ if (tp->sack_ok)
+ tp->num_sacks = 0;
+
if (!sk->dead) {
- wake_up_interruptible(sk->sleep);
+ sk->state_change(sk);
sock_wake_async(sk->socket, 1, POLL_HUP);
}
}
@@ -1622,6 +1860,7 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct
sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
}
+
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
@@ -1658,6 +1897,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff *skb1;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int eaten = 0;
/* Queue data for delivery to the user.
* Packets in sequence go to the receive queue.
@@ -1665,33 +1905,68 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
/* Ok. In sequence. */
- queue_and_out:
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt &&
+ tp->ucopy.len &&
+ sk->lock.users &&
+ !tp->urg_data) {
+ int chunk = min(skb->len, tp->ucopy.len);
+
+ local_bh_enable();
+ if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) {
+ sk->err = EFAULT;
+ sk->error_report(sk);
+ }
+ local_bh_disable();
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ eaten = (chunk == skb->len && !skb->h.th->fin);
+ }
+
+ if (!eaten) {
+queue_and_out:
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->receive_queue, skb);
+ }
dst_confirm(sk->dst_cache);
- __skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- if(skb->h.th->fin) {
+ if(skb->len)
+ tcp_event_data_recv(tp, skb);
+ if(skb->h.th->fin)
tcp_fin(skb, sk, skb->h.th);
- } else {
- tcp_remember_ack(tp, skb->h.th, skb);
- }
+
/* This may have eaten into a SACK block. */
if(tp->sack_ok && tp->num_sacks)
tcp_sack_remove_skb(tp, skb);
tcp_ofo_queue(sk);
/* Turn on fast path. */
- if (skb_queue_len(&tp->out_of_order_queue) == 0)
- tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
- ntohl(TCP_FLAG_ACK) |
- tp->snd_wnd);
+ if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
+#ifdef TCP_FORMAL_WINDOW
+ tcp_receive_window(tp) &&
+#endif
+ !tp->urg_data)
+ tcp_fast_path_on(tp);
+
+ if (eaten)
+ kfree_skb(skb);
+
+ if (!sk->dead) {
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket,1, POLL_IN);
+ }
return;
}
-
+
/* An old packet, either a retransmit or some packet got lost. */
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
- /* A retransmit, 2nd most common case. Force an imediate ack. */
- SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
+		/* A retransmit, 2nd most common case. Force an immediate ack.
+		 *
+		 * Actually this should be impossible, since seq is checked at the top level.
+		 */
+ NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq));
tcp_enter_quickack_mode(tp);
+ tp->ack.pending = 1;
kfree_skb(skb);
return;
}
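The ucopy path added to tcp_data_queue() above copies an in-sequence segment straight into the reader's iovec, but only under a strict set of conditions. A compact restatement of that eligibility test; the parameter names here simply mirror the fields checked in the hunk and are not kernel identifiers:

#include <stdio.h>

/* Sketch of the "eat it directly into the user iovec" test above:
 * only when the reader is the current task, is exactly caught up,
 * still wants data, holds the socket lock, and no urgent data is pending.
 */
static int can_copy_direct(int reader_is_current, unsigned int copied_seq,
                           unsigned int rcv_nxt, int want_len,
                           int socket_locked_by_user, int urg_pending)
{
    return reader_is_current &&
           copied_seq == rcv_nxt &&
           want_len > 0 &&
           socket_locked_by_user &&
           !urg_pending;
}

int main(void)
{
    printf("%d\n", can_copy_direct(1, 1000, 1000, 4096, 1, 0)); /* 1 */
    printf("%d\n", can_copy_direct(1, 900, 1000, 4096, 1, 0));  /* 0: not caught up */
    return 0;
}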
@@ -1706,15 +1981,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
}
/* Ok. This is an out_of_order segment, force an ack. */
- tp->delayed_acks++;
- tcp_enter_quickack_mode(tp);
+ tp->ack.pending = 1;
/* Disable header prediction. */
tp->pred_flags = 0;
+
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ skb_set_owner_r(skb, sk);
+
if (skb_peek(&tp->out_of_order_queue) == NULL) {
/* Initial out of order segment, build 1 SACK. */
if(tp->sack_ok) {
@@ -1758,6 +2035,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
}
}
}
+ return;
}
@@ -1767,7 +2045,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
* room, then we will just have to discard the packet.
*/
-static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
+static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
{
struct tcphdr *th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -1777,11 +2055,11 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
skb_trim(skb, len - (th->doff*4));
if (skb->len == 0 && !th->fin)
- return(0);
+ goto drop;
/*
* If our receive queue has grown past its limits shrink it.
- * Make sure to do this before moving snd_nxt, otherwise
+ * Make sure to do this before moving rcv_nxt, otherwise
* data might be acked for that we don't have enough room.
*/
if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
@@ -1789,7 +2067,7 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
/* Still not enough room. That can happen when
* skb->true_size differs significantly from skb->len.
*/
- return 0;
+ goto drop;
}
}
@@ -1799,29 +2077,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
tp->rcv_nxt = tp->copied_seq;
}
+ return;
- /* Above, tcp_data_queue() increments delayed_acks appropriately.
- * Now tell the user we may have some data.
- */
- if (!sk->dead) {
- wake_up_interruptible(sk->sleep);
- sock_wake_async(sk->socket,1, POLL_IN);
- }
- return(1);
+drop:
+ kfree_skb(skb);
}
static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
- tcp_packets_in_flight(tp) < tp->snd_cwnd) {
- /* Put more data onto the wire. */
- tcp_write_xmit(sk);
- } else if (tp->packets_out == 0 && !tp->pending) {
- /* Start probing the receivers window. */
- tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
- }
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
+ tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
+ tcp_write_xmit(sk))
+ tcp_check_probe_timer(sk, tp);
}
static __inline__ void tcp_data_snd_check(struct sock *sk)
@@ -1832,57 +2101,6 @@ static __inline__ void tcp_data_snd_check(struct sock *sk)
__tcp_data_snd_check(sk, skb);
}
-/*
- * Adapt the MSS value used to make delayed ack decision to the
- * real world.
- *
- * The constant 536 hasn't any good meaning. In IPv4 world
- * MTU may be smaller, though it contradicts to RFC1122, which
- * states that MSS must be at least 536.
- * We use the constant to do not ACK each second
- * packet in a stream of tiny size packets.
- * It means that super-low mtu links will be aggressively delacked.
- * Seems, it is even good. If they have so low mtu, they are weirdly
- * slow.
- *
- * AK: BTW it may be useful to add an option to lock the rcv_mss.
- * this way the beowulf people wouldn't need ugly patches to get the
- * ack frequencies they want and it would be an elegant way to tune delack.
- */
-static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- unsigned int len, lss;
-
- lss = tp->last_seg_size;
- tp->last_seg_size = 0;
-
- /* skb->len may jitter because of SACKs, even if peer
- * sends good full-sized frames.
- */
- len = skb->len;
- if (len >= tp->rcv_mss) {
- tp->rcv_mss = len;
- } else {
- /* Otherwise, we make more careful check taking into account,
- * that SACKs block is variable.
- *
- * "len" is invariant segment length, including TCP header.
- */
- len = skb->tail - skb->h.raw;
- if (len >= 536 + sizeof(struct tcphdr)) {
- /* Subtract also invariant (if peer is RFC compliant),
- * tcp header plus fixed timestamp option length.
- * Resulting "len" is MSS free of SACK jitter.
- */
- len -= tp->tcp_header_len;
- if (len == lss)
- tp->rcv_mss = len;
- tp->last_seg_size = len;
- }
- }
-}
-
/*
* Check if sending an ack is needed.
*/
@@ -1904,26 +2122,25 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
* start in an expediant manner.
*/
- /* Two full frames received or... */
- if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
- /* We will update the window "significantly" or... */
- tcp_raise_window(sk) ||
- /* We entered "quick ACK" mode or... */
+ /* More than one full frame received or... */
+ if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) ||
+ /* We ACK each frame or... */
tcp_in_quickack_mode(tp) ||
- /* We have out of order data */
- (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) {
+ /* We have out of order data or */
+ (ofo_possible &&
+ skb_peek(&tp->out_of_order_queue) != NULL)) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
/* Else, send delayed ack. */
- tcp_send_delayed_ack(sk, HZ/2);
+ tcp_send_delayed_ack(sk);
}
}
static __inline__ void tcp_ack_snd_check(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- if (tp->delayed_acks == 0) {
+ if (tp->ack.pending == 0) {
/* We sent a data segment already. */
return;
}
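__tcp_ack_snd_check() above now ACKs immediately when more than one full receiver MSS is unacknowledged, when quick-ACK mode is active, or when out-of-order data is queued, and falls back to a delayed ACK otherwise. A compact restatement; the ofo_possible qualifier from the kernel code is folded into the last flag:

#include <stdio.h>

/* Sketch of the decision in __tcp_ack_snd_check() above. */
static int ack_now(unsigned int rcv_nxt, unsigned int rcv_wup,
                   unsigned int rcv_mss, int in_quickack_mode,
                   int ofo_queued)
{
    return (rcv_nxt - rcv_wup) > rcv_mss ||   /* more than one full frame */
           in_quickack_mode ||                /* ACK every frame */
           ofo_queued;                        /* out-of-order data waiting */
}

int main(void)
{
    printf("%d\n", ack_now(3000, 0, 1460, 0, 0));  /* 1: two frames unacked */
    printf("%d\n", ack_now(1000, 0, 1460, 0, 0));  /* 0: delay the ACK */
    return 0;
}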
@@ -1975,7 +2192,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
*/
if (tp->urg_seq == tp->copied_seq)
tp->copied_seq++; /* Move the copied sequence on correctly */
- tp->urg_data = URG_NOTYET;
+ tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;
/* Disable header prediction. */
@@ -1992,12 +2209,12 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len
tcp_check_urg(sk,th);
/* Do we wait for any urgent data? - normally not... */
- if (tp->urg_data == URG_NOTYET) {
+ if (tp->urg_data == TCP_URG_NOTYET) {
u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
/* Is the urgent pointer pointing into this packet? */
if (ptr < len) {
- tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
+ tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th);
if (!sk->dead)
sk->data_ready(sk,0);
}
@@ -2014,7 +2231,8 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len
static int prune_queue(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct sk_buff * skb;
+ struct sk_buff *skb;
+ int pruned = 0;
SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
@@ -2024,7 +2242,9 @@ static int prune_queue(struct sock *sk)
skb = __skb_dequeue_tail(&tp->out_of_order_queue);
if(skb != NULL) {
/* Free it all. */
- do { net_statistics[smp_processor_id()*2].OfoPruned += skb->len;
+ do {
+ pruned += skb->len;
+ net_statistics[smp_processor_id()*2].OfoPruned += skb->len;
kfree_skb(skb);
skb = __skb_dequeue_tail(&tp->out_of_order_queue);
} while(skb != NULL);
@@ -2059,13 +2279,47 @@ static int prune_queue(struct sock *sk)
* if we are really having our buffer space abused we stop accepting
* new receive data.
*
+ *    8) The arguments are interesting, but I cannot even imagine
+ *       what kind of arguments could force us to drop NICE, ALREADY
+ *       RECEIVED DATA only to get one more packet. --ANK
+ *
* FIXME: it should recompute SACK state and only remove enough
* buffers to get into bounds again. The current scheme loses
- * badly sometimes on links with large RTT, especially when
- * the driver has high overhead per skb.
- * (increasing the rcvbuf is not enough because it inflates the
- * the window too, disabling flow control effectively) -AK
+ * badly sometimes on links with large RTT, especially when
+ * the driver has high overhead per skb.
+ * (increasing the rcvbuf is not enough because it inflates the
+ * the window too, disabling flow control effectively) -AK
+ *
+ * Mmm... Why not scale it separately then? Just replace
+ * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale
+ * and adjust it dynamically when TCP window flow control
+ * fails? -ANK
+ */
+
+ /* F.e. one possible tactics is: */
+ do {
+ u32 new_clamp = (tp->rcv_nxt-tp->copied_seq) + pruned;
+
+ /* This guy is not a good guy. I bet he martyred cats
+ * when he was a child and grew up into a finished sadist. Clamp him!
+ */
+ if (new_clamp > 3*tp->ack.rcv_mss)
+ new_clamp -= tp->ack.rcv_mss;
+ else
+ new_clamp = 2*tp->ack.rcv_mss;
+ tp->window_clamp = min(tp->window_clamp, new_clamp);
+ } while (0);
+ /* Though this should be done earlier, while we are still not
+ * congested. This header prediction logic sucks
+ * without a true implementation of the VJ algorithm.
+ * I am really anxious. How was it possible to combine
+ * header prediction and sending ACKs outside of recvmsg() context?
+ * They _are_ incompatible. We should not advance the window so
+ * brainlessly and we should not advertise such a huge window from the very
+ * beginning. BTW window "prediction" does not speed up anything!
+ * Silly, silly, silly.
*/
+
if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
return 0;
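The do { } while (0) block above clamps the advertised window to roughly the unread data plus whatever was just pruned, never below two segments. A stand-alone sketch of that arithmetic, assuming plain unsigned sequence numbers; the function and variable names are illustrative, not the kernel's.

#include <stdio.h>

static unsigned int clamp_after_prune(unsigned int window_clamp,
                                      unsigned int rcv_nxt,
                                      unsigned int copied_seq,
                                      unsigned int pruned,
                                      unsigned int rcv_mss)
{
    unsigned int new_clamp = (rcv_nxt - copied_seq) + pruned; /* unread + just-dropped data */

    if (new_clamp > 3 * rcv_mss)
        new_clamp -= rcv_mss;       /* leave a little slack */
    else
        new_clamp = 2 * rcv_mss;    /* but never clamp below two segments */

    return window_clamp < new_clamp ? window_clamp : new_clamp;
}

int main(void)
{
    /* 20 KB unread, 8 KB pruned, 1460-byte segments, current clamp 64 KB */
    printf("new clamp: %u\n",
           clamp_after_prune(65535, 120000, 100000, 8000, 1460));
    return 0;
}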
@@ -2073,6 +2327,57 @@ static int prune_queue(struct sock *sk)
return -1;
}
+static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int chunk = skb->len - hlen;
+ int err;
+
+ local_bh_enable();
+ if (skb->ip_summed==CHECKSUM_UNNECESSARY)
+ err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk);
+ else
+ err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen);
+
+ if (!err) {
+update:
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ local_bh_disable();
+ return 0;
+ }
+
+ if (err == -EFAULT) {
+ sk->err = EFAULT;
+ sk->error_report(sk);
+ goto update;
+ }
+
+ local_bh_disable();
+ return err;
+}
+
+static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+{
+ int result;
+
+ if (sk->lock.users) {
+ local_bh_enable();
+ result = __tcp_checksum_complete(skb);
+ local_bh_disable();
+ } else {
+ result = __tcp_checksum_complete(skb);
+ }
+ return result;
+}
+
+static __inline__ int
+tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+{
+ return skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ __tcp_checksum_complete_user(sk, skb);
+}
+
/*
* TCP receive function for the ESTABLISHED state.
*
@@ -2080,7 +2385,33 @@ static int prune_queue(struct sock *sk)
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
- * - Out of order segments arrived.
+ * [ NOTE: actually, this was done incorrectly and nobody ever noticed!
+ * The reason is clear: 1. Correct senders do not send
+ * to a zero window. 2. Even if a sender sends to a zero window,
+ * nothing terrible occurs.
+ *
+ * For now I cleaned this up and the fast path really is always disabled
+ * when the window is zero, but I would be happier to remove these
+ * checks. The code would only be cleaner and _faster_. --ANK
+ *
+ * Later note. I've just found that the slow path also accepts
+ * out of window segments, look at tcp_sequence(). So...
+ * that is the last argument: I repaired it all and commented out the
+ * repaired code with TCP_FORMAL_WINDOW.
+ * [ I remember one rhyme from a children's book. (I apologize,
+ * the translation is not rhymed 8)): people in one (Jewish) village
+ * decided to build a sauna, but split into two parties.
+ * The first insisted that the battens should not be dubbed,
+ * the other objected that feet would suffer from splinters,
+ * the first countered that dubbed wet battens are too slippery
+ * and people would fall, which is much more serious!
+ * Certainly, they all went to the rabbi.
+ * After some thinking, he judged: "Do not be lazy!
+ * Certainly, dub the battens! But lay them dubbed surface down."
+ * ]
+ * ]
+ *
+ * - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
@@ -2088,7 +2419,7 @@ static int prune_queue(struct sock *sk)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
- * - Unexpected TCP option.
+ * - Unexpected TCP option.
*
* When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
@@ -2116,7 +2447,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
* We do checksum and copy also but from device to kernel.
*/
-
/* RED-PEN. Using static variables to pass function arguments
* cannot be good idea...
*/
@@ -2133,13 +2463,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags &&
TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
- int tcp_header_len = th->doff*4;
-
- /* Timestamp header prediction */
+ int tcp_header_len = tp->tcp_header_len;
- /* Non-standard header f.e. SACKs -> slow path */
- if (tcp_header_len != tp->tcp_header_len)
- goto slow_path;
+ /* Timestamp header prediction: tcp_header_len
+ * is automatically equal to th->doff*4 due to pred_flags
+ * match.
+ */
/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
@@ -2161,8 +2490,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
goto slow_path;
/* Predicted packet is in window by definition.
- seq == rcv_nxt and last_ack_sent <= rcv_nxt.
- Hence, check seq<=last_ack_sent reduces to:
+ * seq == rcv_nxt and last_ack_sent <= rcv_nxt.
+ * Hence, check seq<=last_ack_sent reduces to:
*/
if (tp->rcv_nxt == tp->last_ack_sent) {
tp->ts_recent = tp->rcv_tsval;
@@ -2173,6 +2502,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if (len <= tcp_header_len) {
/* Bulk data transfer: sender */
if (len == tcp_header_len) {
+ /* We know that such packets are checksummed
+ * on entry.
+ */
tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->ack_seq, len);
kfree_skb(skb);
@@ -2182,19 +2514,42 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
TCP_INC_STATS_BH(TcpInErrs);
goto discard;
}
- } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
- atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
- /* Bulk data transfer: receiver */
- __skb_pull(skb,tcp_header_len);
+ } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) {
+ int eaten = 0;
- /* Is it possible to simplify this? */
- tcp_measure_rcv_mss(sk, skb);
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt &&
+ len - tcp_header_len <= tp->ucopy.len &&
+ sk->lock.users) {
+ eaten = 1;
+
+ NET_INC_STATS_BH(TCPHPHitsToUser);
+
+ if (tcp_copy_to_iovec(sk, skb, tcp_header_len))
+ goto csum_error;
+
+ __skb_pull(skb,tcp_header_len);
+ } else {
+ if (tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
+
+ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
+ goto step5;
+
+ NET_INC_STATS_BH(TCPHPHits);
+
+ /* Bulk data transfer: receiver */
+ __skb_pull(skb,tcp_header_len);
+
+ /* DO NOT notify forward progress here.
+ * It saves a dozen CPU instructions in the fast path. --ANK
+ * And where is it signaled then? -AK
+ * Nowhere. 8) --ANK
+ */
+ __skb_queue_tail(&sk->receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ }
- /* DO NOT notify forward progress here.
- * It saves dozen of CPU instructions in fast path. --ANK
- * And where is it signaled then ? -AK
- */
- __skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
/* FIN bit check is not done since if FIN is set in
@@ -2202,27 +2557,43 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
wake_up_interruptible(sk->sleep);
sock_wake_async(sk->socket,1, POLL_IN);
- tcp_delack_estimator(tp);
- tcp_remember_ack(tp, th, skb);
+ tcp_event_data_recv(tp, skb);
+#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
+ if (eaten) {
+ if (tcp_in_quickack_mode(tp)) {
+ tcp_send_ack(sk);
+ } else {
+ tcp_send_delayed_ack(sk);
+ }
+ } else
+#endif
__tcp_ack_snd_check(sk, 0);
+
+ if (eaten)
+ kfree_skb(skb);
return 0;
}
/* Packet is in sequence, flags are trivial;
- * only ACK is strange or we are tough on memory.
- * Jump to step 5.
+ * only ACK is strange. Jump to step 5.
*/
+ if (tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
goto step5;
}
slow_path:
+ if (tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
+
/*
* RFC1323: H1. Apply PAWS check first.
*/
if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
tcp_paws_discard(tp, skb)) {
if (!th->rst) {
+ NET_INC_STATS_BH(PAWSEstabRejected);
tcp_send_ack(sk);
goto discard;
}
@@ -2251,7 +2622,9 @@ slow_path:
TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tp->rcv_wup, tp->rcv_wnd);
}
+ tcp_enter_quickack_mode(tp);
tcp_send_ack(sk);
+ NET_INC_STATS_BH(DelayedACKLost);
goto discard;
}
@@ -2279,11 +2652,8 @@ step5:
/* Process urgent data. */
tcp_urg(sk, th, len);
- {
/* step 7: process the segment text */
- int queued = tcp_data(skb, sk, len);
-
- tcp_measure_rcv_mss(sk, skb);
+ tcp_data(skb, sk, len);
/* Be careful, tcp_data() may have put this into TIME_WAIT. */
if(sk->state != TCP_CLOSE) {
@@ -2291,12 +2661,13 @@ step5:
tcp_ack_snd_check(sk);
}
- if (!queued) {
- discard:
- kfree_skb(skb);
- }
- }
+ return 0;
+
+csum_error:
+ TCP_INC_STATS_BH(TcpInErrs);
+discard:
+ kfree_skb(skb);
return 0;
}
@@ -2328,6 +2699,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newsk->dport = req->rmt_port;
sock_lock_init(newsk);
+ bh_lock_sock(newsk);
atomic_set(&newsk->rmem_alloc, 0);
skb_queue_head_init(&newsk->receive_queue);
@@ -2351,22 +2723,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newtp->rcv_nxt = req->rcv_isn + 1;
newtp->snd_nxt = req->snt_isn + 1;
newtp->snd_una = req->snt_isn + 1;
- newtp->srtt = 0;
- newtp->ato = 0;
+ newtp->snd_sml = req->snt_isn + 1;
+
+ tcp_delack_init(newtp);
+ if (skb->len >= 536)
+ newtp->ack.last_seg_size = skb->len;
+
+ tcp_prequeue_init(newtp);
+
newtp->snd_wl1 = req->rcv_isn;
newtp->snd_wl2 = req->snt_isn;
- /* RFC1323: The window in SYN & SYN/ACK segments
- * is never scaled.
- */
- newtp->snd_wnd = ntohs(skb->h.th->window);
-
- newtp->max_window = newtp->snd_wnd;
- newtp->pending = 0;
newtp->retransmits = 0;
- newtp->last_ack_sent = req->rcv_isn + 1;
newtp->backoff = 0;
+ newtp->srtt = 0;
newtp->mdev = TCP_TIMEOUT_INIT;
+ newtp->rto = TCP_TIMEOUT_INIT;
+
+ newtp->packets_out = 0;
+ newtp->fackets_out = 0;
+ newtp->retrans_out = 0;
+ newtp->snd_ssthresh = 0x7fffffff;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -2374,22 +2751,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
* efficiently to them. -DaveM
*/
newtp->snd_cwnd = 2;
-
- newtp->rto = TCP_TIMEOUT_INIT;
- newtp->packets_out = 0;
- newtp->fackets_out = 0;
- newtp->retrans_out = 0;
- newtp->high_seq = 0;
- newtp->snd_ssthresh = 0x7fffffff;
newtp->snd_cwnd_cnt = 0;
+ newtp->high_seq = 0;
+
newtp->dup_acks = 0;
- newtp->delayed_acks = 0;
- init_timer(&newtp->retransmit_timer);
- newtp->retransmit_timer.function = &tcp_retransmit_timer;
- newtp->retransmit_timer.data = (unsigned long) newsk;
- init_timer(&newtp->delack_timer);
- newtp->delack_timer.function = &tcp_delack_timer;
- newtp->delack_timer.data = (unsigned long) newsk;
+ tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
newtp->send_head = newtp->retrans_head = NULL;
newtp->rcv_wup = req->rcv_isn + 1;
@@ -2397,31 +2763,25 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newtp->copied_seq = req->rcv_isn + 1;
newtp->saw_tstamp = 0;
+ newtp->last_ack_sent = req->rcv_isn + 1;
- init_timer(&newtp->probe_timer);
- newtp->probe_timer.function = &tcp_probe_timer;
- newtp->probe_timer.data = (unsigned long) newsk;
newtp->probes_out = 0;
newtp->syn_seq = req->rcv_isn;
newtp->fin_seq = req->rcv_isn;
newtp->urg_data = 0;
- tcp_synq_init(newtp);
- newtp->syn_backlog = 0;
- if (skb->len >= 536)
- newtp->last_seg_size = skb->len;
+ newtp->listen_opt = NULL;
+ newtp->accept_queue = NULL;
+ /* Deinitialize syn_wait_lock to trap illegal accesses. */
+ memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
/* Back to base struct sock members. */
newsk->err = 0;
- newsk->ack_backlog = 0;
- newsk->max_ack_backlog = SOMAXCONN;
newsk->priority = 0;
atomic_set(&newsk->refcnt, 1);
+#ifdef INET_REFCNT_DEBUG
atomic_inc(&inet_sock_nr);
+#endif
- spin_lock_init(&sk->timer_lock);
- init_timer(&newsk->timer);
- newsk->timer.function = &tcp_keepalive_timer;
- newsk->timer.data = (unsigned long) newsk;
if (newsk->keepopen)
tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
newsk->socket = NULL;
@@ -2440,6 +2800,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newtp->snd_wscale = newtp->rcv_wscale = 0;
newtp->window_clamp = min(newtp->window_clamp,65535);
}
+ newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale;
+ newtp->max_window = newtp->snd_wnd;
+
if (newtp->tstamp_ok) {
newtp->ts_recent = req->ts_recent;
newtp->ts_recent_stamp = xtime.tv_sec;
@@ -2453,16 +2816,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
return newsk;
}
-static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
-{
- if (seq == s_win)
- return 1;
- if (after(end_seq, s_win) && before(seq, e_win))
- return 1;
- return (seq == e_win && seq == end_seq);
-}
-
-
/*
* Process an incoming packet for SYN_RECV sockets represented
* as an open_request.
@@ -2470,30 +2823,28 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
struct open_request *req,
- struct open_request *prev)
+ struct open_request **prev)
{
struct tcphdr *th = skb->h.th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
int paws_reject = 0;
struct tcp_opt ttp;
-
- /* If socket has already been created, process
- packet in its context.
-
- We fall here only due to race, when packets were enqueued
- to backlog of listening socket.
- */
- if (req->sk)
- return req->sk;
+ struct sock *child;
ttp.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
-
tcp_parse_options(NULL, th, &ttp, 0);
- paws_reject = ttp.saw_tstamp &&
- (s32)(ttp.rcv_tsval - req->ts_recent) < 0;
+ if (ttp.saw_tstamp) {
+ ttp.ts_recent = req->ts_recent;
+ /* We do not store the true stamp, but it is not required;
+ * it can be estimated (approximately)
+ * from other data.
+ */
+ ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
+ paws_reject = tcp_paws_check(&ttp, th->rst);
+ }
}
 /* Check for a pure retransmitted SYN. */
@@ -2517,7 +2868,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
* Enforce "SYN-ACK" according to figure 8, figure 6
* of RFC793, fixed by RFC1122.
*/
- req->class->rtx_syn_ack(sk, req);
+ req->class->rtx_syn_ack(sk, req, NULL);
return NULL;
}
@@ -2544,6 +2895,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->class->send_ack(skb, req);
+ if (paws_reject)
+ NET_INC_STATS_BH(PAWSEstabRejected);
return NULL;
}
@@ -2572,35 +2925,78 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* Invalid ACK: reset will be sent by listening socket */
if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
return sk;
-
- /* OK, ACK is valid, create big socket and
- feed this segment to it. It will repeat all
- the tests. THIS SEGMENT MUST MOVE SOCKET TO
- ESTABLISHED STATE. If it will be dropped after
- socket is created, wait for troubles.
+ /* Also, it would not be a bad idea to check rcv_tsecr, which
+ * is essentially an ACK extension; too early or too late values
+ * should cause a reset in unsynchronized states.
*/
- sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
- if (sk == NULL)
+
+ /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
+ if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
+ req->acked = 1;
return NULL;
+ }
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- req->sk = sk;
- return sk;
+ /* OK, ACK is valid, create big socket and
+ * feed this segment to it. It will repeat all
+ * the tests. THIS SEGMENT MUST MOVE SOCKET TO
+ * ESTABLISHED STATE. If it is dropped after the
+ * socket is created, expect trouble.
+ */
+ child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+ if (child == NULL)
+ goto listen_overflow;
-embryonic_reset:
tcp_synq_unlink(tp, req, prev);
- tp->syn_backlog--;
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ tcp_synq_removed(sk, req);
+
+ tcp_acceptq_queue(sk, req, child);
+ return child;
+listen_overflow:
+ if (!sysctl_tcp_abort_on_overflow) {
+ req->acked = 1;
+ return NULL;
+ }
+
+embryonic_reset:
NET_INC_STATS_BH(EmbryonicRsts);
if (!(flg & TCP_FLAG_RST))
req->class->send_reset(skb);
- req->class->destructor(req);
- tcp_openreq_free(req);
+ tcp_synq_drop(sk, req, prev);
return NULL;
}
+/*
+ * Queue segment on the new socket if the new socket is active,
+ * otherwise we just short-circuit this and continue with
+ * the new socket.
+ */
+
+int tcp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb)
+{
+ int ret = 0;
+ int state = child->state;
+
+ if (child->lock.users == 0) {
+ ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
+
+ /* Wakeup parent, send SIGIO */
+ if (state == TCP_SYN_RECV && child->state != state)
+ parent->data_ready(parent, 0);
+ } else {
+ /* Alas, it is possible again, because we do the lookup
+ * in the main socket hash table and the lock on the listening
+ * socket does not protect us any more.
+ */
+ sk_add_backlog(child, skb);
+ }
+
+ bh_unlock_sock(child);
+ return ret;
+}
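tcp_child_process() above follows the usual owned-socket pattern: if nobody holds the child's lock the segment is processed immediately in softirq context, otherwise it is parked on the socket backlog for the lock owner to drain later. A toy sketch of that pattern, with made-up names and without the kernel's locking primitives:

#include <stdio.h>

#define BACKLOG_MAX 8

struct toy_sock {
    int lock_users;                  /* nonzero while a process owns the socket */
    int backlog[BACKLOG_MAX];        /* deferred segments (just ids here) */
    int backlog_len;
};

static void process_segment(struct toy_sock *sk, int seg)
{
    (void)sk;
    printf("processed segment %d immediately\n", seg);
}

/* Mirror of the decision shape in the hunk above: process now or defer. */
static void deliver(struct toy_sock *sk, int seg)
{
    if (sk->lock_users == 0)
        process_segment(sk, seg);
    else if (sk->backlog_len < BACKLOG_MAX)
        sk->backlog[sk->backlog_len++] = seg;   /* owner drains this on release */
}

int main(void)
{
    struct toy_sock sk = { .lock_users = 1 };
    deliver(&sk, 1);                 /* deferred: socket is owned */
    sk.lock_users = 0;
    deliver(&sk, 2);                 /* processed immediately */
    printf("deferred segments: %d\n", sk.backlog_len);
    return 0;
}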
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
@@ -2608,25 +3004,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_parse_options(sk, th, tp, 0);
-#ifdef CONFIG_TCP_TW_RECYCLE
- if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst &&
- (s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
- xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) {
- /* Old duplicate segment. We remember last
- ts_recent from this host in timewait bucket.
-
- Actually, we could implement per host cache
- to truncate timewait state after RTO. Paranoidal arguments
- of rfc1337 are not enough to close this nice possibility.
- */
- if (net_ratelimit())
- printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n");
- if (th->ack)
- return 1;
- goto discard;
- }
-#endif
-
if (th->ack) {
/* rfc793:
* "If the state is SYN-SENT then
@@ -2646,10 +3023,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* We do not send data with SYN, so that RFC-correct
* test reduces to:
*/
- if (sk->zapped ||
- TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
return 1;
+ /* This check is not from any RFC, but it is an evident consequence
+ * of combining PAWS and the usual SYN-SENT logic: the ACK _is_
+ * checked in SYN-SENT unlike other states, hence
+ * the echoed tstamp must be checked too.
+ */
+ if (tp->saw_tstamp) {
+ if (tp->rcv_tsecr == 0) {
+ /* Workaround for bug in linux-2.1 and early
+ * 2.2 kernels. Let's pretend that we did not
+ * see such a timestamp to avoid a bogus rtt value
+ * calculated by tcp_ack().
+ */
+ tp->saw_tstamp = 0;
+
+ /* But do not forget to store peer's timestamp! */
+ if (th->syn) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = xtime.tv_sec;
+ }
+ } else if ((__s32)(tp->rcv_tsecr - tcp_time_stamp) > 0 ||
+ (__s32)(tp->rcv_tsecr - tp->syn_stamp) < 0) {
+ NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: synsent reject.\n"));
+ NET_INC_STATS_BH(PAWSActiveRejected);
+ return 1;
+ }
+ }
+
/* Now ACK is acceptable.
*
* "If the RST bit is set
@@ -2689,18 +3092,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* because tcp_ack check is too weak for SYN-SENT)
* causes moving socket to invalid semi-SYN-SENT,
* semi-ESTABLISHED state and connection hangs.
- *
- * There exist buggy stacks, which really send
- * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
- * Actually, if this host did not try to get something
- * from ftp.inr.ac.ru I'd never find this bug 8)
- *
* --ANK (990514)
*
- * I was wrong, I apologize. Bare ACK is valid.
+ * Bare ACK is valid, however.
* Actually, RFC793 requires to send such ACK
* in reply to any out of window packet.
- * It is wrong, but Linux also does it sometimes.
+ * It is wrong, but Linux also sends such
+ * useless ACKs sometimes.
* --ANK (990724)
*/
@@ -2717,7 +3115,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
/* RFC1323: The window in SYN & SYN/ACK segments is
* never scaled.
*/
- tp->snd_wnd = htons(th->window);
+ tp->snd_wnd = ntohs(th->window);
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
tp->fin_seq = TCP_SKB_CB(skb)->seq;
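Several hunks in this patch replace htons(th->window) with ntohs(th->window) when reading the window field. On the architectures Linux supports the two macros perform the same 16-bit transformation (a byte swap on little-endian, identity on big-endian), so the change documents the direction of the conversion rather than altering behaviour. A tiny stand-alone check, assuming an ordinary sockets environment:

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
    unsigned short wire_window = htons(512);   /* as it would appear in the header */

    /* Both produce the host-order value; ntohs() just states the direction. */
    printf("ntohs: %u, htons: %u\n",
           (unsigned)ntohs(wire_window), (unsigned)htons(wire_window));
    return 0;
}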
@@ -2742,26 +3140,35 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_initialize_rcv_mss(sk);
tcp_init_metrics(sk);
+ if (sk->keepopen)
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ tp->copied_seq = tp->rcv_nxt;
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+
+ if(!sk->dead) {
+ sk->state_change(sk);
+ sock_wake_async(sk->socket, 0, POLL_OUT);
+ }
+
if (tp->write_pending) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
- * How to make this correctly?
+ * It could be deleted, but with this feature tcpdumps
+ * look so _wonderfully_ clever that I was not able
+ * to resist the temptation 8) --ANK
*/
- tp->delayed_acks++;
- if (tp->ato == 0)
- tp->ato = tp->rto;
- tcp_send_delayed_ack(sk, tp->rto);
+ tp->ack.pending = 1;
+ tp->ack.lrcvtime = tcp_time_stamp;
+ tcp_enter_quickack_mode(tp);
+ tp->ack.pingpong = 1;
+ tp->ack.ato = TCP_ATO_MIN;
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN);
+ goto discard;
} else {
tcp_send_ack(sk);
}
-
- tp->copied_seq = tp->rcv_nxt;
-
- if(!sk->dead) {
- wake_up_interruptible(sk->sleep);
- sock_wake_async(sk->socket, 0, POLL_OUT);
- }
return -1;
}
@@ -2777,6 +3184,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
}
+ /* PAWS check. */
+ if (tp->ts_recent_stamp && tp->saw_tstamp && tcp_paws_check(tp, 0))
+ goto discard;
+
if (th->syn) {
/* We see SYN without ACK. It is attempt of
* simultaneous connect with crossed SYNs.
@@ -2800,8 +3211,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
/* RFC1323: The window in SYN & SYN/ACK segments is
* never scaled.
*/
- tp->snd_wnd = htons(th->window);
+ tp->snd_wnd = ntohs(th->window);
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->max_window = tp->snd_wnd;
tcp_sync_mss(sk, tp->pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -2960,6 +3372,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
#endif
) {
if (!th->rst) {
+ NET_INC_STATS_BH(DelayedACKLost);
+ tcp_enter_quickack_mode(tp);
tcp_send_ack(sk);
}
goto discard;
@@ -3011,28 +3425,29 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->copied_seq = tp->rcv_nxt;
/* Note, that this wakeup is only for marginal
- crossed SYN case. Passively open sockets
- are not waked up, because sk->sleep == NULL
- and sk->socket == NULL.
+ * crossed SYN case. Passively open sockets
+ * are not woken up, because sk->sleep == NULL
+ * and sk->socket == NULL.
*/
- if (!sk->dead && sk->sleep) {
- wake_up_interruptible(sk->sleep);
+ if (!sk->dead) {
+ sk->state_change(sk);
sock_wake_async(sk->socket,0,POLL_OUT);
}
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
- tp->snd_wnd = htons(th->window) << tp->snd_wscale;
+ tp->snd_wnd = ntohs(th->window) << tp->snd_wscale;
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
/* tcp_ack considers this ACK as duplicate
- * and does not calculate rtt. It is wrong.
+ * and does not calculate rtt.
* Fix it at least with timestamps.
*/
if (tp->saw_tstamp && !tp->srtt)
tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED);
tcp_init_metrics(sk);
+ tcp_fast_path_on(tp);
} else {
SOCK_DEBUG(sk, "bad ack\n");
return 1;
@@ -3041,26 +3456,50 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
case TCP_FIN_WAIT1:
if (tp->snd_una == tp->write_seq) {
- sk->shutdown |= SEND_SHUTDOWN;
tcp_set_state(sk, TCP_FIN_WAIT2);
- if (!sk->dead)
- sk->state_change(sk);
- else
- tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
+ sk->shutdown |= SEND_SHUTDOWN;
dst_confirm(sk->dst_cache);
+
+ if (!sk->dead) {
+ /* Wake up lingering close() */
+ sk->state_change(sk);
+ } else {
+ int tmo;
+
+ if (tp->linger2 < 0 ||
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+ tcp_done(sk);
+ return 1;
+ }
+
+ tmo = tcp_fin_time(tp);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ } else if (th->fin || sk->lock.users) {
+ /* Bad case. We could lose such a FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and is not such a rare event. We can still lose it now,
+ * if it spins in bh_lock_sock(), but it is a really
+ * marginal case.
+ */
+ tcp_reset_keepalive_timer(sk, tmo);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto discard;
+ }
+ }
}
break;
- case TCP_CLOSING:
+ case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) {
- tcp_time_wait(sk);
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
goto discard;
}
break;
case TCP_LAST_ACK:
if (tp->snd_una == tp->write_seq) {
- tcp_set_state(sk,TCP_CLOSE);
tcp_update_metrics(sk);
tcp_done(sk);
goto discard;
@@ -3080,27 +3519,22 @@ step6:
case TCP_CLOSING:
if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
break;
-
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
* RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
- if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
+ if (sk->shutdown & RCV_SHUTDOWN) {
if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
tcp_reset(sk);
return 1;
}
}
-
+ /* Fall through */
case TCP_ESTABLISHED:
- queued = tcp_data(skb, sk, len);
-
- /* This must be after tcp_data() does the skb_pull() to
- * remove the header size from skb->len.
- */
- tcp_measure_rcv_mss(sk, skb);
+ tcp_data(skb, sk, len);
+ queued = 1;
break;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 22c35a191..7420e268f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.194 2000/01/09 02:19:41 davem Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.197 2000/01/21 06:37:28 davem Exp $
*
* IPv4 specific functions
*
@@ -52,7 +52,6 @@
#include <linux/fcntl.h>
#include <linux/random.h>
#include <linux/init.h>
-#include <linux/ipsec.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -61,15 +60,9 @@
#include <linux/inet.h>
#include <linux/stddef.h>
+#include <linux/ipsec.h>
-extern int sysctl_tcp_timestamps;
-extern int sysctl_tcp_window_scaling;
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_syncookies;
-extern int sysctl_tcp_tw_recycle;
extern int sysctl_ip_dynaddr;
-extern __u32 sysctl_wmem_max;
-extern __u32 sysctl_rmem_max;
/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8
@@ -319,89 +312,13 @@ void tcp_put_port(struct sock *sk)
local_bh_enable();
}
-#ifdef CONFIG_TCP_TW_RECYCLE
-/*
- Very stupid pseudo-"algoritm". If the approach will be successful
- (and it will!), we have to make it more reasonable.
- Now it eats lots of CPU, when we are tough on ports.
-
- Apparently, it should be hash table indexed by daddr/dport.
-
- How does it work? We allow to truncate time-wait state, if:
- 1. PAWS works on it.
- 2. timewait bucket did not receive data for timeout:
- - initially timeout := 2*RTO, so that if our ACK to first
- transmitted peer's FIN is lost, we will see first retransmit.
- - if we receive anything, the timout is increased exponentially
- to follow normal TCP backoff pattern.
- It is important that minimal RTO (HZ/5) > minimal timestamp
- step (1ms).
- 3. When creating new socket, we inherit sequence number
- and ts_recent of time-wait bucket, increasinf them a bit.
-
- These two conditions guarantee, that data will not be corrupted
- both by retransmitted and by delayed segments. They do not guarantee
- that peer will leave LAST-ACK/CLOSING state gracefully, it will be
- reset sometimes, namely, when more than two our ACKs to its FINs are lost.
- This reset is harmless and even good.
+/* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP.
+ * Look, when several writers sleep and a reader wakes them up, all but one
+ * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines (a wake up on each
+ * exclusive lock release). It should really be ifdefed.
*/
-int tcp_v4_tw_recycle(struct sock *sk, u32 daddr, u16 dport)
-{
- static int tw_rover;
-
- struct tcp_tw_bucket *tw;
- struct tcp_bind_hashbucket *head;
- struct tcp_bind_bucket *tb;
-
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- unsigned long now = jiffies;
- int i, rover;
-
- rover = tw_rover;
-
- local_bh_disable();
- for (i=0; i<tcp_bhash_size; i++, rover++) {
- rover &= (tcp_bhash_size-1);
- head = &tcp_bhash[rover];
-
- spin_lock(&head->lock);
- for (tb = head->chain; tb; tb = tb->next) {
- tw = (struct tcp_tw_bucket*)tb->owners;
-
- if (tw->state != TCP_TIME_WAIT ||
- tw->dport != dport ||
- tw->daddr != daddr ||
- tw->rcv_saddr != sk->rcv_saddr ||
- tb->port < low ||
- tb->port >= high ||
- !TCP_INET_FAMILY(tw->family) ||
- tw->ts_recent_stamp == 0 ||
- (long)(now - tw->ttd) <= 0)
- continue;
- tw_rover = rover;
- goto hit;
- }
- spin_unlock(&head->lock);
- }
- local_bh_enable();
- tw_rover = rover;
- return -EAGAIN;
-
-hit:
- sk->num = tw->num;
- if ((sk->bind_next = tb->owners) != NULL)
- tb->owners->bind_pprev = &sk->bind_next;
- tb->owners = sk;
- sk->bind_pprev = &tb->owners;
- sk->prev = (struct sock *) tb;
- spin_unlock_bh(&head->lock);
- return 0;
-}
-#endif
-
-
void tcp_listen_wlock(void)
{
write_lock(&tcp_lhash_lock);
@@ -409,9 +326,9 @@ void tcp_listen_wlock(void)
if (atomic_read(&tcp_lhash_users)) {
DECLARE_WAITQUEUE(wait, current);
- add_wait_queue(&tcp_lhash_wait, &wait);
+ add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE);
if (atomic_read(&tcp_lhash_users) == 0)
break;
write_unlock_bh(&tcp_lhash_lock);
@@ -445,6 +362,8 @@ static __inline__ void __tcp_v4_hash(struct sock *sk)
sk->pprev = skp;
sock_prot_inc_use(sk->prot);
write_unlock(lock);
+ if (sk->state == TCP_LISTEN)
+ wake_up(&tcp_lhash_wait);
}
static void tcp_v4_hash(struct sock *sk)
@@ -478,6 +397,8 @@ void tcp_unhash(struct sock *sk)
sock_prot_dec_use(sk->prot);
}
write_unlock_bh(lock);
+ if (sk->state == TCP_LISTEN)
+ wake_up(&tcp_lhash_wait);
}
/* Don't inline this cruft. Here are some nice properties to
@@ -546,8 +467,9 @@ sherry_cache:
*
* Local BH must be disabled here.
*/
-static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
- u32 daddr, u16 hnum, int dif)
+
+static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
+ u32 daddr, u16 hnum, int dif)
{
struct tcp_ehash_bucket *head;
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
@@ -572,7 +494,7 @@ static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
goto hit;
read_unlock(&head->lock);
- return tcp_v4_lookup_listener(daddr, hnum, dif);
+ return NULL;
hit:
sock_hold(sk);
@@ -580,6 +502,19 @@ hit:
return sk;
}
+static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
+ u32 daddr, u16 hnum, int dif)
+{
+ struct sock *sk;
+
+ sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
+
+ if (sk)
+ return sk;
+
+ return tcp_v4_lookup_listener(daddr, hnum, dif);
+}
+
__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
struct sock *sk;
@@ -609,21 +544,16 @@ static int tcp_v4_check_established(struct sock *sk)
int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
struct tcp_ehash_bucket *head = &tcp_ehash[hash];
struct sock *sk2, **skp;
-#ifdef CONFIG_TCP_TW_RECYCLE
struct tcp_tw_bucket *tw;
-#endif
write_lock_bh(&head->lock);
/* Check TIME-WAIT sockets first. */
for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
skp = &sk2->next) {
-#ifdef CONFIG_TCP_TW_RECYCLE
tw = (struct tcp_tw_bucket*)sk2;
-#endif
if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
-#ifdef CONFIG_TCP_TW_RECYCLE
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* With PAWS, it is safe from the viewpoint
@@ -631,12 +561,17 @@ static int tcp_v4_check_established(struct sock *sk)
is safe provided sequence spaces do not
overlap i.e. at data rates <= 80Mbit/sec.
- Actually, the idea is close to VJ's (rfc1332)
- one, only timestamp cache is held not per host,
+ Actually, the idea is close to VJ's one,
+ only timestamp cache is held not per host,
but per port pair and TW bucket is used
as state holder.
+
+ If the TW bucket has already been destroyed we
+ fall back to VJ's scheme and use the initial
+ timestamp retrieved from the peer table.
*/
- if (sysctl_tcp_tw_recycle && tw->ts_recent_stamp) {
+ if (tw->substate == TCP_TIME_WAIT &&
+ sysctl_tcp_tw_recycle && tw->ts_recent_stamp) {
if ((tp->write_seq = tw->snd_nxt + 2) == 0)
tp->write_seq = 1;
tp->ts_recent = tw->ts_recent;
@@ -645,13 +580,10 @@ static int tcp_v4_check_established(struct sock *sk)
skp = &head->chain;
goto unique;
} else
-#endif
- goto not_unique;
+ goto not_unique;
}
}
-#ifdef CONFIG_TCP_TW_RECYCLE
tw = NULL;
-#endif
/* And established part... */
for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
@@ -659,9 +591,7 @@ static int tcp_v4_check_established(struct sock *sk)
goto not_unique;
}
-#ifdef CONFIG_TCP_TW_RECYCLE
unique:
-#endif
BUG_TRAP(sk->pprev==NULL);
if ((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
@@ -671,17 +601,17 @@ unique:
sock_prot_inc_use(sk->prot);
write_unlock_bh(&head->lock);
-#ifdef CONFIG_TCP_TW_RECYCLE
if (tw) {
/* Silly. Should hash-dance instead... */
local_bh_disable();
tcp_tw_deschedule(tw);
tcp_timewait_kill(tw);
+ NET_INC_STATS_BH(TimeWaitRecycled);
local_bh_enable();
tcp_tw_put(tw);
}
-#endif
+
return 0;
not_unique:
@@ -727,9 +657,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
int tmp;
int err;
- if (sk->state != TCP_CLOSE)
- return(-EISCONN);
-
if (addr_len < sizeof(struct sockaddr_in))
return(-EINVAL);
@@ -759,8 +686,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
daddr = rt->rt_dst;
err = -ENOBUFS;
- buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
- 0, GFP_KERNEL);
+ buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL);
if (buff == NULL)
goto failure;
@@ -769,27 +695,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->saddr = rt->rt_src;
sk->rcv_saddr = sk->saddr;
- if (!sk->num) {
- if (sk->prot->get_port(sk, 0)
-#ifdef CONFIG_TCP_TW_RECYCLE
- && (!sysctl_tcp_tw_recycle ||
- tcp_v4_tw_recycle(sk, daddr, usin->sin_port))
-#endif
- ) {
- kfree_skb(buff);
- err = -EAGAIN;
- goto failure;
- }
- sk->sport = htons(sk->num);
- }
-#ifdef CONFIG_TCP_TW_RECYCLE
- else if (tp->ts_recent_stamp && sk->daddr != daddr) {
+ if (tp->ts_recent_stamp && sk->daddr != daddr) {
/* Reset inherited state */
tp->ts_recent = 0;
tp->ts_recent_stamp = 0;
tp->write_seq = 0;
}
-#endif
+
+ if (sysctl_tcp_tw_recycle &&
+ !tp->ts_recent_stamp &&
+ rt->rt_dst == daddr) {
+ struct inet_peer *peer = rt_get_peer(rt);
+
+ /* VJ's idea. We save the last timestamp seen from
+ * the destination in the peer table when entering state TIME-WAIT
+ * and initialize ts_recent from it when trying a new connection.
+ */
+
+ if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
+ tp->ts_recent_stamp = peer->tcp_ts_stamp;
+ tp->ts_recent = peer->tcp_ts;
+ }
+ }
sk->dport = usin->sin_port;
sk->daddr = daddr;
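The block above is the "VJ idea" the comment describes: when a connection enters TIME-WAIT the last timestamp seen from the peer is stored in the per-destination peer table, and a later connect() to the same address seeds ts_recent from that entry if it is still fresh. A small stand-alone sketch of the seeding step; the 60-second freshness limit and the structure names are assumptions for illustration (the patch only references TCP_PAWS_MSL by name).

#include <stdio.h>
#include <time.h>

#define PAWS_MSL 60   /* seconds a cached timestamp stays trustworthy (illustrative) */

struct peer_entry {
    unsigned int addr;
    unsigned int tcp_ts;        /* last timestamp value seen from this peer */
    long         tcp_ts_stamp;  /* wall-clock seconds when it was recorded */
};

struct conn_ts {
    unsigned int ts_recent;
    long         ts_recent_stamp;
};

/* Seed a new connection's ts_recent from the peer cache, if fresh enough. */
static void seed_ts_recent(struct conn_ts *c, const struct peer_entry *p, long now)
{
    if (p && p->tcp_ts_stamp && p->tcp_ts_stamp + PAWS_MSL >= now) {
        c->ts_recent       = p->tcp_ts;
        c->ts_recent_stamp = p->tcp_ts_stamp;
    }
}

int main(void)
{
    long now = time(NULL);
    struct peer_entry p = { .addr = 0x0a000001, .tcp_ts = 123456, .tcp_ts_stamp = now - 10 };
    struct conn_ts c = { 0, 0 };

    seed_ts_recent(&c, &p, now);
    printf("ts_recent=%u stamp=%ld\n", c.ts_recent, c.ts_recent_stamp);
    return 0;
}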
@@ -814,85 +741,62 @@ failure:
return err;
}
-static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
- int retval = -EINVAL;
-
- lock_sock(sk);
-
- /* Do sanity checking for sendmsg/sendto/send. */
- if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
- goto out;
- if (msg->msg_name) {
- struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
-
- if (msg->msg_namelen < sizeof(*addr))
- goto out;
- if (addr->sin_family && addr->sin_family != AF_INET)
- goto out;
- retval = -ENOTCONN;
- if(sk->state == TCP_CLOSE)
- goto out;
- retval = -EISCONN;
- if (addr->sin_port != sk->dport)
- goto out;
- if (addr->sin_addr.s_addr != sk->daddr)
- goto out;
- }
- retval = tcp_do_sendmsg(sk, msg);
-
-out:
- release_sock(sk);
- return retval;
+ return ((struct rtable*)skb->dst)->rt_iif;
}
+static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
+{
+ unsigned h = raddr ^ rport;
+ h ^= h>>16;
+ h ^= h>>8;
+ return h&(TCP_SYNQ_HSIZE-1);
+}
-/*
- * Do a linear search in the socket open_request list.
- * This should be replaced with a global hash table.
- */
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
struct iphdr *iph,
struct tcphdr *th,
- struct open_request **prevp)
+ struct open_request ***prevp)
{
- struct open_request *req, *prev;
- __u16 rport = th->source;
-
- /* assumption: the socket is not in use.
- * as we checked the user count on tcp_rcv and we're
- * running from a soft interrupt.
- */
- prev = (struct open_request *) (&tp->syn_wait_queue);
- for (req = prev->dl_next; req; req = req->dl_next) {
- if (req->af.v4_req.rmt_addr == iph->saddr &&
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ struct open_request *req, **prev;
+ __u16 rport = th->source;
+ __u32 raddr = iph->saddr;
+
+ for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
+ (req = *prev) != NULL;
+ prev = &req->dl_next) {
+ if (req->rmt_port == rport &&
+ req->af.v4_req.rmt_addr == raddr &&
req->af.v4_req.loc_addr == iph->daddr &&
- req->rmt_port == rport &&
TCP_INET_FAMILY(req->class->family)) {
- if (req->sk) {
- /* Weird case: connection was established
- and then killed by RST before user accepted
- it. This connection is dead, but we cannot
- kill openreq to avoid blocking in accept().
-
- accept() will collect this garbage,
- but such reqs must be ignored, when talking
- to network.
- */
- bh_lock_sock(req->sk);
- BUG_TRAP(req->sk->lock.users==0);
- if (req->sk->state == TCP_CLOSE) {
- bh_unlock_sock(req->sk);
- prev = req;
- continue;
- }
- }
+ BUG_TRAP(req->sk == NULL);
*prevp = prev;
return req;
}
- prev = req;
}
- return NULL;
+
+ return NULL;
+}
+
+static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
+
+ req->expires = jiffies + TCP_TIMEOUT_INIT;
+ req->retrans = 0;
+ req->sk = NULL;
+ req->index = h;
+ req->dl_next = lopt->syn_table[h];
+
+ write_lock(&tp->syn_wait_lock);
+ lopt->syn_table[h] = req;
+ write_unlock(&tp->syn_wait_lock);
+
+ tcp_synq_added(sk);
}
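tcp_v4_synq_hash() above folds the remote address and port into a small power-of-two table index with two xor-shift steps. The same computation in stand-alone form; the table size of 512 is an assumption, since TCP_SYNQ_HSIZE itself is not shown in this patch.

#include <stdio.h>

#define SYNQ_HSIZE 512   /* assumed power of two; the patch relies on this property */

static unsigned synq_hash(unsigned int raddr, unsigned short rport)
{
    unsigned h = raddr ^ rport;
    h ^= h >> 16;            /* mix the high address bits down */
    h ^= h >> 8;
    return h & (SYNQ_HSIZE - 1);
}

int main(void)
{
    /* Two different remote endpoints usually land in different buckets. */
    printf("%u %u\n",
           synq_hash(0xc0a80001u, 40000),
           synq_hash(0xc0a80002u, 40001));
    return 0;
}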
@@ -984,7 +888,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
th = (struct tcphdr*)(dp+(iph->ihl<<2));
- sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
+ sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
if (sk == NULL) {
ICMP_INC_STATS_BH(IcmpInErrors);
return;
@@ -1001,6 +905,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
if (sk->lock.users != 0)
NET_INC_STATS_BH(LockDroppedIcmps);
+ if (sk->state == TCP_CLOSE)
+ goto out;
+
tp = &sk->tp_pinfo.af_tcp;
seq = ntohl(th->seq);
if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
@@ -1010,14 +917,11 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
switch (type) {
case ICMP_SOURCE_QUENCH:
-#ifndef OLD_SOURCE_QUENCH /* This is deprecated */
- if (sk->lock.users == 0) {
- tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
- tp->snd_cwnd = tp->snd_ssthresh;
- tp->snd_cwnd_cnt = 0;
- tp->high_seq = tp->snd_nxt;
- }
-#endif
+ /* This is deprecated, but if someone generated it,
+ * we have no reason to ignore it.
+ */
+ if (sk->lock.users == 0)
+ tcp_enter_cong_avoid(tp);
goto out;
case ICMP_PARAMETERPROB:
err = EPROTO;
@@ -1042,7 +946,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
}
switch (sk->state) {
- struct open_request *req, *prev;
+ struct open_request *req, **prev;
case TCP_LISTEN:
if (sk->lock.users != 0)
goto out;
@@ -1060,47 +964,25 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
if (!req)
goto out;
- if (req->sk) {
- struct sock *nsk = req->sk;
-
- /*
- * Already in ESTABLISHED and a big socket is created,
- * set error code there.
- * The error will _not_ be reported in the accept(),
- * but only with the next operation on the socket after
- * accept.
- */
- sock_hold(nsk);
- bh_unlock_sock(sk);
- sock_put(sk);
- sk = nsk;
-
- BUG_TRAP(sk->lock.users == 0);
- tp = &sk->tp_pinfo.af_tcp;
- if (!between(seq, tp->snd_una, tp->snd_nxt)) {
- NET_INC_STATS(OutOfWindowIcmps);
- goto out;
- }
- } else {
- if (seq != req->snt_isn) {
- NET_INC_STATS(OutOfWindowIcmps);
- goto out;
- }
+ /* ICMPs are not backlogged, hence we cannot get
+ an established socket here.
+ */
+ BUG_TRAP(req->sk == NULL);
- /*
- * Still in SYN_RECV, just remove it silently.
- * There is no good way to pass the error to the newly
- * created socket, and POSIX does not want network
- * errors returned from accept().
- */
- tp->syn_backlog--;
- tcp_synq_unlink(tp, req, prev);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- req->class->destructor(req);
- tcp_openreq_free(req);
+ if (seq != req->snt_isn) {
+ NET_INC_STATS_BH(OutOfWindowIcmps);
goto out;
}
- break;
+
+ /*
+ * Still in SYN_RECV, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ tcp_synq_drop(sk, req, prev);
+ goto out;
+
case TCP_SYN_SENT:
case TCP_SYN_RECV: /* Cannot happen.
It can f.e. if SYNs crossed.
@@ -1110,10 +992,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
if (sk->lock.users == 0) {
TCP_INC_STATS_BH(TcpAttemptFails);
sk->err = err;
- /* Wake people up to see the error (see connect in sock.c) */
+
sk->error_report(sk);
- tcp_set_state(sk, TCP_CLOSE);
tcp_done(sk);
} else {
sk->err_soft = err;
@@ -1270,28 +1151,23 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
- tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 0, tw->ts_recent);
+ tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
+ tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
tcp_tw_put(tw);
}
static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
- tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent);
+ tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
+ req->ts_recent);
}
-/*
- * Send a SYN-ACK after having received an ACK.
- * This still operates on a open_request only, not on a big
- * socket.
- */
-static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
+static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
{
struct rtable *rt;
struct ip_options *opt;
- struct sk_buff * skb;
- /* First, grab a route. */
opt = req->af.v4_req.opt;
if(ip_route_output(&rt, ((opt && opt->srr) ?
opt->faddr :
@@ -1300,15 +1176,33 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
sk->bound_dev_if)) {
IP_INC_STATS_BH(IpOutNoRoutes);
- return;
+ return NULL;
}
- if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+ if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
ip_rt_put(rt);
IP_INC_STATS_BH(IpOutNoRoutes);
- return;
+ return NULL;
}
+ return &rt->u.dst;
+}
+
+/*
+ * Send a SYN-ACK after having received an ACK.
+ * This still operates on a open_request only, not on a big
+ * socket.
+ */
+static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
+ struct dst_entry *dst)
+{
+ int err = -1;
+ struct sk_buff * skb;
- skb = tcp_make_synack(sk, &rt->u.dst, req);
+ /* First, grab a route. */
+ if (dst == NULL &&
+ (dst = tcp_v4_route_req(sk, req)) == NULL)
+ goto out;
+
+ skb = tcp_make_synack(sk, dst, req);
if (skb) {
struct tcphdr *th = skb->h.th;
@@ -1317,10 +1211,15 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
csum_partial((char *)th, skb->len, skb->csum));
- ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
- req->af.v4_req.rmt_addr, req->af.v4_req.opt);
+ err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
+ req->af.v4_req.rmt_addr, req->af.v4_req.opt);
+ if (err == NET_XMIT_CN)
+ err = 0;
}
- ip_rt_put(rt);
+
+out:
+ dst_release(dst);
+ return err;
}
/*
@@ -1328,7 +1227,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
*/
static void tcp_v4_or_free(struct open_request *req)
{
- if(!req->sk && req->af.v4_req.opt)
+ if (req->af.v4_req.opt)
kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
}
@@ -1372,8 +1271,14 @@ tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
* It would be better to replace it with a global counter for all sockets
* but then some measure against one socket starving all other sockets
* would be needed.
+ *
+ * It was 128 by default. Experiments with real servers show that
+ * it is absolutely not enough even at 100 conn/sec. 256 cures most
+ * of the problems. This value is adjusted to 128 for very small machines
+ * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * Increasing it further requires changing the hash table size.
*/
-int sysctl_max_syn_backlog = 128;
+int sysctl_max_syn_backlog = 256;
struct or_calltable or_ipv4 = {
PF_INET,
@@ -1383,9 +1288,6 @@ struct or_calltable or_ipv4 = {
tcp_v4_send_reset
};
-#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
-#define BACKLOGMAX(sk) sysctl_max_syn_backlog
-
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct tcp_opt tp;
@@ -1394,6 +1296,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
__u32 saddr = skb->nh.iph->saddr;
__u32 daddr = skb->nh.iph->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
+ struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
@@ -1405,84 +1308,108 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
(RTCF_BROADCAST|RTCF_MULTICAST))
goto drop;
- /* XXX: Check against a global syn pool counter. */
- if (BACKLOG(sk) > BACKLOGMAX(sk)) {
+ /* TW buckets are converted to open requests without
+ * limitation; they conserve resources and the peer is
+ * evidently a real one.
+ */
+ if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies && !isn) {
- syn_flood_warning(skb);
+ if (sysctl_tcp_syncookies) {
want_cookie = 1;
} else
#endif
goto drop;
- } else {
- if (isn == 0)
- isn = tcp_v4_init_sequence(sk, skb);
- BACKLOG(sk)++;
}
- req = tcp_openreq_alloc();
- if (req == NULL) {
- goto dropbacklog;
- }
+ /* Accept backlog is full. If we have already queued enough
+ * warm entries in the syn queue, drop the request. That is better than
+ * clogging the syn queue with openreqs whose timeout increases
+ * exponentially.
+ */
+ if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+ goto drop;
- req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req = tcp_openreq_alloc();
+ if (req == NULL)
+ goto drop;
- req->rcv_isn = TCP_SKB_CB(skb)->seq;
tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
-
tp.mss_clamp = 536;
tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
tcp_parse_options(NULL, th, &tp, want_cookie);
- req->mss = tp.mss_clamp;
- req->ts_recent = tp.saw_tstamp ? tp.rcv_tsval : 0;
- req->tstamp_ok = tp.tstamp_ok;
- req->sack_ok = tp.sack_ok;
- req->snd_wscale = tp.snd_wscale;
- req->wscale_ok = tp.wscale_ok;
- req->rmt_port = th->source;
+ tcp_openreq_init(req, &tp, skb);
+
req->af.v4_req.loc_addr = daddr;
req->af.v4_req.rmt_addr = saddr;
+ req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
+ req->class = &or_ipv4;
- /* Note that we ignore the isn passed from the TIME_WAIT
- * state here. That's the price we pay for cookies.
- *
- * RED-PEN. The price is high... Then we cannot kill TIME-WAIT
- * and should reject connection attempt, duplicates with random
- * sequence number can corrupt data. Right?
- * I disabled sending cookie to request matching to a timewait
- * bucket.
- */
- if (want_cookie)
+ if (want_cookie) {
+#ifdef CONFIG_SYN_COOKIES
+ syn_flood_warning(skb);
+#endif
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+ } else if (isn == 0) {
+ struct inet_peer *peer = NULL;
- req->snt_isn = isn;
-
- req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
+ /* VJ's idea. We save the last timestamp seen
+ * from the destination in the peer table when entering
+ * state TIME-WAIT, and check against it before
+ * accepting a new connection request.
+ *
+ * If "isn" is not zero, this request hit an alive
+ * timewait bucket, so all the necessary checks
+ * were made in the function processing the timewait state.
+ */
+ if (tp.saw_tstamp &&
+ sysctl_tcp_tw_recycle &&
+ (dst = tcp_v4_route_req(sk, req)) != NULL &&
+ (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
+ peer->v4daddr == saddr) {
+ if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
+ (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
+ NETDEBUG(printk(KERN_DEBUG "TW_REC: reject openreq %u/%u %08x/%u\n", peer->tcp_ts, req->ts_recent, saddr, ntohs(skb->h.th->source)));
+ NET_INC_STATS_BH(PAWSPassiveRejected);
+ dst_release(dst);
+ goto drop_and_free;
+ }
+ }
+ /* Kill the following clause, if you dislike this way. */
+ else if (!sysctl_tcp_syncookies &&
+ (sysctl_max_syn_backlog - tcp_synq_len(sk)
+ < (sysctl_max_syn_backlog>>2)) &&
+ (!peer || !peer->tcp_ts_stamp) &&
+ (!dst || !dst->rtt)) {
+ /* Without syncookies the last quarter of the
+ * backlog is filled only with destinations proven to be alive.
+ * It means that we continue to communicate with
+ * destinations already remembered
+ * at the moment of the synflood.
+ */
+ NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: drop open request from %08x/%u\n", saddr, ntohs(skb->h.th->source)));
+ TCP_INC_STATS_BH(TcpAttemptFails);
+ dst_release(dst);
+ goto drop_and_free;
+ }
- req->class = &or_ipv4;
- req->retrans = 0;
- req->sk = NULL;
+ isn = tcp_v4_init_sequence(sk, skb);
+ }
+ req->snt_isn = isn;
- tcp_v4_send_synack(sk, req);
+ if (tcp_v4_send_synack(sk, req, dst))
+ goto drop_and_free;
if (want_cookie) {
- if (req->af.v4_req.opt)
- kfree(req->af.v4_req.opt);
- tcp_v4_or_free(req);
tcp_openreq_free(req);
} else {
- req->expires = jiffies + TCP_TIMEOUT_INIT;
- tcp_inc_slow_timer(TCP_SLT_SYNACK);
- tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
+ tcp_v4_synq_add(sk, req);
}
-
return 0;
-dropbacklog:
- if (!want_cookie)
- BACKLOG(sk)--;
+drop_and_free:
+ tcp_openreq_free(req);
drop:
TCP_INC_STATS_BH(TcpAttemptFails);
return 0;
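Two of the new admission checks in tcp_v4_conn_request() are worth restating: a request is dropped when the accept queue is full and young entries already sit in the SYN queue, and, without syncookies, the last quarter of the SYN backlog is reserved for destinations that have already proven themselves (a cached timestamp or a measured rtt). A compact sketch of that second policy, with made-up field names:

#include <stdio.h>

struct synq_view {
    int max_backlog;    /* stands in for sysctl_max_syn_backlog */
    int queued;         /* current number of pending open requests */
    int syncookies;     /* syncookies enabled? */
    int peer_proven;    /* cached timestamp or rtt known for this destination */
};

/* Return 1 if a new, unproven destination should be dropped. */
static int reserve_last_quarter(const struct synq_view *q)
{
    if (q->syncookies || q->peer_proven)
        return 0;
    /* fewer than a quarter of the slots left: admit only proven peers */
    return (q->max_backlog - q->queued) < (q->max_backlog >> 2);
}

int main(void)
{
    struct synq_view q = { .max_backlog = 256, .queued = 200 };
    printf("drop: %d\n", reserve_last_quarter(&q));   /* 1: only 56 of 256 slots remain */
    return 0;
}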
@@ -1497,29 +1424,20 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct open_request *req,
struct dst_entry *dst)
{
- struct ip_options *opt = req->af.v4_req.opt;
struct tcp_opt *newtp;
struct sock *newsk;
- if (sk->ack_backlog > sk->max_ack_backlog)
- goto exit; /* head drop */
- if (dst == NULL) {
- struct rtable *rt;
-
- if (ip_route_output(&rt,
- opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
- req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos|RTO_CONN, 0))
- return NULL;
- dst = &rt->u.dst;
- }
+ if (tcp_acceptq_is_full(sk))
+ goto exit_overflow;
+
+ if (dst == NULL &&
+ (dst = tcp_v4_route_req(sk, req)) == NULL)
+ goto exit;
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
- sk->tp_pinfo.af_tcp.syn_backlog--;
- sk->ack_backlog++;
-
newsk->dst_cache = dst;
newtp = &(newsk->tp_pinfo.af_tcp);
@@ -1527,7 +1445,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->saddr = req->af.v4_req.loc_addr;
newsk->rcv_saddr = req->af.v4_req.loc_addr;
newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
- newsk->protinfo.af_inet.mc_index = ((struct rtable*)skb->dst)->rt_iif;
+ req->af.v4_req.opt = NULL;
+ newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
newtp->ext_header_len = 0;
if (newsk->protinfo.af_inet.opt)
@@ -1535,28 +1454,26 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
tcp_sync_mss(newsk, dst->pmtu);
tcp_initialize_rcv_mss(newsk);
+ newtp->advmss = dst->advmss;
- if (newsk->rcvbuf < (3 * (dst->advmss+40+MAX_HEADER+15)))
- newsk->rcvbuf = min ((3 * (dst->advmss+40+MAX_HEADER+15)), sysctl_rmem_max);
- if (newsk->sndbuf < (3 * (newtp->mss_clamp+40+MAX_HEADER+15)))
- newsk->sndbuf = min ((3 * (newtp->mss_clamp+40+MAX_HEADER+15)), sysctl_wmem_max);
+ tcp_init_buffer_space(newsk);
- bh_lock_sock(newsk);
-
__tcp_v4_hash(newsk);
__tcp_inherit_port(sk, newsk);
return newsk;
+exit_overflow:
+ NET_INC_STATS_BH(ListenOverflows);
exit:
+ NET_INC_STATS_BH(ListenDrops);
dst_release(dst);
return NULL;
}
-
static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
{
- struct open_request *req, *prev;
+ struct open_request *req, **prev;
struct tcphdr *th = skb->h.th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -1565,6 +1482,25 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
if (req)
return tcp_check_req(sk, skb, req, prev);
+ if (tp->accept_queue) {
+ struct sock *nsk;
+
+ nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
+ th->source,
+ skb->nh.iph->daddr,
+ ntohs(th->dest),
+ tcp_v4_iif(skb));
+
+ if (nsk) {
+ if (nsk->state != TCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ tcp_tw_put((struct tcp_tw_bucket*)sk);
+ return NULL;
+ }
+ }
+
#ifdef CONFIG_SYN_COOKIES
if (!th->rst && (th->syn || th->ack))
sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
@@ -1572,27 +1508,26 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
return sk;
}
-static int tcp_csum_verify(struct sk_buff *skb)
+static int tcp_v4_checksum_init(struct sk_buff *skb)
{
- switch (skb->ip_summed) {
- case CHECKSUM_NONE:
- skb->csum = csum_partial((char *)skb->h.th, skb->len, 0);
- case CHECKSUM_HW:
- if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
- NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
- "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
- "len=%d/%d\n",
- NIPQUAD(skb->nh.iph->saddr),
- ntohs(skb->h.th->source),
- NIPQUAD(skb->nh.iph->daddr),
- ntohs(skb->h.th->dest),
- skb->len,
- ntohs(skb->nh.iph->tot_len)));
- return 1;
+ if (skb->ip_summed == CHECKSUM_HW) {
+ if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
+ skb->nh.iph->daddr,skb->csum)) {
+ NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+ return -1;
}
skb->ip_summed = CHECKSUM_UNNECESSARY;
- default:
- /* CHECKSUM_UNNECESSARY */
+ } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if (skb->len <= 68) {
+ if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
+ skb->nh.iph->daddr,
+ csum_partial((char *)skb->h.th, skb->len, 0)))
+ return -1;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else {
+ skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
+ skb->nh.iph->daddr,0);
+ }
}
return 0;
}
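tcp_v4_checksum_init() above verifies hardware checksums immediately, fully checksums very short segments (68 bytes or less), and otherwise only primes skb->csum with the pseudo-header sum so the rest can be folded into the later copy to user space. All of these variants rest on the standard 16-bit ones'-complement Internet checksum, shown here in stand-alone form with illustrative names:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Standard Internet checksum: 16-bit ones' complement of the ones' complement sum. */
static uint16_t inet_csum(const void *data, size_t len, uint32_t initial)
{
    const uint8_t *p = data;
    uint32_t sum = initial;

    while (len > 1) {
        sum += (uint32_t)p[0] << 8 | p[1];
        p += 2;
        len -= 2;
    }
    if (len)                       /* odd trailing byte, padded with zero */
        sum += (uint32_t)p[0] << 8;
    while (sum >> 16)              /* fold carries back into the low 16 bits */
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;
}

int main(void)
{
    const uint8_t seg[] = { 0x00, 0x50, 0x1f, 0x90, 0x12, 0x34 };
    printf("checksum: 0x%04x\n", inet_csum(seg, sizeof(seg), 0));
    return 0;
}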
@@ -1614,66 +1549,35 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
goto discard;
#endif /* CONFIG_FILTER */
- /*
- * This doesn't check if the socket has enough room for the packet.
- * Either process the packet _without_ queueing it and then free it,
- * or do the check later.
- */
- skb_set_owner_r(skb, sk);
+ IP_INC_STATS_BH(IpInDelivers);
if (sk->state == TCP_ESTABLISHED) { /* Fast path */
- /* Ready to move deeper ... */
- if (tcp_csum_verify(skb))
- goto csum_err;
+ TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
goto reset;
+ TCP_CHECK_TIMER(sk);
return 0;
- }
+ }
- if (tcp_csum_verify(skb))
+ if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->state == TCP_LISTEN) {
- struct sock *nsk;
-
- nsk = tcp_v4_hnd_req(sk, skb);
+ struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
- /*
- * Queue it on the new socket if the new socket is active,
- * otherwise we just shortcircuit this and continue with
- * the new socket..
- */
if (nsk != sk) {
- int ret;
- int state = nsk->state;
-
- skb_orphan(skb);
-
- BUG_TRAP(nsk->lock.users == 0);
- skb_set_owner_r(skb, nsk);
- ret = tcp_rcv_state_process(nsk, skb, skb->h.th, skb->len);
-
- /* Wakeup parent, send SIGIO, if this packet changed
- socket state from SYN-RECV.
-
- It still looks ugly, however it is much better
- than miracleous double wakeup in syn_recv_sock()
- and tcp_rcv_state_process().
- */
- if (state == TCP_SYN_RECV && nsk->state != state)
- sk->data_ready(sk, 0);
-
- bh_unlock_sock(nsk);
- if (ret)
+ if (tcp_child_process(sk, nsk, skb))
goto reset;
return 0;
}
}
-
+
+ TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
goto reset;
+ TCP_CHECK_TIMER(sk);
return 0;
reset:
@@ -1716,6 +1620,9 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
if (len < sizeof(struct tcphdr))
goto bad_packet;
+ if (tcp_v4_checksum_init(skb) < 0)
+ goto bad_packet;
+
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
len - th->doff*4);
@@ -1724,7 +1631,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
skb->used = 0;
sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
- skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex);
+ skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
if (!sk)
goto no_tcp_socket;
@@ -1738,9 +1645,10 @@ process:
bh_lock_sock(sk);
ret = 0;
- if (!sk->lock.users)
- ret = tcp_v4_do_rcv(sk, skb);
- else
+ if (!sk->lock.users) {
+ if (!tcp_prequeue(sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ } else
sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
@@ -1749,7 +1657,7 @@ process:
return ret;
no_tcp_socket:
- if (tcp_csum_verify(skb)) {
+ if (tcp_checksum_complete(skb)) {
bad_packet:
TCP_INC_STATS_BH(TcpInErrs);
} else {
@@ -1766,7 +1674,7 @@ discard_and_relse:
goto discard_it;
do_time_wait:
- if (tcp_csum_verify(skb)) {
+ if (tcp_checksum_complete(skb)) {
TCP_INC_STATS_BH(TcpInErrs);
goto discard_and_relse;
}
@@ -1776,7 +1684,7 @@ do_time_wait:
{
struct sock *sk2;
- sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex);
+ sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
if (sk2 != NULL) {
tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
tcp_timewait_kill((struct tcp_tw_bucket *)sk);
@@ -1796,36 +1704,39 @@ do_time_wait:
goto discard_it;
}
+/* With per-bucket locks this operation is not atomic, so
+ * this version is no worse than the old one.
+ */
static void __tcp_v4_rehash(struct sock *sk)
{
- struct tcp_ehash_bucket *oldhead = &tcp_ehash[sk->hashent];
- struct tcp_ehash_bucket *head = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
- struct sock **skp = &head->chain;
-
- write_lock_bh(&oldhead->lock);
- if(sk->pprev) {
- if(sk->next)
- sk->next->pprev = sk->pprev;
- *sk->pprev = sk->next;
- sk->pprev = NULL;
- }
- write_unlock(&oldhead->lock);
- write_lock(&head->lock);
- if((sk->next = *skp) != NULL)
- (*skp)->pprev = &sk->next;
- *skp = sk;
- sk->pprev = skp;
- write_unlock_bh(&head->lock);
+ sk->prot->unhash(sk);
+ sk->prot->hash(sk);
}
int tcp_v4_rebuild_header(struct sock *sk)
{
- struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
+ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
__u32 new_saddr;
int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
- if(rt == NULL)
- return 0;
+ if (rt == NULL) {
+ int err;
+
+ u32 daddr = sk->daddr;
+
+ if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
+ daddr = sk->protinfo.af_inet.opt->faddr;
+
+ err = ip_route_output(&rt, daddr, sk->saddr,
+ RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
+ sk->bound_dev_if);
+ if (err) {
+ sk->err_soft=-err;
+ sk->error_report(sk);
+ return -1;
+ }
+ __sk_dst_set(sk, &rt->u.dst);
+ }
/* Force route checking if want_rewrite.
* The idea is good, the implementation is disgusting.
@@ -1855,16 +1766,6 @@ int tcp_v4_rebuild_header(struct sock *sk)
dst_release(&new_rt->u.dst);
}
}
- if (rt->u.dst.obsolete) {
- int err;
- err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
- if (err) {
- sk->err_soft=-err;
- sk->error_report(sk);
- return -1;
- }
- __sk_dst_set(sk, &rt->u.dst);
- }
return 0;
@@ -1877,7 +1778,7 @@ do_rewrite:
"saddr=%08X rcv_saddr=%08X\n",
ntohl(sk->saddr),
ntohl(sk->rcv_saddr));
- return 0;
+ return -1;
}
if (new_saddr != sk->saddr) {
@@ -1895,7 +1796,7 @@ do_rewrite:
* XXX really change the sockets identity after
* XXX it has entered the hashes. -DaveM
*
- * Besides that, it does not check for connetion
+ * Besides that, it does not check for connection
* uniqueness. Wait for troubles.
*/
__tcp_v4_rehash(sk);
@@ -1913,6 +1814,63 @@ static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
sin->sin_port = sk->dport;
}
+/* VJ's idea. Save the last timestamp seen from this destination
+ * and hold it at least for the normal timewait interval, to use it
+ * for duplicate segment detection in subsequent connections, before
+ * they enter synchronized state.
+ */
+
+int tcp_v4_remember_stamp(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
+ struct inet_peer *peer = NULL;
+ int release_it = 0;
+
+ if (rt == NULL || rt->rt_dst != sk->daddr) {
+ peer = inet_getpeer(sk->daddr, 1);
+ release_it = 1;
+ } else {
+ if (rt->peer == NULL)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+ }
+
+ if (peer) {
+ if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
+ (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
+ peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
+ peer->tcp_ts_stamp = tp->ts_recent_stamp;
+ peer->tcp_ts = tp->ts_recent;
+ }
+ if (release_it)
+ inet_putpeer(peer);
+ return 1;
+ }
+
+ return 0;
+}
+
+int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+{
+ struct inet_peer *peer = NULL;
+
+ peer = inet_getpeer(tw->daddr, 1);
+
+ if (peer) {
+ if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
+ (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
+ peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
+ peer->tcp_ts_stamp = tw->ts_recent_stamp;
+ peer->tcp_ts = tw->ts_recent;
+ }
+ inet_putpeer(peer);
+ return 1;
+ }
+
+ return 0;
+}
+
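Both tcp_v4_remember_stamp() and tcp_v4_tw_remember_stamp() apply the same rule: the peer's cached timestamp is advanced only if it is not newer than the one we just saw, or if our record is old enough (TCP_PAWS_MSL) that it may safely be replaced. A standalone sketch of that update rule, with a hypothetical peer_stamp structure standing in for inet_peer, could look like this:

#include <stdint.h>
#include <time.h>

#define PAWS_MSL 60  /* seconds a cached stamp is considered authoritative */

struct peer_stamp {
        uint32_t tcp_ts;        /* last timestamp value seen from this peer */
        time_t   tcp_ts_stamp;  /* wall-clock time when tcp_ts was recorded */
};

/* Update the per-destination timestamp cache from a closing connection. */
static void remember_stamp(struct peer_stamp *peer,
                           uint32_t ts_recent, time_t ts_recent_stamp,
                           time_t now)
{
        if ((int32_t)(peer->tcp_ts - ts_recent) <= 0 ||
            (peer->tcp_ts_stamp + PAWS_MSL < now &&
             peer->tcp_ts_stamp <= ts_recent_stamp)) {
                peer->tcp_ts_stamp = ts_recent_stamp;
                peer->tcp_ts = ts_recent;
        }
}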
struct tcp_func ipv4_specific = {
ip_queue_xmit,
tcp_v4_send_check,
@@ -1920,6 +1878,7 @@ struct tcp_func ipv4_specific = {
tcp_v4_conn_request,
tcp_v4_syn_recv_sock,
tcp_v4_hash_connecting,
+ tcp_v4_remember_stamp,
sizeof(struct iphdr),
ip_setsockopt,
@@ -1937,6 +1896,7 @@ static int tcp_v4_init_sock(struct sock *sk)
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
+ tcp_prequeue_init(tp);
tp->rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
@@ -1951,19 +1911,14 @@ static int tcp_v4_init_sock(struct sock *sk)
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
- tp->snd_cwnd_cnt = 0;
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = 536;
sk->state = TCP_CLOSE;
- sk->max_ack_backlog = SOMAXCONN;
sk->write_space = tcp_write_space;
- /* Init SYN queue. */
- tcp_synq_init(tp);
-
sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
return 0;
@@ -1981,9 +1936,10 @@ static int tcp_v4_destroy_sock(struct sock *sk)
/* Cleans up our, hopefully empty, out_of_order_queue. */
__skb_queue_purge(&tp->out_of_order_queue);
- /* Clean up a referenced TCP bind bucket, this only happens if a
- * port is allocated for a socket, but it never fully connects.
- */
+ /* Clean prequeue, it must be empty really */
+ __skb_queue_purge(&tp->ucopy.prequeue);
+
+ /* Clean up a referenced TCP bind bucket. */
if(sk->prev != NULL)
tcp_put_port(sk);
@@ -1993,17 +1949,19 @@ static int tcp_v4_destroy_sock(struct sock *sk)
/* Proc filesystem TCP sock list dumping. */
static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i)
{
- sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
+ int ttd = req->expires - jiffies;
+
+ sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
i,
- (long unsigned int)req->af.v4_req.loc_addr,
+ req->af.v4_req.loc_addr,
ntohs(sk->sport),
- (long unsigned int)req->af.v4_req.rmt_addr,
+ req->af.v4_req.rmt_addr,
ntohs(req->rmt_port),
TCP_SYN_RECV,
0,0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
- (unsigned long)(req->expires - jiffies),
+ ttd,
req->retrans,
sk->socket ? sk->socket->inode->i_uid : 0,
0, /* non standard timer */
@@ -2017,7 +1975,7 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
{
unsigned int dest, src;
__u16 destp, srcp;
- int timer_active, timer_active1, timer_active2;
+ int timer_active;
unsigned long timer_expires;
struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
@@ -2025,15 +1983,16 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
src = sp->rcv_saddr;
destp = ntohs(sp->dport);
srcp = ntohs(sp->sport);
- timer_active1 = tp->retransmit_timer.prev != NULL;
- timer_active2 = sp->timer.prev != NULL;
timer_active = 0;
timer_expires = (unsigned) -1;
- if (timer_active1 && tp->retransmit_timer.expires < timer_expires) {
+ if (tp->retransmit_timer.prev != NULL && tp->retransmit_timer.expires < timer_expires) {
timer_active = 1;
timer_expires = tp->retransmit_timer.expires;
+ } else if (tp->probe_timer.prev != NULL && tp->probe_timer.expires < timer_expires) {
+ timer_active = 4;
+ timer_expires = tp->probe_timer.expires;
}
- if (timer_active2 && sp->timer.expires < timer_expires) {
+ if (sp->timer.prev != NULL && sp->timer.expires < timer_expires) {
timer_active = 2;
timer_expires = sp->timer.expires;
}
@@ -2041,38 +2000,37 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
timer_expires = jiffies;
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p %u %u %u %u",
i, src, srcp, dest, destp, sp->state,
tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
timer_active, timer_expires-jiffies,
tp->retransmits,
sp->socket ? sp->socket->inode->i_uid : 0,
- 0,
+ tp->probes_out,
sp->socket ? sp->socket->inode->i_ino : 0,
- atomic_read(&sp->refcnt), sp);
+ atomic_read(&sp->refcnt), sp,
+ tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong
+ );
}
static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
unsigned int dest, src;
__u16 destp, srcp;
- int slot_dist;
+ int ttd = tw->ttd - jiffies;
+
+ if (ttd < 0)
+ ttd = 0;
dest = tw->daddr;
src = tw->rcv_saddr;
destp = ntohs(tw->dport);
srcp = ntohs(tw->sport);
- slot_dist = tw->death_slot;
- if(slot_dist > tcp_tw_death_row_slot)
- slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot;
- else
- slot_dist = tcp_tw_death_row_slot - slot_dist;
-
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
- i, src, srcp, dest, destp, TCP_TIME_WAIT, 0, 0,
- 3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0,
+ i, src, srcp, dest, destp, tw->substate, 0, 0,
+ 3, ttd, 0, 0, 0, 0,
atomic_read(&tw->refcnt), tw);
}
@@ -2093,6 +2051,8 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length)
tcp_listen_lock();
for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
struct sock *sk = tcp_listening_hash[i];
+ struct tcp_listen_opt *lopt;
+ int k;
for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
struct open_request *req;
@@ -2112,25 +2072,30 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length)
}
skip_listen:
- lock_sock(sk);
- for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) {
- if (req->sk)
- continue;
- if (!TCP_INET_FAMILY(req->class->family))
- continue;
-
- pos += 128;
- if (pos < offset)
- continue;
- get_openreq(sk, req, tmpbuf, num);
- len += sprintf(buffer+len, "%-127s\n", tmpbuf);
- if(len >= length) {
- tcp_listen_unlock();
- release_sock(sk);
- goto out_no_bh;
+ read_lock_bh(&tp->syn_wait_lock);
+ lopt = tp->listen_opt;
+ if (lopt && lopt->qlen != 0) {
+ for (k=0; k<TCP_SYNQ_HSIZE; k++) {
+ for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
+ if (!TCP_INET_FAMILY(req->class->family))
+ continue;
+
+ pos += 128;
+ if (pos < offset)
+ continue;
+ get_openreq(sk, req, tmpbuf, num);
+ len += sprintf(buffer+len, "%-127s\n", tmpbuf);
+ if(len >= length) {
+ read_unlock_bh(&tp->syn_wait_lock);
+ tcp_listen_unlock();
+ goto out_no_bh;
+ }
+ }
}
}
- release_sock(sk);
+ read_unlock_bh(&tp->syn_wait_lock);
+
+ /* Completed requests are in normal socket hash table */
}
}
tcp_listen_unlock();
@@ -2194,28 +2159,24 @@ struct proto tcp_prot = {
tcp_v4_connect, /* connect */
tcp_disconnect, /* disconnect */
tcp_accept, /* accept */
- NULL, /* retransmit */
- tcp_write_wakeup, /* write_wakeup */
- tcp_read_wakeup, /* read_wakeup */
- tcp_poll, /* poll */
tcp_ioctl, /* ioctl */
tcp_v4_init_sock, /* init */
tcp_v4_destroy_sock, /* destroy */
tcp_shutdown, /* shutdown */
tcp_setsockopt, /* setsockopt */
tcp_getsockopt, /* getsockopt */
- tcp_v4_sendmsg, /* sendmsg */
+ tcp_sendmsg, /* sendmsg */
tcp_recvmsg, /* recvmsg */
NULL, /* bind */
tcp_v4_do_rcv, /* backlog_rcv */
tcp_v4_hash, /* hash */
tcp_unhash, /* unhash */
tcp_v4_get_port, /* get_port */
- 128, /* max_header */
- 0, /* retransmits */
"TCP", /* name */
};
+
+
void __init tcp_v4_init(struct net_proto_family *ops)
{
int err;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3d884dda..d6bc8a205 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.116 2000/01/13 00:19:49 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.119 2000/01/19 04:06:15 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -31,6 +31,7 @@
* during syn/ack processing.
* David S. Miller : Output engine completely rewritten.
* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
+ * Cacophonix Gaul : draft-minshall-nagle-01
*
*/
@@ -38,75 +39,65 @@
#include <linux/smp_lock.h>
-extern int sysctl_tcp_timestamps;
-extern int sysctl_tcp_window_scaling;
-extern int sysctl_tcp_sack;
-
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
-/* Get rid of any delayed acks, we sent one already.. */
-static __inline__ void clear_delayed_acks(struct sock * sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
- tp->delayed_acks = 0;
- if(tcp_in_quickack_mode(tp))
- tcp_exit_quickack_mode(tp);
- tcp_clear_xmit_timer(sk, TIME_DACK);
-}
-
static __inline__ void update_send_head(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
+
tp->send_head = tp->send_head->next;
if (tp->send_head == (struct sk_buff *) &sk->write_queue)
tp->send_head = NULL;
}
/* Calculate mss to advertise in SYN segment.
- RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
-
- 1. It is independent of path mtu.
- 2. Ideally, it is maximal possible segment size i.e. 65535-40.
- 3. For IPv4 it is reasonable to calculate it from maximal MTU of
- attached devices, because some buggy hosts are confused by
- large MSS.
- 4. We do not make 3, we advertise MSS, calculated from first
- hop device mtu, but allow to raise it to ip_rt_min_advmss.
- This may be overriden via information stored in routing table.
- 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
- probably even Jumbo".
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ * attached devices, because some buggy hosts are confused by
+ * large MSS.
+ * 4. We do not do 3; we advertise an MSS calculated from the first
+ * hop device mtu, but allow it to be raised to ip_rt_min_advmss.
+ * This may be overridden via information stored in the routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ * probably even Jumbo".
*/
static __u16 tcp_advertise_mss(struct sock *sk)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct dst_entry *dst = __sk_dst_get(sk);
- int mss;
+ int mss = tp->advmss;
- if (dst) {
+ if (dst && dst->advmss < mss) {
mss = dst->advmss;
- } else {
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ tp->advmss = mss;
+ }
- /* No dst. It is bad. Guess some reasonable value.
- * Actually, this case should not be possible.
- * SANITY.
- */
- BUG_TRAP(dst!=NULL);
+ return (__u16)mss;
+}
- mss = tp->mss_cache;
- mss += (tp->tcp_header_len - sizeof(struct tcphdr)) +
- tp->ext_header_len;
+static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ /* If we had a reply for ato after last received
+ * packet, enter pingpong mode.
+ */
+ if ((u32)(tp->lsndtime - tp->ack.lrcvtime) < tp->ack.ato)
+ tp->ack.pingpong = 1;
- /* Minimal MSS to include full set of of TCP/IP options
- plus 8 bytes of data. It corresponds to mtu 128.
- */
- if (mss < 88)
- mss = 88;
- }
+ tp->lsndtime = tcp_time_stamp;
+}
- return (__u16)mss;
+static __inline__ void tcp_event_ack_sent(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ tp->last_ack_sent = tp->rcv_nxt;
+ tcp_dec_quickack_mode(tp);
+ tp->ack.pending = 0;
+ tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
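tcp_event_data_sent() and tcp_event_ack_sent() split the bookkeeping that the old clear_delayed_acks() lumped together: sending data shortly after receiving data switches the connection into delayed-ACK "pingpong" mode, while sending an ACK clears any pending delayed ACK. A toy model of the pingpong decision, using jiffies-style unsigned time arithmetic and a made-up state structure, might be:

#include <stdint.h>

struct toy_ack_state {
        uint32_t lrcvtime;   /* time of last received data segment (ticks) */
        uint32_t ato;        /* current delayed-ACK timeout estimate (ticks) */
        int      pingpong;   /* 1 = interactive, favour delayed ACKs */
};

/* Called whenever we transmit a data segment at time 'now'.
 * If we replied within one ATO of the last received segment,
 * this looks like an interactive (request/response) session.
 */
static void toy_event_data_sent(struct toy_ack_state *st, uint32_t now)
{
        if ((uint32_t)(now - st->lrcvtime) < st->ato)
                st->pingpong = 1;
}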
/* This routine actually transmits TCP packets queued in by
@@ -120,7 +111,7 @@ static __u16 tcp_advertise_mss(struct sock *sk)
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if(skb != NULL) {
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -128,6 +119,7 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
int tcp_header_size = tp->tcp_header_len;
struct tcphdr *th;
int sysctl_flags;
+ int err;
#define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE 0x2
@@ -190,11 +182,29 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
}
tp->af_specific->send_check(sk, th, skb->len, skb);
- clear_delayed_acks(sk);
- tp->last_ack_sent = tp->rcv_nxt;
+ if (th->ack)
+ tcp_event_ack_sent(sk);
+
+ if (skb->len != tcp_header_size)
+ tcp_event_data_sent(tp, skb);
+
TCP_INC_STATS(TcpOutSegs);
- tp->af_specific->queue_xmit(skb);
+
+ err = tp->af_specific->queue_xmit(skb);
+ if (err <= 0)
+ return err;
+
+ tcp_enter_cong_avoid(tp);
+
+ /* NET_XMIT_CN is special. It does not guarantee
+ * that this packet is lost. It tells us that the device
+ * is about to start dropping packets, or already drops
+ * some packets of the same priority, and invites us
+ * to send less aggressively.
+ */
+ return err == NET_XMIT_CN ? 0 : err;
}
+ return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
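Because tcp_transmit_skb() now propagates the queue_xmit() result, callers have to distinguish hard failures from NET_XMIT_CN, which only signals congestion on the local device. A minimal sketch of that mapping follows; the enum values are hypothetical placeholders mirroring the idea of the NET_XMIT_* codes, not the kernel's definitions:

/* Hypothetical transmit results. */
enum { XMIT_SUCCESS = 0, XMIT_DROP = 1, XMIT_CN = 2 };

/* Treat "congestion notification" as success for accounting purposes:
 * the segment may still be delivered, we are only asked to slow down.
 */
static int map_xmit_result(int err)
{
        if (err <= 0)
                return err;                 /* sent, or a negative hard error */
        return err == XMIT_CN ? 0 : err;    /* CN is soft, everything else is not */
}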
@@ -202,32 +212,33 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
/* This is the main buffer sending routine. We queue the buffer
* and decide whether to queue or transmit now.
+ *
+ * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
+ * otherwise socket can stall.
*/
-void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
+void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* Advance write_seq and place onto the write_queue. */
- tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
+ tp->write_seq = TCP_SKB_CB(skb)->end_seq;
__skb_queue_tail(&sk->write_queue, skb);
- if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
+ if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) {
/* Send it out now. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
- if(!tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- } else {
- /* Queue it, remembering where we must start sending. */
- if (tp->send_head == NULL)
- tp->send_head = skb;
- if (!force_queue && tp->packets_out == 0 && !tp->pending) {
- tp->pending = TIME_PROBE0;
- tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
+ if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) {
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_minshall_update(tp, cur_mss, skb->len);
+ tp->packets_out++;
+ if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ return;
}
}
+ /* Queue it, remembering where we must start sending. */
+ if (tp->send_head == NULL)
+ tp->send_head = skb;
}
/* Function to create two new TCP segments. Shrinks the given segment
@@ -243,13 +254,13 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
/* Get a new skb... force flag on. */
buff = sock_wmalloc(sk,
- (nsize + MAX_HEADER + sk->prot->max_header),
+ (nsize + MAX_TCP_HEADER + 15),
1, GFP_ATOMIC);
if (buff == NULL)
- return -1; /* We'll just try again later. */
+ return -ENOMEM; /* We'll just try again later. */
/* Reserve space for headers. */
- skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(buff, MAX_TCP_HEADER);
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
@@ -276,8 +287,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
TCP_SKB_CB(buff)->sacked = 0;
/* Copy and checksum data tail into the new buffer. */
- buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
- nsize, 0);
+ buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
+ nsize, 0);
/* This takes care of the FIN sequence number too. */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
@@ -288,6 +299,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
/* Looks stupid, but our code really uses when of
* skbs, which it never sent before. --ANK
+ *
+ * NOTE: several days after I added this, Dave repaired
+ * tcp_simple_retransmit() and it should no longer use the ->when
+ * of never-sent skbs. I am not sure, so
+ * this line remains until more careful investigation. --ANK
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
@@ -335,20 +351,19 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu)
if (mss_now > tp->mss_clamp)
mss_now = tp->mss_clamp;
- /* Now subtract TCP options size, not including SACKs */
- mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
-
/* Now subtract optional transport overhead */
mss_now -= tp->ext_header_len;
- /* It we got too small (or even negative) value,
- clamp it by 8 from below. Why 8 ?
- Well, it could be 1 with the same success,
- but if IP accepted segment of length 1,
- it would love 8 even more 8) --ANK (980731)
- */
- if (mss_now < 8)
- mss_now = 8;
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+ if (mss_now < 48)
+ mss_now = 48;
+
+ /* Now subtract TCP options size, not including SACKs */
+ mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+ /* Bound mss with half of window */
+ if (tp->max_window && mss_now > (tp->max_window>>1))
+ mss_now = max((tp->max_window>>1), 1);
/* And store cached results */
tp->pmtu_cookie = pmtu;
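The reordered tcp_sync_mss() computation now floors the value at 48 bytes before subtracting the TCP option overhead, and finally bounds it by half of the largest window the peer has ever advertised. A userspace rendition of the same arithmetic, with all inputs passed explicitly and a pmtu assumed to be at least 68 bytes, might read:

#include <stdint.h>

/* Sketch of the MSS derivation; sizes are in bytes.
 * tcp_opt_len is the per-segment TCP option overhead (SACK excluded).
 */
static unsigned int sketch_sync_mss(unsigned int pmtu, unsigned int mss_clamp,
                                    unsigned int ext_header_len,
                                    unsigned int tcp_opt_len,
                                    unsigned int max_window)
{
        unsigned int mss = pmtu - 20 /* struct iphdr */ - 20 /* struct tcphdr */;

        if (mss > mss_clamp)
                mss = mss_clamp;
        mss -= ext_header_len;          /* optional transport overhead */
        if (mss < 48)                   /* room for a full option set plus 8 data bytes */
                mss = 48;
        mss -= tcp_opt_len;             /* timestamps etc., but not SACK blocks */
        if (max_window && mss > (max_window >> 1)) {
                mss = max_window >> 1;  /* never exceed half the peer's max window */
                if (mss < 1)
                        mss = 1;
        }
        return mss;
}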
@@ -360,27 +375,30 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu)
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
+ *
+ * Returns 1 if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
*/
-void tcp_write_xmit(struct sock *sk)
+int tcp_write_xmit(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
unsigned int mss_now;
- /* Account for SACKS, we may need to fragment due to this.
- * It is just like the real MSS changing on us midstream.
- * We also handle things correctly when the user adds some
- * IP options mid-stream. Silly to do, but cover it.
- */
- mss_now = tcp_current_mss(sk);
-
- /* If we are zapped, the bytes will have to remain here.
- * In time closedown will empty the write queue and all
+ /* If we are closed, the bytes will have to remain here.
+ * In time closedown will finish, we empty the write queue and all
* will be happy.
*/
- if(!sk->zapped) {
+ if(sk->state != TCP_CLOSE) {
struct sk_buff *skb;
int sent_pkts = 0;
+ /* Account for SACKS, we may need to fragment due to this.
+ * It is just like the real MSS changing on us midstream.
+ * We also handle things correctly when the user adds some
+ * IP options mid-stream. Silly to do, but cover it.
+ */
+ mss_now = tcp_current_mss(sk);
+
/* Anything on the transmit queue that fits the window can
* be added providing we are:
*
@@ -388,27 +406,36 @@ void tcp_write_xmit(struct sock *sk)
* b) not exceeding our congestion window.
* c) not retransmitting [Nagle]
*/
- while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
+ while((skb = tp->send_head) &&
+ tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) {
if (skb->len > mss_now) {
if (tcp_fragment(sk, skb, mss_now))
break;
}
- /* Advance the send_head. This one is going out. */
- update_send_head(sk);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+ break;
+ /* Advance the send_head. This one is sent out. */
+ update_send_head(sk);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_minshall_update(tp, mss_now, skb->len);
tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
sent_pkts = 1;
}
/* If we sent anything, make sure the retransmit
* timer is active.
*/
- if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ if (sent_pkts) {
+ if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ return 0;
+ }
+
+ return !tp->packets_out && tp->send_head;
}
+ return 0;
}
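The transmit loop above keeps sending while tcp_snd_test() allows it, i.e. while the segment fits the receiver's window, the congestion window has room, and Nagle does not forbid a small segment. A simplified standalone version of that predicate, ignoring the Minshall refinement and using a made-up state structure, could be:

#include <stdint.h>

struct toy_snd_state {
        uint32_t snd_una;      /* oldest unacknowledged sequence number */
        uint32_t snd_wnd;      /* receiver-advertised window */
        uint32_t cwnd;         /* congestion window, in segments */
        uint32_t packets_out;  /* segments currently in flight */
        int      nonagle;      /* 1 if Nagle is disabled (TCP_NODELAY) */
};

/* May we transmit a segment ending at end_seq, of 'len' payload bytes? */
static int toy_snd_test(const struct toy_snd_state *s,
                        uint32_t end_seq, uint32_t len, uint32_t mss)
{
        /* 1. Receiver window: the segment must end inside the offered window. */
        if ((int32_t)(end_seq - (s->snd_una + s->snd_wnd)) > 0)
                return 0;
        /* 2. Congestion window: do not exceed cwnd segments in flight. */
        if (s->packets_out >= s->cwnd)
                return 0;
        /* 3. Classic Nagle: hold back a sub-MSS segment while data is unacknowledged. */
        if (len < mss && !s->nonagle && s->packets_out)
                return 0;
        return 1;
}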
/* This function returns the amount that we can raise the
@@ -471,7 +498,7 @@ u32 __tcp_select_window(struct sock *sk)
* but may be worse for the performance because of rcv_mss
* fluctuations. --SAW 1998/11/1
*/
- unsigned int mss = tp->rcv_mss;
+ unsigned int mss = tp->ack.rcv_mss;
int free_space;
u32 window;
@@ -481,11 +508,19 @@ u32 __tcp_select_window(struct sock *sk)
free_space = tp->window_clamp;
if (tp->window_clamp < mss)
mss = tp->window_clamp;
-
- if ((free_space < (tcp_full_space(sk) / 2)) &&
+
+ if ((free_space < (min((int)tp->window_clamp, tcp_full_space(sk)) / 2)) &&
(free_space < ((int) (mss/2)))) {
window = 0;
- tp->pred_flags = 0;
+
+ /* THIS IS A _VERY_ GOOD PLACE to play with the window clamp:
+ * if free_space becomes suspiciously low,
+ * verify the ratio rmem_alloc/(rcv_nxt - copied_seq),
+ * and if we predict that when free_space drops below mss,
+ * rmem_alloc will exceed rcvbuf*2, shrink window_clamp.
+ * It would eliminate most of the prune events! Very simple,
+ * it is the next thing to do. --ANK
+ */
} else {
/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
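The modified __tcp_select_window() returns 0 (and, per the comment above, is where the clamp could eventually be shrunk) when free receive space falls below both half of the usable buffer and half an MSS, which is the receiver-side silly-window-avoidance rule. A rough sketch of the decision, free of the kernel's socket bookkeeping, might be:

#include <stdint.h>

/* Receiver-side silly window avoidance: do not advertise a tiny window.
 * free_space and full_space are in bytes; returns the window to offer.
 */
static uint32_t toy_select_window(int free_space, int full_space,
                                  int window_clamp, unsigned int mss)
{
        int usable = full_space < window_clamp ? full_space : window_clamp;

        if (free_space > window_clamp)
                free_space = window_clamp;
        if ((unsigned int)window_clamp < mss)
                mss = window_clamp;

        /* Offer nothing while less than half the buffer and half an MSS is free. */
        if (free_space < usable / 2 && free_space < (int)(mss / 2))
                return 0;

        /* Otherwise advertise the largest multiple of mss that fits. */
        return ((unsigned int)free_space / mss) * mss;
}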
@@ -542,9 +577,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
/* Optimize, actually we could also combine next_skb->csum
* to skb->csum using a single add w/carry operation too.
*/
- skb->csum = csum_partial_copy(next_skb->data,
- skb_put(skb, next_skb_size),
- next_skb_size, skb->csum);
+ skb->csum = csum_partial_copy_nocheck(next_skb->data,
+ skb_put(skb, next_skb_size),
+ next_skb_size, skb->csum);
}
/* Update sequence range on original skb. */
@@ -603,8 +638,10 @@ void tcp_simple_retransmit(struct sock *sk)
if (old_next_skb != skb || skb->len > mss)
resend_skb = 1;
old_next_skb = skb->next;
- if (resend_skb != 0)
- tcp_retransmit_skb(sk, skb);
+ if (resend_skb != 0) {
+ if (tcp_retransmit_skb(sk, skb))
+ break;
+ }
}
}
@@ -629,9 +666,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
unsigned int cur_mss = tcp_current_mss(sk);
+#ifdef TCP_DEBUG
+ /* It was possible this summer, that retransmit timer
+ * raced with its deletion and hit socket with packets_out==0.
+ * I fixed it, but preserved the check in the place,
+ * where the fault occured. --ANK
+ */
+ if (skb == NULL) {
+ printk("tcp_retransmit_skb: bug, skb==NULL, caller=%p\n", NET_CALLER(sk));
+ return -EFAULT;
+ }
+#endif
+
if(skb->len > cur_mss) {
if(tcp_fragment(sk, skb, cur_mss))
- return 1; /* We'll try again later. */
+ return -ENOMEM; /* We'll try again later. */
/* New SKB created, account for it. */
tp->packets_out++;
@@ -646,7 +695,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
tcp_retrans_try_collapse(sk, skb, cur_mss);
if(tp->af_specific->rebuild_header(sk))
- return 1; /* Routing failure or similar. */
+ return -EHOSTUNREACH; /* Routing failure or similar. */
/* Some Solaris stacks overoptimize and ignore the FIN on a
* retransmit when old data is attached. So strip it off
@@ -673,13 +722,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
else
skb = skb_clone(skb, GFP_ATOMIC);
- tcp_transmit_skb(sk, skb);
-
/* Update global TCP statistics and return success. */
- sk->prot->retransmits++;
TCP_INC_STATS(TcpRetransSegs);
- return 0;
+ return tcp_transmit_skb(sk, skb);
}
/* This gets called after a retransmit timeout, and the initially
@@ -774,7 +820,11 @@ void tcp_send_fin(struct sock *sk)
*/
mss_now = tcp_current_mss(sk);
- if((tp->send_head != NULL) && (skb->len < mss_now)) {
+ /* Please, find the seven differences from 2.3.33 and look at
+ * what I broke here. 8) --ANK
+ */
+
+ if(tp->send_head != NULL) {
/* tcp_write_xmit() takes care of the rest. */
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
TCP_SKB_CB(skb)->end_seq++;
@@ -783,31 +833,34 @@ void tcp_send_fin(struct sock *sk)
/* Special case to avoid Nagle bogosity. If this
* segment is the last segment, and it was queued
* due to Nagle/SWS-avoidance, send it out now.
+ *
+ * Hmm... actually it also overrides congestion
+ * avoidance (OK for FIN) and the retransmit phase
+ * (not OK? Added.).
*/
if(tp->send_head == skb &&
- !sk->nonagle &&
- skb->len < (tp->rcv_mss >> 1) &&
- tp->packets_out &&
- !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
- update_send_head(sk);
+ !after(tp->write_seq, tp->snd_una + tp->snd_wnd) &&
+ !tp->retransmits) {
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
- if(!tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) {
+ update_send_head(sk);
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ } else
+ tcp_check_probe_timer(sk, tp);
}
} else {
/* Socket is locked, keep trying until memory is available. */
do {
skb = sock_wmalloc(sk,
- (MAX_HEADER +
- sk->prot->max_header),
+ MAX_TCP_HEADER + 15,
1, GFP_KERNEL);
} while (skb == NULL);
/* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = 0;
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
TCP_SKB_CB(skb)->sacked = 0;
@@ -816,7 +869,8 @@ void tcp_send_fin(struct sock *sk)
/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
TCP_SKB_CB(skb)->seq = tp->write_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
- tcp_send_skb(sk, skb, 0);
+ tcp_send_skb(sk, skb, 0, mss_now);
+ __tcp_push_pending_frames(sk, tp, mss_now);
}
}
@@ -831,19 +885,19 @@ void tcp_send_active_reset(struct sock *sk, int priority)
struct sk_buff *skb;
/* NOTE: No TCP options attached and we never retransmit this. */
- skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority);
+ skb = alloc_skb(MAX_TCP_HEADER + 15, priority);
if (!skb)
return;
/* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = 0;
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->urg_ptr = 0;
/* Send it off. */
- TCP_SKB_CB(skb)->seq = tp->write_seq;
+ TCP_SKB_CB(skb)->seq = tp->snd_nxt;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
tcp_transmit_skb(sk, skb);
@@ -859,13 +913,13 @@ int tcp_send_synack(struct sock *sk)
struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff* skb;
- skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15,
1, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
/* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = 0;
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
TCP_SKB_CB(skb)->sacked = 0;
@@ -877,8 +931,7 @@ int tcp_send_synack(struct sock *sk)
__skb_queue_tail(&sk->write_queue, skb);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
- return 0;
+ return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
/*
@@ -887,16 +940,17 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct open_request *req)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct tcphdr *th;
int tcp_header_size;
struct sk_buff *skb;
- skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
if (skb == NULL)
return NULL;
/* Reserve space for headers. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->dst = dst_clone(dst);
@@ -919,7 +973,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
__u8 rcv_wscale;
/* Set this up on the first call only */
- req->window_clamp = skb->dst->window;
+ req->window_clamp = tp->window_clamp ? : skb->dst->window;
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(tcp_full_space(sk),
dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -951,7 +1005,7 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* Reserve space for headers. */
- skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(buff, MAX_TCP_HEADER + 15);
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -962,12 +1016,16 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
/* If user gave his TCP_MAXSEG, record it to clamp */
if (tp->user_mss)
tp->mss_clamp = tp->user_mss;
+ tp->max_window = 0;
tcp_sync_mss(sk, dst->pmtu);
+ tcp_initialize_rcv_mss(sk);
- tp->window_clamp = dst->window;
+ if (!tp->window_clamp)
+ tp->window_clamp = dst->window;
+ tp->advmss = dst->advmss;
tcp_select_initial_window(tcp_full_space(sk),
- dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
+ tp->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
&tp->rcv_wnd,
&tp->window_clamp,
sysctl_tcp_window_scaling,
@@ -982,10 +1040,12 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
goto err_out;
sk->err = 0;
+ sk->done = 0;
tp->snd_wnd = 0;
tp->snd_wl1 = 0;
tp->snd_wl2 = tp->write_seq;
tp->snd_una = tp->write_seq;
+ tp->snd_sml = tp->write_seq;
tp->rcv_nxt = 0;
tp->rcv_wup = 0;
tp->copied_seq = 0;
@@ -1006,13 +1066,14 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
/* Send it off. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tp->syn_stamp = TCP_SKB_CB(buff)->when;
__skb_queue_tail(&sk->write_queue, buff);
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
TCP_INC_STATS(TcpActiveOpens);
/* Timer for repeating the SYN until an answer. */
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
return 0;
err_out:
@@ -1025,16 +1086,14 @@ err_out:
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
* for details.
*/
-void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
+void tcp_send_delayed_ack(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
unsigned long timeout;
/* Stay within the limit we were given */
- timeout = (tp->ato << 1) >> 1;
- if (timeout > max_timeout)
- timeout = max_timeout;
- timeout += jiffies;
+ timeout = tp->ack.ato;
+ timeout += jiffies + (timeout>>2);
/* Use new timeout only if there wasn't a older one earlier. */
spin_lock_bh(&sk->timer_lock);
@@ -1042,18 +1101,46 @@ void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
sock_hold(sk);
tp->delack_timer.expires = timeout;
} else {
+ /* If delack timer was blocked or is about to expire,
+ * send ACK now.
+ */
+ if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(tp->ack.ato>>2))) {
+ spin_unlock_bh(&sk->timer_lock);
+
+ tcp_send_ack(sk);
+ __sock_put(sk);
+ return;
+ }
+
if (time_before(timeout, tp->delack_timer.expires))
tp->delack_timer.expires = timeout;
}
add_timer(&tp->delack_timer);
spin_unlock_bh(&sk->timer_lock);
+
+#ifdef TCP_FORMAL_WINDOW
+ /* Explanation. The header prediction path does not handle
+ * the case of a zero window. If we send the ACK immediately, pred_flags
+ * are reset when sending the ACK. If rcv_nxt is advanced and the
+ * ack is not sent, then a delayed ack is scheduled.
+ * Hence, this is the best place to check for a zero window.
+ */
+ if (tp->pred_flags) {
+ if (tcp_receive_window(tp) == 0)
+ tp->pred_flags = 0;
+ } else {
+ if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
+ !tp->urg_data)
+ tcp_fast_path_on(tp);
+ }
+#endif
}
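tcp_send_delayed_ack() now derives its timeout directly from the ATO estimate (expiring around jiffies + ato + ato/4) and falls back to an immediate ACK when the delack timer was previously blocked or is already about to fire. A compact model of that scheduling decision is sketched below; the structure, the enum of actions and the zero-means-unarmed convention are all assumptions of the sketch, not kernel interfaces:

#include <stdint.h>

struct toy_delack {
        uint32_t ato;            /* current ACK timeout estimate, in ticks */
        uint32_t timer_expires;  /* absolute expiry of an armed delack timer, or 0 */
        int      blocked;        /* timer fired while the socket was locked */
};

enum delack_action { DELACK_SEND_NOW, DELACK_ARM, DELACK_KEEP };

/* Decide what to do when a delayed ACK is requested at time 'now'. */
static enum delack_action toy_delayed_ack(struct toy_delack *d, uint32_t now)
{
        uint32_t timeout = now + d->ato + (d->ato >> 2);

        if (!d->timer_expires) {               /* no timer pending: arm one */
                d->timer_expires = timeout;
                return DELACK_ARM;
        }
        /* Timer already pending: if it was blocked or is nearly due, ACK now. */
        if (d->blocked ||
            (int32_t)(d->timer_expires - (now + (d->ato >> 2))) <= 0)
                return DELACK_SEND_NOW;

        /* Otherwise keep the earlier of the two deadlines. */
        if ((int32_t)(timeout - d->timer_expires) < 0)
                d->timer_expires = timeout;
        return DELACK_KEEP;
}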
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
/* If we have been reset, we may not send again. */
- if(!sk->zapped) {
+ if(sk->state != TCP_CLOSE) {
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *buff;
@@ -1061,29 +1148,15 @@ void tcp_send_ack(struct sock *sk)
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
- buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
+ buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
if (buff == NULL) {
- /* Force it to send an ack. We don't have to do this
- * (ACK is unreliable) but it's much better use of
- * bandwidth on slow links to send a spare ack than
- * resend packets.
- *
- * This is the one possible way that we can delay an
- * ACK and have tp->ato indicate that we are in
- * quick ack mode, so clear it. It is also the only
- * possible way for ato to be zero, when ACK'ing a
- * SYNACK because we've taken no ATO measurement yet.
- */
- if (tcp_in_quickack_mode(tp))
- tcp_exit_quickack_mode(tp);
- if (!tp->ato)
- tp->ato = tp->rto;
- tcp_send_delayed_ack(sk, HZ/2);
+ tp->ack.pending = 1;
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
return;
}
/* Reserve space for headers and prepare control bits. */
- skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(buff, MAX_TCP_HEADER);
buff->csum = 0;
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(buff)->sacked = 0;
@@ -1099,24 +1172,20 @@ void tcp_send_ack(struct sock *sk)
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
*/
-void tcp_write_wakeup(struct sock *sk)
+int tcp_write_wakeup(struct sock *sk)
{
- /* After a valid reset we can send no more. */
- if (!sk->zapped) {
+ if (sk->state != TCP_CLOSE) {
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
- /* Write data can still be transmitted/retransmitted in the
- * following states. If any other state is encountered, return.
- * [listen/close will never occur here anyway]
+ /* This function is now never called while
+ * we have something not yet ACKed in the queue.
*/
- if ((1 << sk->state) &
- ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
- TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING))
- return;
+ BUG_TRAP(tp->snd_una == tp->snd_nxt);
- if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
- ((skb = tp->send_head) != NULL)) {
+ if (tp->snd_wnd > (tp->snd_nxt-tp->snd_una)
+ && ((skb = tp->send_head) != NULL)) {
+ int err;
unsigned long win_size;
/* We are probing the opening of a window
@@ -1126,24 +1195,26 @@ void tcp_write_wakeup(struct sock *sk)
win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
if (tcp_fragment(sk, skb, win_size))
- return; /* Let a retransmit get it. */
+ return -1;
}
- update_send_head(sk);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
- if (!tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ if (!err) {
+ update_send_head(sk);
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ }
+ return err;
} else {
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
- GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
if (skb == NULL)
- return;
+ return -1;
/* Reserve space for headers and set control bits. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = 0;
TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(skb)->sacked = 0;
@@ -1152,13 +1223,18 @@ void tcp_write_wakeup(struct sock *sk)
/* Use a previous sequence. This should cause the other
* end to send an ack. Don't queue or clone SKB, just
* send it.
+ *
+ * RED-PEN: logically it should be snd_una-1.
+ * snd_nxt-1 will not be acked. snd_una==snd_nxt
+ * in this place however. Right?
*/
- TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
+ TCP_SKB_CB(skb)->seq = tp->snd_una - 1;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, skb);
+ return tcp_transmit_skb(sk, skb);
}
}
+ return -1;
}
/* A window probe timeout has occurred. If window is not closed send
@@ -1167,11 +1243,32 @@ void tcp_write_wakeup(struct sock *sk)
void tcp_send_probe0(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int err;
+
+ err = tcp_write_wakeup(sk);
+
+ if (tp->packets_out || !tp->send_head) {
+ /* Cancel probe timer, if it is not required. */
+ tp->probes_out = 0;
+ tp->backoff = 0;
+ return;
+ }
- tcp_write_wakeup(sk);
- tp->pending = TIME_PROBE0;
- tp->backoff++;
- tp->probes_out++;
- tcp_reset_xmit_timer (sk, TIME_PROBE0,
- min(tp->rto << tp->backoff, 120*HZ));
+ if (err <= 0) {
+ tp->backoff++;
+ tp->probes_out++;
+ tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
+ min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ } else {
+ /* If the packet was not sent due to local congestion,
+ * do not back off and do not remember probes_out.
+ * Let local senders fight for local resources.
+ *
+ * Still use the accumulated backoff, though.
+ */
+ if (!tp->probes_out)
+ tp->probes_out=1;
+ tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
+ min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+ }
}
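The probe0 path above distinguishes a real probe timeout (back off exponentially, capped at TCP_RTO_MAX) from a locally congested transmit (retry soon without inflating the backoff). A small function computing the next probe interval under the same rules is sketched below; the two caps are illustrative constants assuming HZ=100, not the kernel's macros:

/* Illustrative caps, in ticks. */
#define TOY_RTO_MAX                 (120 * 100)
#define TOY_RESOURCE_PROBE_INTERVAL (1 * 100)

/* Return the delay before the next zero-window probe.
 * 'err' is the transmit result: <= 0 means the probe went out (or hard-failed),
 * > 0 means the local device pushed back (congestion).
 */
static unsigned long toy_probe0_interval(int err, unsigned long rto,
                                         unsigned int *backoff,
                                         unsigned int *probes_out)
{
        if (err <= 0) {
                (*backoff)++;
                (*probes_out)++;
                return (rto << *backoff) < TOY_RTO_MAX ?
                       (rto << *backoff) : TOY_RTO_MAX;
        }
        /* Local congestion: keep the accumulated backoff, but retry soon. */
        if (*probes_out == 0)
                *probes_out = 1;
        return (rto << *backoff) < TOY_RESOURCE_PROBE_INTERVAL ?
               (rto << *backoff) : TOY_RESOURCE_PROBE_INTERVAL;
}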
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a38724e42..bff4e872f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_timer.c,v 1.68 1999/09/07 02:31:43 davem Exp $
+ * Version: $Id: tcp_timer.c,v 1.71 2000/01/18 08:24:19 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -23,29 +23,20 @@
#include <net/tcp.h>
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
+int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES;
-
-static void tcp_sltimer_handler(unsigned long);
-static void tcp_syn_recv_timer(unsigned long);
+static void tcp_retransmit_timer(unsigned long);
+static void tcp_delack_timer(unsigned long);
+static void tcp_probe_timer(unsigned long);
+static void tcp_keepalive_timer (unsigned long data);
static void tcp_twkill(unsigned long);
-struct timer_list tcp_slow_timer = {
- NULL, NULL,
- 0, 0,
- tcp_sltimer_handler,
-};
-
-
-struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
- {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */
- {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */
-};
-
const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
/*
@@ -56,17 +47,25 @@ const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
void tcp_init_xmit_timers(struct sock *sk)
{
- init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
- sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
- sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;
-
- init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
- sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
- sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;
-
- init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
- sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
- sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ spin_lock_init(&sk->timer_lock);
+
+ init_timer(&tp->retransmit_timer);
+ tp->retransmit_timer.function=&tcp_retransmit_timer;
+ tp->retransmit_timer.data = (unsigned long) sk;
+
+ init_timer(&tp->delack_timer);
+ tp->delack_timer.function=&tcp_delack_timer;
+ tp->delack_timer.data = (unsigned long) sk;
+
+ init_timer(&tp->probe_timer);
+ tp->probe_timer.function=&tcp_probe_timer;
+ tp->probe_timer.data = (unsigned long) sk;
+
+ init_timer(&sk->timer);
+ sk->timer.function=&tcp_keepalive_timer;
+ sk->timer.data = (unsigned long) sk;
}
/*
@@ -79,7 +78,7 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
spin_lock_bh(&sk->timer_lock);
switch (what) {
- case TIME_RETRANS:
+ case TCP_TIME_RETRANS:
/* When seting the transmit timer the probe timer
* should not be set.
* The delayed ack timer can be set if we are changing the
@@ -89,29 +88,25 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
__sock_put(sk);
if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer))
sock_hold(sk);
- if (when > 120*HZ) {
+ if (when > TCP_RTO_MAX) {
printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk));
- when = 120*HZ;
+ when = TCP_RTO_MAX;
}
mod_timer(&tp->retransmit_timer, jiffies+when);
break;
- case TIME_DACK:
+ case TCP_TIME_DACK:
if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer))
sock_hold(sk);
mod_timer(&tp->delack_timer, jiffies+when);
break;
- case TIME_PROBE0:
+ case TCP_TIME_PROBE0:
if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer))
sock_hold(sk);
mod_timer(&tp->probe_timer, jiffies+when);
break;
- case TIME_WRITE:
- printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
- break;
-
default:
printk(KERN_DEBUG "bug: unknown timer value\n");
};
@@ -127,6 +122,7 @@ void tcp_clear_xmit_timers(struct sock *sk)
__sock_put(sk);
if(tp->delack_timer.prev && del_timer(&tp->delack_timer))
__sock_put(sk);
+ tp->ack.blocked = 0;
if(tp->probe_timer.prev && del_timer(&tp->probe_timer))
__sock_put(sk);
if(sk->timer.prev && del_timer(&sk->timer))
@@ -134,39 +130,33 @@ void tcp_clear_xmit_timers(struct sock *sk)
spin_unlock_bh(&sk->timer_lock);
}
-static void tcp_write_err(struct sock *sk, int force)
+static void tcp_write_err(struct sock *sk)
{
- sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
+ sk->err = sk->err_soft ? : ETIMEDOUT;
sk->error_report(sk);
- tcp_clear_xmit_timers(sk);
-
- /* Do not time wait the socket. It is timed out and, hence,
- * idle for 120*HZ. "force" argument is ignored, delete
- * it eventually.
- */
-
- /* Clean up time. */
- tcp_set_state(sk, TCP_CLOSE);
tcp_done(sk);
}
/* A write timeout has occurred. Process the after effects. */
-static void tcp_write_timeout(struct sock *sk)
+static int tcp_write_timeout(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int retry_until;
- /* Look for a 'soft' timeout. */
- if ((sk->state == TCP_ESTABLISHED &&
- tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
- (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
- /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
- hole detection. :-(
+ if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
+ if (tp->retransmits)
+ dst_negative_advice(&sk->dst_cache);
+ retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ } else {
+ if (tp->retransmits >= sysctl_tcp_retries1) {
+ /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
+ hole detection. :-(
- It is place to make it. It is not made. I do not want
- to make it. It is disguisting. It does not work in any
- case. Let me to cite the same draft, which requires for
- us to implement this:
+ This is the place to do it. It is not done. I do not want
+ to do it. It is disgusting. It does not work in any
+ case. Let me cite the same draft, which requires
+ us to implement this:
"The one security concern raised by this memo is that ICMP black holes
are often caused by over-zealous security administrators who block
@@ -177,57 +167,70 @@ static void tcp_write_timeout(struct sock *sk)
be far nicer to have all of the black holes fixed rather than fixing
all of the TCP implementations."
- Golden words :-).
- */
+ Golden words :-).
+ */
- dst_negative_advice(&sk->dst_cache);
+ dst_negative_advice(&sk->dst_cache);
+ }
+ retry_until = sysctl_tcp_retries2;
+ if (sk->dead)
+ retry_until = sysctl_tcp_orphan_retries;
}
-
- /* Have we tried to SYN too many times (repent repent 8)) */
- if (sk->state == TCP_SYN_SENT &&
- ((!tp->syn_retries && tp->retransmits > sysctl_tcp_syn_retries) ||
- (tp->syn_retries && tp->retransmits > tp->syn_retries))) {
- tcp_write_err(sk, 1);
- /* Don't FIN, we got nothing back */
- } else if (tp->retransmits > sysctl_tcp_retries2) {
+
+ if (tp->retransmits >= retry_until) {
/* Has it gone just too far? */
- tcp_write_err(sk, 0);
+ tcp_write_err(sk);
+ return 1;
}
+ return 0;
}
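tcp_write_timeout() now picks the retry ceiling from the connection state: SYN retransmissions use tcp_syn_retries, established connections tcp_retries2, and orphaned (dead) sockets the new tcp_orphan_retries. A condensed sketch of that selection, with the sysctl values passed in as plain integers and a made-up state enum, might be:

/* TCP states used by the sketch; values are illustrative only. */
enum toy_state { TOY_SYN_SENT, TOY_SYN_RECV, TOY_ESTABLISHED, TOY_OTHER };

/* Return how many retransmissions are allowed before the connection
 * is declared dead.  'orphaned' means the application already closed it.
 */
static int toy_retry_limit(enum toy_state state, int orphaned,
                           int syn_retries, int retries2, int orphan_retries)
{
        if (state == TOY_SYN_SENT || state == TOY_SYN_RECV)
                return syn_retries;
        return orphaned ? orphan_retries : retries2;
}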
-void tcp_delack_timer(unsigned long data)
+static void tcp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later. */
- tcp_reset_xmit_timer(sk, TIME_DACK, HZ/5);
+ tp->ack.blocked = 1;
+ NET_INC_STATS_BH(DelayedACKLocked);
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN);
goto out_unlock;
}
- if(!sk->zapped &&
- sk->tp_pinfo.af_tcp.delayed_acks &&
- sk->state != TCP_CLOSE)
+ if (tp->ack.pending) {
+ /* Delayed ACK missed: inflate ATO, leave pingpong mode */
+ tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX);
+ tp->ack.pingpong = 0;
tcp_send_ack(sk);
+ NET_INC_STATS_BH(DelayedACKs);
+ }
+ TCP_CHECK_TIMER(sk);
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
}
-void tcp_probe_timer(unsigned long data)
+static void tcp_probe_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
- if(sk->zapped)
- goto out;
+ int max_probes;
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later. */
- tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
+ tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5);
+ goto out_unlock;
+ }
+
+ if (sk->state == TCP_CLOSE)
+ goto out_unlock;
+
+ if (tp->packets_out || !tp->send_head) {
+ tp->probes_out = 0;
goto out_unlock;
}
@@ -246,151 +249,251 @@ void tcp_probe_timer(unsigned long data)
* with RFCs, only probe timer combines both retransmission timeout
* and probe timeout in one bottle. --ANK
*/
- if (tp->probes_out > sysctl_tcp_retries2) {
- tcp_write_err(sk, 0);
+ max_probes = sk->dead ? sysctl_tcp_orphan_retries : sysctl_tcp_retries2;
+
+ if (tp->probes_out > max_probes) {
+ tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
+ TCP_CHECK_TIMER(sk);
}
out_unlock:
bh_unlock_sock(sk);
-out:
sock_put(sk);
}
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
-int tcp_tw_death_row_slot = 0;
-static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
- { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
-static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
+static int tcp_tw_death_row_slot = 0;
+int tcp_tw_count = 0;
+static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS];
+static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
+static struct timer_list tcp_tw_timer = { function: tcp_twkill };
static void tcp_twkill(unsigned long data)
{
struct tcp_tw_bucket *tw;
int killed = 0;
- /* The death-row tw chains are only ever touched
- * in BH context so no BH disabling (for now) is needed.
+ /* NOTE: compare this to the previous version, where the lock
+ * was released after detaching the chain. It was racy,
+ * because tw buckets are scheduled in a non-serialized context
+ * in 2.3 (with netfilter), and with softnet this is common, because
+ * soft irqs are not sequenced.
*/
spin_lock(&tw_death_lock);
- tw = tcp_tw_death_row[tcp_tw_death_row_slot];
- tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
- tcp_tw_death_row_slot =
- ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
- spin_unlock(&tw_death_lock);
- while(tw != NULL) {
- struct tcp_tw_bucket *next = tw->next_death;
+ if (tcp_tw_count == 0)
+ goto out;
+
+ while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) {
+ tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death;
+ tw->pprev_death = NULL;
+ spin_unlock(&tw_death_lock);
tcp_timewait_kill(tw);
tcp_tw_put(tw);
+
killed++;
- tw = next;
- }
- if(killed != 0) {
- struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;
- atomic_sub(killed, &slt->count);
+
+ spin_lock(&tw_death_lock);
}
+ tcp_tw_death_row_slot =
+ ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+
+ if ((tcp_tw_count -= killed) != 0)
+ mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
+ net_statistics[smp_processor_id()*2].TimeWaited += killed;
+out:
+ spin_unlock(&tw_death_lock);
}
/* These are always called from BH context. See callers in
* tcp_input.c to verify this.
*/
-void tcp_tw_schedule(struct tcp_tw_bucket *tw)
-{
- struct tcp_tw_bucket **tpp;
- int slot;
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
+{
spin_lock(&tw_death_lock);
- slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
- tpp = &tcp_tw_death_row[slot];
- if((tw->next_death = *tpp) != NULL)
- (*tpp)->pprev_death = &tw->next_death;
- *tpp = tw;
- tw->pprev_death = tpp;
-
- tw->death_slot = slot;
- atomic_inc(&tw->refcnt);
+ if (tw->pprev_death) {
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+ tcp_tw_put(tw);
+ if (--tcp_tw_count == 0)
+ del_timer(&tcp_tw_timer);
+ }
spin_unlock(&tw_death_lock);
-
- tcp_inc_slow_timer(TCP_SLT_TWKILL);
}
-/* Happens rarely if at all, no care about scalability here. */
-void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
+/* Short-time timewait calendar */
+
+static int tcp_twcal_hand = -1;
+static int tcp_twcal_jiffie;
+static void tcp_twcal_tick(unsigned long);
+static struct timer_list tcp_twcal_timer = {NULL, NULL, 0, 0, tcp_twcal_tick,};
+static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
+
+void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
{
struct tcp_tw_bucket **tpp;
int slot;
+ /* timeout := RTO * 3.5
+ *
+ * 3.5 = 1+2+0.5 to wait for two retransmits.
+ *
+ * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+ * our ACK acking that FIN can be lost. If N subsequent retransmitted
+ * FINs (or previous segments) are lost (the probability of such an event
+ * is p^(N+1), where p is the probability of losing a single packet and
+ * the time to detect the loss is about RTO*(2^N - 1) with exponential
+ * backoff). The normal timewait length is calculated so that we
+ * wait at least for one retransmitted FIN (maximal RTO is 120sec).
+ * [ BTW Linux, following BSD, violates this requirement, waiting
+ * only for 60sec; we should wait at least 240 secs.
+ * Well, 240 consumes too much of resources 8)
+ * ]
+ * This interval is not reduced, in order to catch old duplicates and
+ * responses to our wandering segments living for two MSLs.
+ * However, if we use PAWS to detect
+ * old duplicates, we can reduce the interval to the bounds required
+ * by RTO, rather than MSL. So, if the peer understands PAWS, we
+ * kill the tw bucket after 3.5*RTO (it is important that this number
+ * is greater than the TS tick!) and detect old duplicates with the help
+ * of PAWS.
+ */
+ slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
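/* Worked example (editorial sketch; the HZ, TCP_TW_RECYCLE_TICK and
 * TCP_TW_RECYCLE_SLOTS values below are assumptions, not shown in this
 * hunk): with HZ=100 and TCP_TW_RECYCLE_TICK=4, one recycle slot covers
 * 1<<4 = 16 jiffies (160ms).  A peer rto of 20 jiffies (200ms) gives
 * timeo = 3.5*rto = 70 jiffies, so
 *	slot = (70 + 16 - 1) >> 4 = 5,
 * i.e. the bucket dies after at most 5*160ms = 800ms instead of a full
 * TIME-WAIT period, and since 5 < TCP_TW_RECYCLE_SLOTS (32) it is placed
 * on the short-time calendar below.
 */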
+
spin_lock(&tw_death_lock);
+
+ /* Unlink it, if it was scheduled */
if (tw->pprev_death) {
if(tw->next_death)
tw->next_death->pprev_death = tw->pprev_death;
*tw->pprev_death = tw->next_death;
tw->pprev_death = NULL;
+ tcp_tw_count--;
} else
atomic_inc(&tw->refcnt);
- slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
- tpp = &tcp_tw_death_row[slot];
+ if (slot >= TCP_TW_RECYCLE_SLOTS) {
+ /* Schedule to slow timer */
+ if (timeo >= TCP_TIMEWAIT_LEN) {
+ slot = TCP_TWKILL_SLOTS-1;
+ } else {
+ slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
+ if (slot >= TCP_TWKILL_SLOTS)
+ slot = TCP_TWKILL_SLOTS-1;
+ }
+ tw->ttd = jiffies + timeo;
+ slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
+ tpp = &tcp_tw_death_row[slot];
+ } else {
+ tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK);
+
+ if (tcp_twcal_hand < 0) {
+ tcp_twcal_hand = 0;
+ tcp_twcal_jiffie = jiffies;
+ tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
+ add_timer(&tcp_twcal_timer);
+ } else {
+ if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK))
+ mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
+ slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
+ }
+ tpp = &tcp_twcal_row[slot];
+ }
+
if((tw->next_death = *tpp) != NULL)
(*tpp)->pprev_death = &tw->next_death;
*tpp = tw;
tw->pprev_death = tpp;
- tw->death_slot = slot;
+ if (tcp_tw_count++ == 0)
+ mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
spin_unlock(&tw_death_lock);
-
- /* Timer was incremented when we first entered the table. */
}
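/* Editorial sketch of the slow path above (the TCP_TWKILL_SLOTS and
 * TCP_TWKILL_PERIOD values are assumptions, not shown in this hunk):
 * timeouts too long for the recycle calendar go onto a hash wheel of
 * TCP_TWKILL_SLOTS (8) slots, one of which tcp_twkill() drains every
 * TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/8, i.e. 7.5s for a 60s TIME-WAIT).
 * A full-length timeout lands TCP_TWKILL_SLOTS-1 slots ahead of the
 * current hand and is therefore reaped roughly one TIME-WAIT period
 * later, while PAWS-recyclable buckets expire on the much finer calendar.
 */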
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
+void tcp_twcal_tick(unsigned long dummy)
{
+ int n, slot;
+ unsigned long j;
+ unsigned long now = jiffies;
+ int killed = 0;
+ int adv = 0;
+
spin_lock(&tw_death_lock);
- if (tw->pprev_death) {
- if(tw->next_death)
- tw->next_death->pprev_death = tw->pprev_death;
- *tw->pprev_death = tw->next_death;
- tw->pprev_death = NULL;
- tcp_tw_put(tw);
+ if (tcp_twcal_hand < 0)
+ goto out;
+
+ slot = tcp_twcal_hand;
+ j = tcp_twcal_jiffie;
+
+ for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
+ if ((long)(j - now) <= 0) {
+ struct tcp_tw_bucket *tw;
+
+ while((tw = tcp_twcal_row[slot]) != NULL) {
+ tcp_twcal_row[slot] = tw->next_death;
+ tw->pprev_death = NULL;
+
+ tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
+ killed++;
+ }
+ } else {
+ if (!adv) {
+ adv = 1;
+ tcp_twcal_jiffie = j;
+ tcp_twcal_hand = slot;
+ }
+
+ if (tcp_twcal_row[slot] != NULL) {
+ mod_timer(&tcp_twcal_timer, j);
+ goto out;
+ }
+ }
+ j += (1<<TCP_TW_RECYCLE_TICK);
+ slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
}
- spin_unlock(&tw_death_lock);
+ tcp_twcal_hand = -1;
- tcp_dec_slow_timer(TCP_SLT_TWKILL);
+out:
+ if ((tcp_tw_count -= killed) == 0)
+ del_timer(&tcp_tw_timer);
+ net_statistics[smp_processor_id()*2].TimeWaitKilled += killed;
+ spin_unlock(&tw_death_lock);
}
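/* Editorial note (the slot count and tick width are assumptions): with
 * TCP_TW_RECYCLE_SLOTS = 32 slots of 1<<TCP_TW_RECYCLE_TICK jiffies each
 * (16 jiffies at HZ=100), the calendar spans roughly 5 seconds.  Each
 * tick drains every slot whose deadline has passed, then re-arms the
 * timer for the first future slot that still holds buckets; if none
 * does, the calendar goes idle (tcp_twcal_hand = -1).
 */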
/*
* The TCP retransmit timer.
- *
- * 1. An initial rtt timeout on the probe0 should cause what we can
- * of the first write queue buffer to be split and sent.
- * 2. On a 'major timeout' as defined by RFC1122 we do not report
- * ETIMEDOUT if we know an additional 'soft' error caused this.
- * tcp_err saves a 'soft error' for us.
*/
-void tcp_retransmit_timer(unsigned long data)
+static void tcp_retransmit_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- /* We are reset. We will send no more retransmits. */
- if(sk->zapped)
- goto out;
-
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later */
- tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20);
goto out_unlock;
}
- /* Clear delay ack timer. */
- tcp_clear_xmit_timer(sk, TIME_DACK);
+ if (sk->state == TCP_CLOSE || tp->packets_out == 0)
+ goto out_unlock;
+
+ BUG_TRAP(!skb_queue_empty(&sk->write_queue));
+
+ if (tcp_write_timeout(sk))
+ goto out_unlock;
/* RFC 2018, clear all 'sacked' flags in retransmission queue,
* the sender may have dropped out of order frames and we must
@@ -426,11 +529,19 @@ void tcp_retransmit_timer(unsigned long data)
tp->snd_cwnd = 1;
}
- tp->retransmits++;
-
tp->dup_acks = 0;
tp->high_seq = tp->snd_nxt;
- tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+ if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
+ /* Retransmission failed because of local congestion,
+ * do not back off.
+ */
+ if (!tp->retransmits)
+ tp->retransmits=1;
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
+ min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+ TCP_CHECK_TIMER(sk);
+ goto out_unlock;
+ }
/* Increase the timeout each time we retransmit. Note that
* we do not increase the rtt estimate. rto is initialized
@@ -448,132 +559,105 @@ void tcp_retransmit_timer(unsigned long data)
* the 120 second clamps though!
*/
tp->backoff++;
- tp->rto = min(tp->rto << 1, 120*HZ);
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
-
- tcp_write_timeout(sk);
+ tp->retransmits++;
+ tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ TCP_CHECK_TIMER(sk);
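/* Worked example (editorial): with TCP_RTO_MAX corresponding to the
 * 120*HZ clamp of the line this replaces, an rto of 3s backs off through
 * 6, 12, 24, 48, 96 and then stays clamped at 120 seconds on each
 * further timeout, while the local-congestion path above re-arms after
 * min(rto, TCP_RESOURCE_PROBE_INTERVAL) without consuming a backoff step.
 */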
out_unlock:
bh_unlock_sock(sk);
-out:
sock_put(sk);
}
/*
- * Slow timer for SYN-RECV sockets
+ * Timer for listening sockets
*/
-static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now)
-{
- struct open_request *prev, *req;
-
- prev = (struct open_request *) &tp->syn_wait_queue;
- for(req = tp->syn_wait_queue; req; ) {
- struct open_request *next = req->dl_next;
-
- if (!req->sk && (long)(now - req->expires) >= 0) {
- tcp_synq_unlink(tp, req, prev);
- if(req->retrans >= sysctl_tcp_retries1) {
- (*req->class->destructor)(req);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- tp->syn_backlog--;
- tcp_openreq_free(req);
- if (! tp->syn_wait_queue)
- break;
- } else {
- unsigned long timeo;
- struct open_request *rp;
-
- (*req->class->rtx_syn_ack)(sk, req);
- req->retrans++;
- timeo = min((TCP_TIMEOUT_INIT << req->retrans),
- (120 * HZ));
- req->expires = now + timeo;
- rp = prev->dl_next;
- tcp_synq_queue(tp, req);
- if(rp != prev->dl_next)
- prev = prev->dl_next;
- }
- } else
- prev = req;
- req = next;
- }
-}
-
-/* This now scales very nicely. -DaveM */
-static void tcp_syn_recv_timer(unsigned long data)
+static void tcp_synack_timer(struct sock *sk)
{
- struct sock *sk;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
+ int thresh = max_retries;
unsigned long now = jiffies;
- int i;
-
- read_lock(&tcp_lhash_lock);
- for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
- sk = tcp_listening_hash[i];
- while(sk) {
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
- /* TCP_LISTEN is implied. */
- bh_lock_sock(sk);
- if (!sk->lock.users && tp->syn_wait_queue)
- tcp_do_syn_queue(sk, tp, now);
- bh_unlock_sock(sk);
- sk = sk->next;
+ struct open_request **reqp, *req;
+ int i, budget;
+
+ if (lopt == NULL || lopt->qlen == 0)
+ return;
+
+ /* Normally all the openreqs are young and become mature
+ * (i.e. are converted to established sockets) within the first
+ * timeout.  If a SYNACK was not acknowledged for 3 seconds, it
+ * means one of the following: the SYNACK was lost, the ACK was
+ * lost, the rtt is high, or nobody intends to ack (i.e. a synflood).
+ * When the server is somewhat loaded, the queue is populated with
+ * old open requests, reducing the effective size of the queue.
+ * When the server is heavily loaded, the queue size drops to zero
+ * after several minutes of work.  That is not a synflood,
+ * it is normal operation.  The solution is to prune entries that
+ * are too old, overriding the normal timeout, when the situation
+ * becomes dangerous.
+ *
+ * Essentially, we reserve half of the room for young
+ * embryos, and abort old ones without pity if they are about
+ * to clog our table.
+ */
+ if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+ int young = (lopt->qlen_young<<1);
+
+ while (thresh > 2) {
+ if (lopt->qlen < young)
+ break;
+ thresh--;
+ young <<= 1;
}
}
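/* Worked example (editorial; the numbers are hypothetical): with
 * max_qlen_log = 10 (a 1024-entry queue), qlen = 768 and qlen_young = 64,
 * the test 768>>9 != 0 triggers pruning.  Starting from thresh = 5 (the
 * usual synack-retries default) and young = 128:
 *	768 >= 128  ->  thresh = 4, young = 256
 *	768 >= 256  ->  thresh = 3, young = 512
 *	768 >= 512  ->  thresh = 2, young = 1024
 * so stale requests are dropped after only 2 retransmits.  If most of
 * the queue were young (say qlen_young = 600, young = 1200 > 768),
 * thresh would stay at the full 5.
 */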
- read_unlock(&tcp_lhash_lock);
-}
-
-void tcp_sltimer_handler(unsigned long data)
-{
- struct tcp_sl_timer *slt = tcp_slt_array;
- unsigned long next = ~0UL;
- unsigned long now = jiffies;
- int i;
- for (i=0; i < TCP_SLT_MAX; i++, slt++) {
- if (atomic_read(&slt->count)) {
- long trigger;
-
- trigger = slt->period - ((long)(now - slt->last));
-
- if (trigger <= 0) {
- (*slt->handler)((unsigned long) slt);
- slt->last = now;
- trigger = slt->period;
+ if (tp->defer_accept)
+ max_retries = tp->defer_accept;
+
+ budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
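/* Editorial note on the budget (the constant values are assumptions, not
 * shown in this hunk): with TCP_SYNQ_HSIZE = 512, TCP_TIMEOUT_INIT = 3*HZ
 * and TCP_SYNQ_INTERVAL = HZ/5, budget = 2*(512/15) = 68 buckets per run;
 * at five runs per second the whole table is swept about every 1.5
 * seconds, i.e. twice within one initial SYNACK timeout.
 */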
+ i = lopt->clock_hand;
+
+ do {
+ reqp=&lopt->syn_table[i];
+ while ((req = *reqp) != NULL) {
+ if ((long)(now - req->expires) >= 0) {
+ if ((req->retrans < thresh ||
+ (req->acked && req->retrans < max_retries))
+ && !req->class->rtx_syn_ack(sk, req, NULL)) {
+ unsigned long timeo;
+
+ if (req->retrans++ == 0)
+ lopt->qlen_young--;
+ timeo = min((TCP_TIMEOUT_INIT << req->retrans),
+ TCP_RTO_MAX);
+ req->expires = now + timeo;
+ reqp = &req->dl_next;
+ continue;
+ }
+
+ /* Drop this request */
+ write_lock(&tp->syn_wait_lock);
+ *reqp = req->dl_next;
+ write_unlock(&tp->syn_wait_lock);
+ lopt->qlen--;
+ if (req->retrans == 0)
+ lopt->qlen_young--;
+ tcp_openreq_free(req);
}
-
- /* Only reschedule if some events remain. */
- if (atomic_read(&slt->count))
- next = min(next, trigger);
+ reqp = &req->dl_next;
}
- }
- if (next != ~0UL)
- mod_timer(&tcp_slow_timer, (now + next));
-}
-/* __tcp_inc_slow_timer is called when an slow timer is started
- * first time (slt->count was 0). There is race condition between
- * timer creation and deletion and if we do not force adding timer here,
- * we might lose timer. We could avoid it with global spinlock, but
- * it is apparently overkill, so that we restart timer ALWAYS when
- * this function is entered, it guarantees that timer will not lost.
- */
+ i = (i+1)&(TCP_SYNQ_HSIZE-1);
-void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
-{
- unsigned long now = jiffies;
- unsigned long when;
+ } while (--budget > 0);
- slt->last = now;
+ lopt->clock_hand = i;
- when = now + slt->period;
-
- if (tcp_slow_timer.prev &&
- (long)(tcp_slow_timer.expires - when) < 0)
- when = tcp_slow_timer.expires;
-
- mod_timer(&tcp_slow_timer, when);
+ if (lopt->qlen)
+ tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}
void tcp_delete_keepalive_timer (struct sock *sk)
@@ -595,6 +679,9 @@ void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
void tcp_set_keepalive(struct sock *sk, int val)
{
+ if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
+ return;
+
if (val && !sk->keepopen)
tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
else if (!val)
@@ -602,7 +689,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
}
-void tcp_keepalive_timer (unsigned long data)
+static void tcp_keepalive_timer (unsigned long data)
{
struct sock *sk = (struct sock *) data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
@@ -616,14 +703,31 @@ void tcp_keepalive_timer (unsigned long data)
goto out;
}
- if (sk->state == TCP_FIN_WAIT2 && sk->dead)
+ if (sk->state == TCP_LISTEN) {
+ tcp_synack_timer(sk);
+ goto out;
+ }
+
+ if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
+ if (tp->linger2 >= 0) {
+ int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+
+ if (tmo > 0) {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ tcp_send_active_reset(sk, GFP_ATOMIC);
goto death;
+ }
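/* Editorial example (assuming tcp_fin_time() returns the configured
 * FIN-WAIT-2 lifetime and TCP_TIMEWAIT_LEN is the usual 60s): with a
 * 3 minute lifetime, tmo = 120s and the full socket is replaced by a
 * lightweight FIN_WAIT2 timewait bucket for the remaining 2 minutes;
 * with a lifetime of 60s or less, tmo <= 0 and the connection is simply
 * reset and torn down here.
 */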
- if (!sk->keepopen)
+ if (!sk->keepopen || sk->state == TCP_CLOSE)
goto out;
elapsed = keepalive_time_when(tp);
- if (!((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)))
+
+ /* It is alive without keepalive 8) */
+ if (tp->packets_out || tp->send_head)
goto resched;
elapsed = tcp_time_stamp - tp->rcv_tstamp;
@@ -632,28 +736,30 @@ void tcp_keepalive_timer (unsigned long data)
if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
(tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_write_err(sk, 1);
+ tcp_write_err(sk);
goto out;
}
- tp->probes_out++;
- tp->pending = TIME_KEEPOPEN;
- tcp_write_wakeup(sk);
- elapsed = keepalive_intvl_when(tp);
+ if (tcp_write_wakeup(sk) <= 0) {
+ tp->probes_out++;
+ elapsed = keepalive_intvl_when(tp);
+ } else {
+ /* If keepalive was lost due to local congestion,
+ * try harder.
+ */
+ elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+ }
} else {
/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
- if (keepalive_time_when(tp) > elapsed)
- elapsed = keepalive_time_when(tp) - elapsed;
- else
- elapsed = 0;
+ elapsed = keepalive_time_when(tp) - elapsed;
}
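/* Worked example (editorial, assuming the usual defaults of 7200s
 * keepalive time, 75s probe interval and 9 probes): once the connection
 * has been idle for 2 hours a probe is sent and the timer re-armed for
 * 75s; after 9 unanswered probes the connection is reset.  A probe that
 * could not be sent because of local congestion is retried after the
 * short TCP_RESOURCE_PROBE_INTERVAL without being counted.  A connection
 * idle for only 1 hour is simply rescheduled for the remaining 3600s.
 */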
+ TCP_CHECK_TIMER(sk);
+
resched:
tcp_reset_keepalive_timer (sk, elapsed);
goto out;
death:
- tcp_set_state(sk, TCP_CLOSE);
- tcp_clear_xmit_timers(sk);
tcp_done(sk);
out:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9ace56abd..c052d2eb8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -5,7 +5,7 @@
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.77 2000/01/09 02:19:44 davem Exp $
+ * Version: $Id: udp.c,v 1.79 2000/01/18 08:24:20 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -369,30 +369,15 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len)
}
/*
- * Various people wanted BSD UDP semantics. Well they've come
- * back out because they slow down response to stuff like dead
- * or unreachable name servers and they screw term users something
- * chronic. Oh and it violates RFC1122. So basically fix your
- * client code people.
- */
-
- /*
* RFC1122: OK. Passes ICMP errors back to application, as per
- * 4.1.3.3. After the comment above, that should be no surprise.
- */
-
- if (!harderr && !sk->protinfo.af_inet.recverr)
- goto out;
-
- /*
- * 4.x BSD compatibility item. Break RFC1122 to
- * get BSD socket semantics.
+ * 4.1.3.3.
*/
- if(sk->bsdism && sk->state!=TCP_ESTABLISHED && !sk->protinfo.af_inet.recverr)
- goto out;
-
- if (sk->protinfo.af_inet.recverr)
+ if (!sk->protinfo.af_inet.recverr) {
+ if (!harderr || sk->state != TCP_ESTABLISHED)
+ goto out;
+ } else {
ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
+ }
sk->err = err;
sk->error_report(sk);
out:
@@ -629,15 +614,13 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
switch(cmd)
{
- case TIOCOUTQ:
+ case SIOCOUTQ:
{
- unsigned long amount;
-
- amount = sock_wspace(sk);
+ int amount = atomic_read(&sk->wmem_alloc);
return put_user(amount, (int *)arg);
}
- case TIOCINQ:
+ case SIOCINQ:
{
struct sk_buff *skb;
unsigned long amount;
@@ -663,6 +646,17 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return(0);
}
+static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
+{
+ return (unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum));
+}
+
+static __inline__ int udp_checksum_complete(struct sk_buff *skb)
+{
+ return skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ __udp_checksum_complete(skb);
+}
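/* Editorial note: __udp_checksum_complete() folds the pseudo-header sum
 * seeded into skb->csum (by udp_checksum_init() further down) together
 * with the whole datagram and returns nonzero on a mismatch, while
 * udp_checksum_complete() skips that work when the packet was already
 * verified (hardware checksum or a zero UDP checksum).
 */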
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -699,31 +693,21 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
msg->msg_flags |= MSG_TRUNC;
}
-#ifndef CONFIG_UDP_DELAY_CSUM
- err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
- copied);
-#else
if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
- } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
- if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum)))
+ } else if (msg->msg_flags&MSG_TRUNC) {
+ if (__udp_checksum_complete(skb))
goto csum_copy_err;
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
} else {
- unsigned int csum;
+ err = copy_and_csum_toiovec(msg->msg_iov, skb, sizeof(struct udphdr));
- err = 0;
- csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum);
- csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base,
- copied, csum, &err);
if (err)
- goto out_free;
- if ((unsigned short)csum_fold(csum))
goto csum_copy_err;
}
-#endif
+
if (err)
goto out_free;
sk->stamp=skb->stamp;
@@ -744,7 +728,6 @@ out_free:
out:
return err;
-#ifdef CONFIG_UDP_DELAY_CSUM
csum_copy_err:
UDP_INC_STATS_BH(UdpInErrors);
@@ -768,7 +751,6 @@ csum_copy_err:
* as some normal condition.
*/
return (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
-#endif
}
int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
@@ -831,9 +813,9 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
* Charge it to the socket, dropping if the queue is full.
*/
-#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM)
+#if defined(CONFIG_FILTER)
if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) {
+ if (__udp_checksum_complete(skb)) {
UDP_INC_STATS_BH(UdpInErrors);
IP_INC_STATS_BH(IpInDiscards);
ip_statistics[smp_processor_id()*2].IpInDelivers--;
@@ -855,12 +837,6 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
return 0;
}
-
-static inline void udp_deliver(struct sock *sk, struct sk_buff *skb)
-{
- udp_queue_rcv_skb(sk, skb);
-}
-
/*
* Multicasts and broadcasts go to each listener.
*
@@ -889,7 +865,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
skb1 = skb_clone(skb, GFP_ATOMIC);
if(skb1)
- udp_deliver(sk, skb1);
+ udp_queue_rcv_skb(sk, skb1);
sk = sknext;
} while(sknext);
} else
@@ -898,30 +874,25 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
return 0;
}
-static int udp_checksum_verify(struct sk_buff *skb, struct udphdr *uh,
- unsigned short ulen, u32 saddr, u32 daddr,
- int full_csum_deferred)
+/* Initialize the UDP checksum state.  If this returns zero (success),
+ * CHECKSUM_UNNECESSARY means that no further checks are required.
+ * Otherwise, checksum completion requires checksumming the packet body,
+ * including the UDP header, and folding it into skb->csum.
+ */
+static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
+ unsigned short ulen, u32 saddr, u32 daddr)
{
- if (!full_csum_deferred) {
- if (uh->check) {
- if (skb->ip_summed == CHECKSUM_HW &&
- udp_check(uh, ulen, saddr, daddr, skb->csum))
- return -1;
- if (skb->ip_summed == CHECKSUM_NONE &&
- udp_check(uh, ulen, saddr, daddr,
- csum_partial((char *)uh, ulen, 0)))
- return -1;
- }
- } else {
- if (uh->check == 0)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- else if (skb->ip_summed == CHECKSUM_HW) {
- if (udp_check(uh, ulen, saddr, daddr, skb->csum))
- return -1;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
- }
+ if (uh->check == 0) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else if (skb->ip_summed == CHECKSUM_HW) {
+ if (udp_check(uh, ulen, saddr, daddr, skb->csum))
+ return -1;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+ /* Probably we should checksum the UDP header (it should be in the
+ * cache in any case) and the data of tiny packets (< rx copybreak).
+ */
return 0;
}
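/* Editorial example: for a CHECKSUM_NONE packet with a non-zero UDP
 * checksum, the function above leaves skb->csum holding only the
 * pseudo-header sum; __udp_checksum_complete() later adds the datagram
 * itself and folds the result, and a final value of 0 means the packet
 * is intact.  Hardware-checksummed packets are verified against the
 * pseudo-header immediately and marked CHECKSUM_UNNECESSARY, and a zero
 * checksum is trusted as "no checksum".
 */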
@@ -961,50 +932,33 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
}
skb_trim(skb, ulen);
- if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) {
- int defer;
+ if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
+ goto csum_error;
-#ifdef CONFIG_UDP_DELAY_CSUM
- defer = 1;
-#else
- defer = 0;
-#endif
- if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, defer))
- goto csum_error;
+ if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
- }
sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
-
- if (sk == NULL) {
- /* No socket. Drop packet silently, if checksum is wrong */
- if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, 0))
- goto csum_error;
-
- UDP_INC_STATS_BH(UdpNoPorts);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
- /*
- * Hmm. We got an UDP packet to a port to which we
- * don't wanna listen. Ignore it.
- */
- kfree_skb(skb);
- return(0);
- }
- if (udp_checksum_verify(skb, uh, ulen, saddr, daddr,
-#ifdef CONFIG_UDP_DELAY_CSUM
- 1
-#else
- (sk->no_check & UDP_CSUM_NORCV) != 0
-#endif
- )) {
+ if (sk != NULL) {
+ udp_queue_rcv_skb(sk, skb);
sock_put(sk);
- goto csum_error;
+ return 0;
}
- udp_deliver(sk, skb);
- __sock_put(sk);
- return 0;
+ /* No socket. Drop packet silently, if checksum is wrong */
+ if (udp_checksum_complete(skb))
+ goto csum_error;
+
+ UDP_INC_STATS_BH(UdpNoPorts);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ /*
+ * Hmm.  We got a UDP packet to a port on which we
+ * are not listening.  Ignore it.
+ */
+ kfree_skb(skb);
+ return(0);
csum_error:
/*
@@ -1090,10 +1044,6 @@ struct proto udp_prot = {
udp_connect, /* connect */
udp_disconnect, /* disconnect */
NULL, /* accept */
- NULL, /* retransmit */
- NULL, /* write_wakeup */
- NULL, /* read_wakeup */
- datagram_poll, /* poll */
udp_ioctl, /* ioctl */
NULL, /* init */
NULL, /* destroy */
@@ -1107,7 +1057,5 @@ struct proto udp_prot = {
udp_v4_hash, /* hash */
udp_v4_unhash, /* unhash */
udp_v4_get_port, /* good_socknum */
- 128, /* max_header */
- 0, /* retransmits */
"UDP", /* name */
};