author    Ralf Baechle <ralf@linux-mips.org>    1999-10-09 00:00:47 +0000
committer Ralf Baechle <ralf@linux-mips.org>    1999-10-09 00:00:47 +0000
commit    d6434e1042f3b0a6dfe1b1f615af369486f9b1fa (patch)
tree      e2be02f33984c48ec019c654051d27964e42c441 /net/ipv4/tcp_input.c
parent    609d1e803baf519487233b765eb487f9ec227a18 (diff)
Merge with 2.3.19.
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--    net/ipv4/tcp_input.c    1617
1 file changed, 1140 insertions(+), 477 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3080bc201..f0711fccc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.170 1999/07/02 11:26:28 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.173 1999/09/07 02:31:27 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -61,6 +61,7 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
+#include <net/inet_common.h>
#include <linux/ipsec.h>
#ifdef CONFIG_SYSCTL
@@ -70,6 +71,7 @@
#endif
extern int sysctl_tcp_fin_timeout;
+extern int sysctl_tcp_keepalive_time;
/* These are on by default so the code paths get tested.
* For the final 2.2 this may be undone at our discretion. -DaveM
@@ -81,6 +83,7 @@ int sysctl_tcp_sack = 1;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
int sysctl_tcp_rfc1337;
+int sysctl_tcp_tw_recycle;
static int prune_queue(struct sock *sk);
@@ -133,7 +136,7 @@ static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
	/* Tiny-grams with PSH set artificially deflate our
* ato measurement, but with a lower bound.
*/
- if(th->psh && (skb->len < (tp->mss_cache >> 1))) {
+ if(th->psh && (skb->len < (tp->rcv_mss >> 1))) {
/* Preserve the quickack state. */
if((tp->ato & 0x7fffffff) > HZ/50)
tp->ato = ((tp->ato & 0x80000000) |
@@ -187,6 +190,9 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
static __inline__ void tcp_set_rto(struct tcp_opt *tp)
{
tp->rto = (tp->srtt >> 3) + tp->mdev;
+	/* I am not educated enough to understand this magic.
+	 * However, it smells bad. snd_cwnd>31 is a common case.
+ */
tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
}
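
For reference, tp->rto above is the classic Jacobson/Karels timeout: srtt is kept scaled by 8 and mdev by 4, so (srtt >> 3) + mdev is the smoothed RTT plus the deviation term. A minimal standalone sketch of the estimator feeding that formula (hypothetical names, RTT samples in jiffies):

    /* Jacobson/Karels RTT estimator in fixed point: srtt scaled by 8,
     * mdev scaled by 4, mirroring the kernel's tcp_rtt_estimator().
     */
    struct rtt_est { long srtt, mdev, rto; };

    static void rtt_sample(struct rtt_est *e, long m)   /* m: measured RTT */
    {
        if (e->srtt != 0) {
            m -= (e->srtt >> 3);        /* m becomes the error term      */
            e->srtt += m;               /* srtt = 7/8 srtt + 1/8 sample  */
            if (m < 0)
                m = -m;
            m -= (e->mdev >> 2);
            e->mdev += m;               /* mdev = 3/4 mdev + 1/4 |error| */
        } else {
            e->srtt = m << 3;           /* first sample seeds the state  */
            e->mdev = m << 1;           /* so the initial rto is ~3*rtt  */
        }
        e->rto = (e->srtt >> 3) + e->mdev;
    }
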
@@ -209,42 +215,196 @@ static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
tp->rto = HZ/5;
}
-/* WARNING: this must not be called if tp->saw_timestamp was false. */
-extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
- __u32 start_seq, __u32 end_seq)
+/* Save metrics learned by this TCP session.
+   This function is called only when TCP finishes successfully,
+   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
+ */
+static void tcp_update_metrics(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst) {
+ int m;
+
+ if (tp->backoff || !tp->srtt) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time.
+ * Reset our results.
+ */
+ if (!(dst->mxlock&(1<<RTAX_RTT)))
+ dst->rtt = 0;
+ return;
+ }
+
+ dst_confirm(dst);
+
+ m = dst->rtt - tp->srtt;
+
+		/* If the newly calculated rtt is larger than the stored one,
+		 * store the new one. Otherwise, use EWMA. Remember,
+		 * rtt overestimation is always better than underestimation.
+ */
+ if (!(dst->mxlock&(1<<RTAX_RTT))) {
+ if (m <= 0)
+ dst->rtt = tp->srtt;
+ else
+ dst->rtt -= (m>>3);
+ }
+
+ if (!(dst->mxlock&(1<<RTAX_RTTVAR))) {
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev)
+ m = tp->mdev;
+
+ if (m >= dst->rttvar)
+ dst->rttvar = m;
+ else
+ dst->rttvar -= (dst->rttvar - m)>>2;
+ }
+
+ if (tp->snd_ssthresh == 0x7FFFFFFF) {
+ /* Slow start still did not finish. */
+ if (dst->ssthresh &&
+ !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
+ tp->snd_cwnd > dst->ssthresh)
+ dst->ssthresh = tp->snd_cwnd;
+ if (!(dst->mxlock&(1<<RTAX_CWND)) &&
+ tp->snd_cwnd > dst->cwnd)
+ dst->cwnd = tp->snd_cwnd;
+ } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!(dst->mxlock&(1<<RTAX_SSTHRESH)))
+ dst->ssthresh = tp->snd_cwnd;
+ if (!(dst->mxlock&(1<<RTAX_CWND)))
+ dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1;
+ } else {
+		/* Else slow start did not finish, cwnd is nonsense
+		   and ssthresh may be invalid as well.
+ */
+ if (!(dst->mxlock&(1<<RTAX_CWND)))
+ dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1;
+ if (dst->ssthresh &&
+ !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
+ tp->snd_ssthresh > dst->ssthresh)
+ dst->ssthresh = tp->snd_ssthresh;
+ }
+ }
+}
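
The rtt update above is deliberately asymmetric: a larger sample replaces the cached value outright, while a smaller one only pulls it down by 1/8 of the gap, so overestimation wins. The same arithmetic in isolation (hypothetical names):

    /* Asymmetric EWMA used for the cached route rtt: jump upward
     * immediately, decay downward with gain 1/8.
     */
    static unsigned int ewma_rtt(unsigned int cached, unsigned int sample)
    {
        int m = (int)(cached - sample);

        if (m <= 0)
            return sample;              /* sample larger: take it whole  */
        return cached - (m >> 3);       /* sample smaller: shrink by 1/8 */
    }

For example, a cached value of 800 with a new sample of 640 gives m = 160, so the stored value only drops to 800 - 20 = 780.
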
+
+/* Initialize metrics on socket. */
+
+static void tcp_init_metrics(struct sock *sk)
{
- /* From draft-ietf-tcplw-high-performance: the correct
- * test is last_ack_sent <= end_seq.
- * (RFC1323 stated last_ack_sent < end_seq.)
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst == NULL)
+ goto reset;
+
+ dst_confirm(dst);
+
+ if (dst->rtt == 0)
+ goto reset;
+
+ if (!tp->srtt || !tp->saw_tstamp)
+ goto reset;
+
+ /* Initial rtt is determined from SYN,SYN-ACK.
+	 * The segment is small and the rtt may appear much
+	 * smaller than the real one. Use per-dst memory
+ * to make it more realistic.
*
- * HOWEVER: The current check contradicts the draft statements.
- * It has been done for good reasons.
- * The implemented check improves security and eliminates
- * unnecessary RTT overestimation.
- * 1998/06/27 Andrey V. Savochkin <saw@msu.ru>
+	 * A bit of theory. RTT is the time that passes after a "normal" sized
+	 * packet is sent until it is ACKed. In normal circumstances sending
+	 * small packets forces the peer to delay ACKs and the calculation is
+	 * correct too. The algorithm is adaptive and, provided we follow the
+	 * specs, it NEVER underestimates RTT. BUT! If the peer tries some
+	 * clever tricks, sort of "quick acks" for long enough to drive RTT
+	 * to a low value, and then abruptly stops doing so and starts to delay
+	 * ACKs, expect trouble.
*/
- if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) &&
- !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) {
+ if (dst->rtt > tp->srtt)
+ tp->srtt = dst->rtt;
+ if (dst->rttvar > tp->mdev)
+ tp->mdev = dst->rttvar;
+ tcp_set_rto(tp);
+ tcp_bound_rto(tp);
+
+ if (dst->mxlock&(1<<RTAX_CWND))
+ tp->snd_cwnd_clamp = dst->cwnd;
+ if (dst->ssthresh) {
+ tp->snd_ssthresh = dst->ssthresh;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ }
+ return;
+
+
+reset:
+	/* Play it conservative. If timestamps are not
+	 * supported, TCP will fail to recalculate a correct
+	 * rtt if the initial rto is too small. FORGET ALL AND RESET!
+ */
+ if (!tp->saw_tstamp && tp->srtt) {
+ tp->srtt = 0;
+ tp->mdev = TCP_TIMEOUT_INIT;
+ tp->rto = TCP_TIMEOUT_INIT;
+ }
+}
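
Note that the merge above only ever raises the estimator: the cached per-route values may lift srtt and mdev but never lower what the SYN exchange measured, after which the usual rto formula and bounds apply. A sketch of that merge under the same fixed-point scaling (hypothetical names; HZ=100 and the 120-second upper bound are assumptions):

    /* Seed a new connection's estimator from cached route metrics,
     * never lowering what the SYN/SYN-ACK sample produced.
     */
    #define HZ_ASSUMED 100

    static void seed_from_route(long *srtt, long *mdev, long *rto,
                                long cached_rtt,     /* scaled by 8 */
                                long cached_rttvar)  /* scaled by 4 */
    {
        if (cached_rtt > *srtt)
            *srtt = cached_rtt;
        if (cached_rttvar > *mdev)
            *mdev = cached_rttvar;

        *rto = (*srtt >> 3) + *mdev;         /* tcp_set_rto() core      */
        if (*rto < HZ_ASSUMED / 5)           /* tcp_bound_rto() floor   */
            *rto = HZ_ASSUMED / 5;
        if (*rto > 120 * HZ_ASSUMED)         /* assumed 120s ceiling    */
            *rto = 120 * HZ_ASSUMED;
    }
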
+
+#define PAWS_24DAYS (60 * 60 * 24 * 24)
+
+
+/* WARNING: this must not be called if tp->saw_tstamp was false. */
+extern __inline__ void
+tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
+{
+ if (!after(seq, tp->last_ack_sent)) {
/* PAWS bug workaround wrt. ACK frames, the PAWS discard
* extra check below makes sure this can only happen
* for pure ACK frames. -DaveM
+ *
+		 * Not only that; it also occurs for expired timestamps
+		 * and RSTs with a bad timestamp option. --ANK
*/
- if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
+
+ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 ||
+ xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) {
tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = tcp_time_stamp;
+ tp->ts_recent_stamp = xtime.tv_sec;
}
}
}
-#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
-
-extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
+extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
{
- /* ts_recent must be younger than 24 days */
- return (((s32)(tcp_time_stamp - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
- (((s32)(tp->rcv_tsval - tp->ts_recent) < 0) &&
- /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
- (len != (th->doff * 4))));
+ return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
+ xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS
+
+ /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
+
+	   I cannot watch quietly as the whole idea behind PAWS
+	   is destroyed 8)
+
+ The problem is only in reordering duplicate ACKs.
+ Hence, we can check this rare case more carefully.
+
+ 1. Check that it is really duplicate ACK (ack==snd_una)
+ 2. Give it some small "replay" window (~RTO)
+
+	   We do not know the units of foreign ts values, but make the
+	   conservative assumption that they are >=1ms. This solves the
+	   problem noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
+ */
+ && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq ||
+ TCP_SKB_CB(skb)->ack_seq != tp->snd_una ||
+ !skb->h.th->ack ||
+ (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ));
}
@@ -283,13 +443,14 @@ static void tcp_reset(struct sock *sk)
case TCP_CLOSE_WAIT:
sk->err = EPIPE;
break;
+ case TCP_CLOSE:
+ return;
default:
sk->err = ECONNRESET;
};
tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- if (!sk->dead)
- sk->state_change(sk);
+ tcp_clear_xmit_timers(sk);
+ tcp_done(sk);
}
/* This tags the retransmission queue when SACKs arrive. */
@@ -345,7 +506,6 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
{
unsigned char *ptr;
int length=(th->doff*4)-sizeof(struct tcphdr);
- int saw_mss = 0;
ptr = (unsigned char *)(th + 1);
tp->saw_tstamp = 0;
@@ -370,11 +530,11 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
case TCPOPT_MSS:
if(opsize==TCPOLEN_MSS && th->syn) {
u16 in_mss = ntohs(*(__u16 *)ptr);
- if (in_mss == 0)
- in_mss = 536;
- if (tp->mss_clamp > in_mss)
+ if (in_mss) {
+ if (tp->user_mss && tp->user_mss < in_mss)
+ in_mss = tp->user_mss;
tp->mss_clamp = in_mss;
- saw_mss = 1;
+ }
}
break;
case TCPOPT_WINDOW:
@@ -428,8 +588,6 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
length-=opsize;
};
}
- if(th->syn && saw_mss == 0)
- tp->mss_clamp = 536;
}
/* Fast parse options. This hopes to only see timestamps.
@@ -448,8 +606,10 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th,
if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->saw_tstamp = 1;
- tp->rcv_tsval = ntohl(*++ptr);
- tp->rcv_tsecr = ntohl(*++ptr);
+ ++ptr;
+ tp->rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ tp->rcv_tsecr = ntohl(*ptr);
return 1;
}
}
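
The single-word compare works because the RFC1323-recommended layout NOP, NOP, kind=8, length=10 packs into one 32-bit big-endian word, 0x0101080a. A quick check of the constant:

    #include <stdint.h>
    #include <stdio.h>

    #define TCPOPT_NOP        1
    #define TCPOPT_TIMESTAMP  8
    #define TCPOLEN_TIMESTAMP 10

    int main(void)
    {
        /* The aligned timestamp option as one host-order word. */
        uint32_t w = (TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
                   | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP;

        printf("0x%08x\n", (unsigned)w);    /* prints 0x0101080a */
        return 0;
    }
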
@@ -461,6 +621,7 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th,
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged new data. */
static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
{
@@ -498,6 +659,8 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
tp->dup_acks++;
if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
tp->snd_cwnd = (tp->snd_ssthresh + 3);
tp->high_seq = tp->snd_nxt;
if(!tp->fackets_out)
@@ -595,11 +758,12 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd++;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
tp->snd_cwnd_cnt=0;
} else
tp->snd_cwnd_cnt++;
- }
+ }
}
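
The counter above implements the additive-increase rule cwnd += 1/cwnd per ACK without a division: only after a full window's worth of ACKs does cwnd grow by one segment, and now only up to snd_cwnd_clamp. A standalone model:

    /* Congestion avoidance: grow cwnd by one segment per window of
     * ACKs (i.e. += 1/cwnd per ACK), capped by the route's clamp.
     */
    static void cong_avoid_ack(unsigned int *cwnd, unsigned int *cnt,
                               unsigned int clamp)
    {
        if (*cnt >= *cwnd) {
            if (*cwnd < clamp)
                (*cwnd)++;
            *cnt = 0;
        } else {
            (*cnt)++;
        }
    }
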
/* Remove acknowledged frames from the retransmission queue. */
@@ -645,9 +809,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
if(tp->fackets_out)
tp->fackets_out--;
} else {
+ acked |= FLAG_SYN_ACKED;
/* This is pure paranoia. */
tp->retrans_head = NULL;
- }
+ }
tp->packets_out--;
*seq = scb->seq;
*seq_rtt = now - scb->when;
@@ -721,7 +886,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
* See draft-ietf-tcplw-high-performance-00, section 3.3.
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*/
- if (!(flag & FLAG_DATA_ACKED))
+ if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)))
return;
seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
@@ -856,7 +1021,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
* where the network delay has increased suddenly.
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
- if (flag & FLAG_DATA_ACKED) {
+ if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) {
if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
tp->backoff = 0;
tcp_rtt_estimator(tp, seq_rtt);
@@ -910,37 +1075,50 @@ uninteresting_ack:
}
/* New-style handling of TIME_WAIT sockets. */
-extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
-extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
-extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
/* Must be called only from BH context. */
void tcp_timewait_kill(struct tcp_tw_bucket *tw)
{
- struct tcp_bind_bucket *tb = tw->tb;
-
- SOCKHASH_LOCK_WRITE_BH();
-
- /* Disassociate with bind bucket. */
- if(tw->bind_next)
- tw->bind_next->bind_pprev = tw->bind_pprev;
- *(tw->bind_pprev) = tw->bind_next;
- if (tb->owners == NULL) {
- if (tb->next)
- tb->next->pprev = tb->pprev;
- *(tb->pprev) = tb->next;
- kmem_cache_free(tcp_bucket_cachep, tb);
- }
+ struct tcp_ehash_bucket *ehead;
+ struct tcp_bind_hashbucket *bhead;
+ struct tcp_bind_bucket *tb;
/* Unlink from established hashes. */
+ ehead = &tcp_ehash[tw->hashent];
+ write_lock(&ehead->lock);
+ if (!tw->pprev) {
+ write_unlock(&ehead->lock);
+ return;
+ }
if(tw->next)
tw->next->pprev = tw->pprev;
- *tw->pprev = tw->next;
+ *(tw->pprev) = tw->next;
+ tw->pprev = NULL;
+ write_unlock(&ehead->lock);
- SOCKHASH_UNLOCK_WRITE_BH();
+ /* Disassociate with bind bucket. */
+ bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
+ spin_lock(&bhead->lock);
+ if ((tb = tw->tb) != NULL) {
+ if(tw->bind_next)
+ tw->bind_next->bind_pprev = tw->bind_pprev;
+ *(tw->bind_pprev) = tw->bind_next;
+ tw->tb = NULL;
+ if (tb->owners == NULL) {
+ if (tb->next)
+ tb->next->pprev = tb->pprev;
+ *(tb->pprev) = tb->next;
+ kmem_cache_free(tcp_bucket_cachep, tb);
+ }
+ }
+ spin_unlock(&bhead->lock);
- /* Ok, now free it up. */
- kmem_cache_free(tcp_timewait_cachep, tw);
+#ifdef INET_REFCNT_DEBUG
+ if (atomic_read(&tw->refcnt) != 1) {
+ printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
+ }
+#endif
+ tcp_tw_put(tw);
}
/* We come here as a special case from the AF specific TCP input processing,
@@ -949,9 +1127,36 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw)
* entire timeout period. The only special cases are for BSD TIME_WAIT
* reconnects and SYN/RST bits being set in the TCP header.
*/
-int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+
+/*
+ * * The main purpose of TIME-WAIT state is to close the connection gracefully,
+ *   when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN
+ *   (and, probably, a tail of data) and one or more of our ACKs are lost.
+ * * What is the TIME-WAIT timeout? It is associated with maximal packet
+ *   lifetime in the internet, which leads to the wrong conclusion that
+ *   it is set to catch "old duplicate segments" wandering out of their path.
+ *   That is not quite correct. This timeout is calculated so that it exceeds
+ *   the maximal retransmission timeout by enough to allow the loss of one
+ *   (or more) segments sent by the peer and of our ACKs. This time may be
+ *   calculated from the RTO.
+ * * When a TIME-WAIT socket receives RST, it means that the other end
+ *   finally closed and we are allowed to kill TIME-WAIT too.
+ * * The second purpose of TIME-WAIT is catching old duplicate segments.
+ *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
+ *   with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
+ * * If we invented some more clever way to catch duplicates
+ *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
+ *
+ * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
+ * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
+ * from the very beginning.
+ */
+enum tcp_tw_status
+tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
{
+ struct tcp_opt tp;
+ int paws_reject = 0;
+
/* RFC 1122:
* "When a connection is [...] on TIME-WAIT state [...]
* [a TCP] MAY accept a new SYN from the remote TCP to
@@ -965,58 +1170,101 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* (2) returns to TIME-WAIT state if the SYN turns out
* to be an old duplicate".
*/
- if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
- struct sock *sk;
- struct tcp_func *af_specific = tw->af_specific;
- __u32 isn;
- int ret;
-
- isn = tw->rcv_nxt + 128000;
- if(isn == 0)
- isn++;
- tcp_tw_deschedule(tw);
- tcp_timewait_kill(tw);
- sk = af_specific->get_sock(skb, th);
- if(sk == NULL ||
- !ipsec_sk_policy(sk,skb))
- return 0;
- bh_lock_sock(sk);
+ tp.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
+ tcp_parse_options(NULL, th, &tp, 0);
+
+ paws_reject = tp.saw_tstamp &&
+ ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 &&
+ xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS);
+ }
+
+ if (!paws_reject &&
+ (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
+ TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
+		/* An in-window segment may only be a reset or a bare ack. */
- /* Default is to discard the frame. */
- ret = 0;
+ if (th->rst) {
+#ifdef CONFIG_TCP_TW_RECYCLE
+ /* When recycling, always follow rfc1337,
+			 * but mark the bucket as ready for recycling immediately.
+ */
+ if (sysctl_tcp_tw_recycle) {
+ /* May kill it now. */
+ tw->rto = 0;
+ tw->ttd = jiffies;
+ } else
+#endif
+			/* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if(sysctl_tcp_rfc1337 == 0) {
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
+ }
+ } else {
+ tcp_tw_reschedule(tw);
+ }
+
+ if (tp.saw_tstamp) {
+ tw->ts_recent = tp.rcv_tsval;
+ tw->ts_recent_stamp = xtime.tv_sec;
+ }
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+ /* Out of window segment.
- if(sk->lock.users)
- goto out_unlock;
+ All the segments are ACKed immediately.
- skb_set_owner_r(skb, sk);
- af_specific = sk->tp_pinfo.af_tcp.af_specific;
+	   The only exception is a new SYN. We accept it, if it is
+	   not an old duplicate and we are not in danger of being killed
+	   by delayed old duplicates. The RFC check, that it carries a
+	   newer sequence number, works at rates <40Mbit/sec.
+	   However, if PAWS works, it is reliable AND, even more,
+	   we may even relax the silly seq space cutoff.
- if(af_specific->conn_request(sk, skb, isn) < 0)
- ret = 1; /* Toss a reset back. */
- out_unlock:
- bh_unlock_sock(sk);
- return ret;
+	   RED-PEN: we violate the main RFC requirement: if this SYN later
+	   proves to be an old duplicate (i.e. we receive an RST in reply
+	   to our SYN-ACK), we must return the socket to time-wait state.
+	   It is not good, but not fatal yet.
+ */
+
+ if (th->syn && !th->rst && !th->ack && !paws_reject &&
+ (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
+ (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) {
+ u32 isn = tw->snd_nxt + 2;
+ if (isn == 0)
+ isn++;
+ TCP_SKB_CB(skb)->when = isn;
+ return TCP_TW_SYN;
}
- /* Check RST or SYN */
- if(th->rst || th->syn) {
- /* This is TIME_WAIT assasination, in two flavors.
- * Oh well... nobody has a sufficient solution to this
- * protocol bug yet.
+ if(!th->rst) {
+ /* In this case we must reset the TIMEWAIT timer.
+
+		   If it is an ACKless SYN it may be both an old duplicate
+		   and a new good SYN with a random sequence number <rcv_nxt.
+		   Do not reschedule in the latter case.
*/
- if(sysctl_tcp_rfc1337 == 0) {
- tcp_tw_deschedule(tw);
- tcp_timewait_kill(tw);
- }
- if(!th->rst)
- return 1; /* toss a reset back */
- } else {
- /* In this case we must reset the TIMEWAIT timer. */
- if(th->ack)
+ if (paws_reject || th->ack) {
tcp_tw_reschedule(tw);
+#ifdef CONFIG_TCP_TW_RECYCLE
+ tw->rto = min(120*HZ, tw->rto<<1);
+ tw->ttd = jiffies + tw->rto;
+#endif
+ }
+
+ /* Send ACK. Note, we do not put the bucket,
+ * it will be released by caller.
+ */
+ return TCP_TW_ACK;
}
- return 0; /* Discard the frame. */
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
}
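
The acceptance test at the top of this function reduces to a compact predicate: only a bare, exactly in-sequence segment that passed PAWS is treated as the in-window reset-or-ack case; everything else falls through to the SYN and re-ACK handling. Modeled standalone (hypothetical names):

    /* TIME-WAIT fast acceptance: a zero-length segment sitting exactly
     * at rcv_nxt that was not rejected by PAWS.
     */
    static int tw_bare_in_window(unsigned int seq, unsigned int end_seq,
                                 unsigned int rcv_nxt, int paws_reject)
    {
        return !paws_reject && seq == end_seq && seq == rcv_nxt;
    }
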
/* Enter the time wait state. This is always called from BH
@@ -1024,37 +1272,54 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* relevant info into it from the SK, and mess with hash chains
* and list linkage.
*/
-static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
+static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
{
+ struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
+ struct tcp_bind_hashbucket *bhead;
struct sock **head, *sktw;
- /* Step 1: Remove SK from established hash. */
- if(sk->next)
- sk->next->pprev = sk->pprev;
- *sk->pprev = sk->next;
- sk->pprev = NULL;
- tcp_reg_zap(sk);
-
- /* Step 2: Put TW into bind hash where SK was. */
- tw->tb = (struct tcp_bind_bucket *)sk->prev;
- if((tw->bind_next = sk->bind_next) != NULL)
- sk->bind_next->bind_pprev = &tw->bind_next;
- tw->bind_pprev = sk->bind_pprev;
- *sk->bind_pprev = (struct sock *)tw;
- sk->prev = NULL;
+ write_lock(&ehead->lock);
- /* Step 3: Un-charge protocol socket in-use count. */
- sk->prot->inuse--;
+ /* Step 1: Remove SK from established hash. */
+ if (sk->pprev) {
+ if(sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ }
- /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
- head = &tcp_ehash[sk->hashent + (tcp_ehash_size >> 1)];
+ /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
+ head = &(ehead + tcp_ehash_size)->chain;
sktw = (struct sock *)tw;
if((sktw->next = *head) != NULL)
(*head)->pprev = &sktw->next;
*head = sktw;
sktw->pprev = head;
+ atomic_inc(&tw->refcnt);
+
+ write_unlock(&ehead->lock);
+
+	/* Step 3: Put TW into the bind hash. The original socket stays there too.
+	   Note that any socket with sk->num!=0 MUST be bound in the binding
+ cache, even if it is closed.
+ */
+ bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
+ spin_lock(&bhead->lock);
+ tw->tb = (struct tcp_bind_bucket *)sk->prev;
+ BUG_TRAP(sk->prev!=NULL);
+ if ((tw->bind_next = tw->tb->owners) != NULL)
+ tw->tb->owners->bind_pprev = &tw->bind_next;
+ tw->tb->owners = (struct sock*)tw;
+ tw->bind_pprev = &tw->tb->owners;
+ spin_unlock(&bhead->lock);
+
+ /* Step 4: Un-charge protocol socket in-use count. */
+ sk->prot->inuse--;
}
+/*
+ * Move a socket to time-wait.
+ */
void tcp_time_wait(struct sock *sk)
{
struct tcp_tw_bucket *tw;
@@ -1071,8 +1336,16 @@ void tcp_time_wait(struct sock *sk)
tw->dport = sk->dport;
tw->family = sk->family;
tw->reuse = sk->reuse;
+ tw->hashent = sk->hashent;
tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
- tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
+ tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt;
+ tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent;
+ tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp;
+#ifdef CONFIG_TCP_TW_RECYCLE
+ tw->rto = sk->tp_pinfo.af_tcp.rto;
+ tw->ttd = jiffies + 2*tw->rto;
+#endif
+ atomic_set(&tw->refcnt, 0);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if(tw->family == PF_INET6) {
@@ -1085,9 +1358,7 @@ void tcp_time_wait(struct sock *sk)
}
#endif
/* Linkage updates. */
- SOCKHASH_LOCK_WRITE();
- tcp_tw_hashdance(sk, tw);
- SOCKHASH_UNLOCK_WRITE();
+ __tcp_tw_hashdance(sk, tw);
/* Get the TIME_WAIT timeout firing. */
tcp_tw_schedule(tw);
@@ -1096,8 +1367,6 @@ void tcp_time_wait(struct sock *sk)
if(sk->state == TCP_ESTABLISHED)
tcp_statistics.TcpCurrEstab--;
sk->state = TCP_CLOSE;
- net_reset_timer(sk, TIME_DONE,
- min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
} else {
/* Sorry, we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
@@ -1106,10 +1375,9 @@ void tcp_time_wait(struct sock *sk)
tcp_set_state(sk, TCP_CLOSE);
}
- /* Prevent rcvmsg/sndmsg calls, and wake people up. */
- sk->shutdown = SHUTDOWN_MASK;
- if(!sk->dead)
- sk->state_change(sk);
+ tcp_update_metrics(sk);
+ tcp_clear_xmit_timers(sk);
+ tcp_done(sk);
}
/*
@@ -1134,7 +1402,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
tcp_send_ack(sk);
if (!sk->dead) {
- sk->state_change(sk);
+ wake_up_interruptible(sk->sleep);
sock_wake_async(sk->socket, 1);
}
@@ -1143,8 +1411,6 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- if (th->rst)
- sk->shutdown = SHUTDOWN_MASK;
break;
case TCP_CLOSE_WAIT:
@@ -1161,12 +1427,6 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
/* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
- *
- * This causes a WRITE timeout, which will either
- * move on to TIME_WAIT when we timeout, or resend
- * the FIN properly (maybe we get rid of that annoying
- * FIN lost hang). The TIME_WRITE code is already
- * correct for handling this timeout.
*/
tcp_set_state(sk, TCP_CLOSING);
break;
@@ -1423,7 +1683,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
/* Turn on fast path. */
if (skb_queue_len(&tp->out_of_order_queue) == 0)
tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
- (0x10 << 16) |
+ ntohl(TCP_FLAG_ACK) |
tp->snd_wnd);
return;
}
@@ -1545,8 +1805,8 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
* Now tell the user we may have some data.
*/
if (!sk->dead) {
- SOCK_DEBUG(sk, "Data wakeup.\n");
- sk->data_ready(sk,0);
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket,1);
}
return(1);
}
@@ -1575,28 +1835,59 @@ static __inline__ void tcp_data_snd_check(struct sock *sk)
/*
* Adapt the MSS value used to make delayed ack decision to the
- * real world.
+ * real world.
+ *
+ * The constant 536 has no particularly good meaning. In the IPv4 world
+ * the MTU may be smaller, though that contradicts RFC1122, which
+ * states that the MSS must be at least 536.
+ * We use the constant so as not to ACK every second
+ * packet in a stream of tiny packets.
+ * It means that super-low mtu links will be aggressively delacked.
+ * Seems that is even good. If they have such a low mtu, they are weirdly
+ * slow.
+ *
+ * AK: BTW it may be useful to add an option to lock the rcv_mss.
+ * this way the beowulf people wouldn't need ugly patches to get the
+ * ack frequencies they want and it would be an elegant way to tune delack.
*/
static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- unsigned int len = skb->len, lss;
+ unsigned int len, lss;
- if (len > tp->rcv_mss)
- tp->rcv_mss = len;
lss = tp->last_seg_size;
tp->last_seg_size = 0;
- if (len >= 536) {
- if (len == lss)
- tp->rcv_mss = len;
- tp->last_seg_size = len;
+
+	/* skb->len may jitter because of SACKs, even if the peer
+ * sends good full-sized frames.
+ */
+ len = skb->len;
+ if (len >= tp->rcv_mss) {
+ tp->rcv_mss = len;
+ } else {
+		/* Otherwise, we make a more careful check, taking into
+		 * account that the SACK block is variable.
+		 *
+		 * "len" is the invariant segment length, including the TCP header.
+ */
+ len = skb->tail - skb->h.raw;
+ if (len >= 536 + sizeof(struct tcphdr)) {
+			/* Also subtract the invariant part (if the peer is RFC
+			 * compliant): tcp header plus fixed timestamp option length.
+			 * The resulting "len" is an MSS free of SACK jitter.
+ */
+ len -= tp->tcp_header_len;
+ if (len == lss)
+ tp->rcv_mss = len;
+ tp->last_seg_size = len;
+ }
}
}
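
Because SACK blocks make skb->len jitter, the fallback path measures the invariant header-to-tail length, subtracts the fixed header (tcp header plus timestamps), and only trusts the result once two consecutive segments agree. The same rule in isolation (hypothetical names; 20 stands in for sizeof(struct tcphdr)):

    /* Model of the rcv_mss measurement: big payloads raise the estimate
     * directly; smaller ones must repeat an invariant length twice.
     */
    struct rcv_mss_est { unsigned int rcv_mss, last_seg_size; };

    static void measure_rcv_mss(struct rcv_mss_est *e,
                                unsigned int payload_len, /* skb->len        */
                                unsigned int raw_len,     /* tail - raw hdr  */
                                unsigned int fixed_hdr)   /* th + timestamps */
    {
        unsigned int lss = e->last_seg_size;
        e->last_seg_size = 0;

        if (payload_len >= e->rcv_mss) {
            e->rcv_mss = payload_len;
        } else if (raw_len >= 536 + 20) {
            unsigned int len = raw_len - fixed_hdr;   /* SACK-jitter free */
            if (len == lss)
                e->rcv_mss = len;
            e->last_seg_size = len;
        }
    }
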
/*
* Check if sending an ack is needed.
*/
-static __inline__ void __tcp_ack_snd_check(struct sock *sk)
+static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -1621,12 +1912,12 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk)
/* We entered "quick ACK" mode or... */
tcp_in_quickack_mode(tp) ||
/* We have out of order data */
- (skb_peek(&tp->out_of_order_queue) != NULL)) {
+ (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
/* Else, send delayed ack. */
- tcp_send_delayed_ack(tp, HZ/2);
+ tcp_send_delayed_ack(sk, HZ/2);
}
}
@@ -1637,7 +1928,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
/* We sent a data segment already. */
return;
}
- __tcp_ack_snd_check(sk);
+ __tcp_ack_snd_check(sk, 1);
}
@@ -1767,6 +2058,13 @@ static int prune_queue(struct sock *sk)
 * complex for anyone's sanity. So we don't do it anymore. But
* if we are really having our buffer space abused we stop accepting
* new receive data.
+ *
+ * FIXME: it should recompute SACK state and only remove enough
+ * buffers to get into bounds again. The current scheme loses
+ * badly sometimes on links with large RTT, especially when
+ * the driver has high overhead per skb.
+ * (increasing the rcvbuf is not enough because it inflates
+ * the window too, effectively disabling flow control) -AK
*/
if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
return 0;
@@ -1782,7 +2080,7 @@ static int prune_queue(struct sock *sk)
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
- * - Out of order segments arrived.
+ * - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
@@ -1790,6 +2088,7 @@ static int prune_queue(struct sock *sk)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
+ * - Unexpected TCP option.
*
* When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
@@ -1801,12 +2100,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int queued;
- u32 flg;
/*
* Header prediction.
- * The code follows the one in the famous
+	 * The code loosely follows the one in the famous
* "30 instruction TCP receive" Van Jacobson mail.
*
* Van's trick is to deposit buffers into socket queue
@@ -1819,39 +2116,63 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
* We do checksum and copy also but from device to kernel.
*/
- /*
- * RFC1323: H1. Apply PAWS check first.
- */
- if (tcp_fast_parse_options(sk, th, tp)) {
- if (tp->saw_tstamp) {
- if (tcp_paws_discard(tp, th, len)) {
- tcp_statistics.TcpInErrs++;
- if (!th->rst) {
- tcp_send_ack(sk);
- goto discard;
- }
- }
- tcp_replace_ts_recent(sk, tp,
- TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
- }
- }
- flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16);
+ /* RED-PEN. Using static variables to pass function arguments
+	 * cannot be a good idea...
+ */
+ tp->saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_predition is to be made
* 'S' will always be tp->tcp_header_len >> 2
- * '?' will be 0 else it will be !0
- * (when there are holes in the receive
+ * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
+ * turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
- */
+ */
+
+ if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags &&
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ int tcp_header_len = th->doff*4;
+
+ /* Timestamp header prediction */
+
+ /* Non-standard header f.e. SACKs -> slow path */
+ if (tcp_header_len != tp->tcp_header_len)
+ goto slow_path;
+
+ /* Check timestamp */
+ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+ __u32 *ptr = (__u32 *)(th + 1);
- if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
- if (len <= th->doff*4) {
+ /* No? Slow path! */
+ if (*ptr != __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+ goto slow_path;
+
+ tp->saw_tstamp = 1;
+ ++ptr;
+ tp->rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ tp->rcv_tsecr = ntohl(*ptr);
+
+ /* If PAWS failed, check it more carefully in slow path */
+ if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0)
+ goto slow_path;
+
+ /* Predicted packet is in window by definition.
+ seq == rcv_nxt and last_ack_sent <= rcv_nxt.
+ Hence, check seq<=last_ack_sent reduces to:
+ */
+ if (tp->rcv_nxt == tp->last_ack_sent) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = xtime.tv_sec;
+ }
+ }
+
+ if (len <= tcp_header_len) {
/* Bulk data transfer: sender */
- if (len == th->doff*4) {
+ if (len == tcp_header_len) {
tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->ack_seq, len);
kfree_skb(skb);
@@ -1864,12 +2185,14 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
} else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
/* Bulk data transfer: receiver */
- __skb_pull(skb,th->doff*4);
+ __skb_pull(skb,tcp_header_len);
+ /* Is it possible to simplify this? */
tcp_measure_rcv_mss(sk, skb);
/* DO NOT notify forward progress here.
* It saves dozen of CPU instructions in fast path. --ANK
+	 * And where is it signaled then? -AK
*/
__skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -1877,14 +2200,37 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* FIN bit check is not done since if FIN is set in
* this frame, the pred_flags won't match up. -DaveM
*/
- sk->data_ready(sk, 0);
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket,1);
tcp_delack_estimator(tp);
tcp_remember_ack(tp, th, skb);
- __tcp_ack_snd_check(sk);
+ __tcp_ack_snd_check(sk, 0);
return 0;
}
+ /* Packet is in sequence, flags are trivial;
+ * only ACK is strange or we are tough on memory.
+ * Jump to step 5.
+ */
+ goto step5;
+ }
+
+slow_path:
+ /*
+ * RFC1323: H1. Apply PAWS check first.
+ */
+ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
+ tcp_paws_discard(tp, skb)) {
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ goto discard;
+ }
+ /* Resets are accepted even if PAWS failed.
+
+ ts_recent update must be made after we are sure
+ that the packet is in window.
+ */
}
/*
@@ -1909,44 +2255,34 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
goto discard;
}
+ if(th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ if (tp->saw_tstamp) {
+ tcp_replace_ts_recent(sk, tp,
+ TCP_SKB_CB(skb)->seq);
+ }
+
if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
SOCK_DEBUG(sk, "syn in established state\n");
tcp_statistics.TcpInErrs++;
tcp_reset(sk);
return 1;
}
-
- if(th->rst) {
- tcp_reset(sk);
- goto discard;
- }
+step5:
if(th->ack)
tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
/* Process urgent data. */
tcp_urg(sk, th, len);
+ {
/* step 7: process the segment text */
- queued = tcp_data(skb, sk, len);
+ int queued = tcp_data(skb, sk, len);
- /* This must be after tcp_data() does the skb_pull() to
- * remove the header size from skb->len.
- *
- * Dave!!! Phrase above (and all about rcv_mss) has
- * nothing to do with reality. rcv_mss must measure TOTAL
- * size, including sacks, IP options etc. Hence, measure_rcv_mss
- * must occure before pulling etc, otherwise it will flap
- * like hell. Even putting it before tcp_data is wrong,
- * it should use skb->tail - skb->nh.raw instead.
- * --ANK (980805)
- *
- * BTW I broke it. Now all TCP options are handled equally
- * in mss_clamp calculations (i.e. ignored, rfc1122),
- * and mss_cache does include all of them (i.e. tstamps)
- * except for sacks, to calulate effective mss faster.
- * --ANK (980805)
- */
tcp_measure_rcv_mss(sk, skb);
/* Be careful, tcp_data() may have put this into TIME_WAIT. */
@@ -1959,76 +2295,541 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
discard:
kfree_skb(skb);
}
+ }
return 0;
}
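
Header prediction hinges on pred_flags: the data-offset nibble, the ACK bit and the unscaled send window packed to mirror the fourth 32-bit word of the TCP header, so one masked compare validates header length, flags and window at once. A host-order model of the packing (hypothetical names):

    #include <stdint.h>

    /* Model of pred_flags: the TCP header's fourth 32-bit word
     * (doff / flags / window) with reserved bits and PSH masked off.
     * Only ACK may be set on the fast path.
     */
    static uint32_t make_pred_flags(unsigned int doff_words, uint32_t snd_wnd)
    {
        return ((uint32_t)doff_words << 28)   /* data offset nibble     */
             | (0x10u << 16)                  /* ACK bit in flags byte  */
             | (snd_wnd & 0xffffu);           /* expected raw window    */
    }

An incoming header then matches when its fourth word, with the reserved and PSH bits masked off, equals this value.
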
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * Actually, we could avoid lots of memory writes here. The tp of the
+ * listening socket contains all the necessary default parameters.
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+{
+ struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
+
+ if(newsk != NULL) {
+ struct tcp_opt *newtp;
+#ifdef CONFIG_FILTER
+ struct sk_filter *filter;
+#endif
+
+ memcpy(newsk, sk, sizeof(*newsk));
+ newsk->state = TCP_SYN_RECV;
+
+ /* SANITY */
+ newsk->pprev = NULL;
+ newsk->prev = NULL;
+
+ /* Clone the TCP header template */
+ newsk->dport = req->rmt_port;
+
+ sock_lock_init(newsk);
+
+ atomic_set(&newsk->rmem_alloc, 0);
+ skb_queue_head_init(&newsk->receive_queue);
+ atomic_set(&newsk->wmem_alloc, 0);
+ skb_queue_head_init(&newsk->write_queue);
+ atomic_set(&newsk->omem_alloc, 0);
+
+ newsk->done = 0;
+ newsk->proc = 0;
+ newsk->backlog.head = newsk->backlog.tail = NULL;
+ skb_queue_head_init(&newsk->error_queue);
+ newsk->write_space = tcp_write_space;
+#ifdef CONFIG_FILTER
+ if ((filter = newsk->filter) != NULL)
+ sk_filter_charge(newsk, filter);
+#endif
+
+ /* Now setup tcp_opt */
+ newtp = &(newsk->tp_pinfo.af_tcp);
+ newtp->pred_flags = 0;
+ newtp->rcv_nxt = req->rcv_isn + 1;
+ newtp->snd_nxt = req->snt_isn + 1;
+ newtp->snd_una = req->snt_isn + 1;
+ newtp->srtt = 0;
+ newtp->ato = 0;
+ newtp->snd_wl1 = req->rcv_isn;
+ newtp->snd_wl2 = req->snt_isn;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ newtp->snd_wnd = ntohs(skb->h.th->window);
+
+ newtp->max_window = newtp->snd_wnd;
+ newtp->pending = 0;
+ newtp->retransmits = 0;
+ newtp->last_ack_sent = req->rcv_isn + 1;
+ newtp->backoff = 0;
+ newtp->mdev = TCP_TIMEOUT_INIT;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ newtp->snd_cwnd = 2;
+
+ newtp->rto = TCP_TIMEOUT_INIT;
+ newtp->packets_out = 0;
+ newtp->fackets_out = 0;
+ newtp->retrans_out = 0;
+ newtp->high_seq = 0;
+ newtp->snd_ssthresh = 0x7fffffff;
+ newtp->snd_cwnd_cnt = 0;
+ newtp->dup_acks = 0;
+ newtp->delayed_acks = 0;
+ init_timer(&newtp->retransmit_timer);
+ newtp->retransmit_timer.function = &tcp_retransmit_timer;
+ newtp->retransmit_timer.data = (unsigned long) newsk;
+ init_timer(&newtp->delack_timer);
+ newtp->delack_timer.function = &tcp_delack_timer;
+ newtp->delack_timer.data = (unsigned long) newsk;
+ skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->send_head = newtp->retrans_head = NULL;
+ newtp->rcv_wup = req->rcv_isn + 1;
+ newtp->write_seq = req->snt_isn + 1;
+ newtp->copied_seq = req->rcv_isn + 1;
+
+ newtp->saw_tstamp = 0;
+
+ init_timer(&newtp->probe_timer);
+ newtp->probe_timer.function = &tcp_probe_timer;
+ newtp->probe_timer.data = (unsigned long) newsk;
+ newtp->probes_out = 0;
+ newtp->syn_seq = req->rcv_isn;
+ newtp->fin_seq = req->rcv_isn;
+ newtp->urg_data = 0;
+ tcp_synq_init(newtp);
+ newtp->syn_backlog = 0;
+ if (skb->len >= 536)
+ newtp->last_seg_size = skb->len;
+
+ /* Back to base struct sock members. */
+ newsk->err = 0;
+ newsk->ack_backlog = 0;
+ newsk->max_ack_backlog = SOMAXCONN;
+ newsk->priority = 0;
+ atomic_set(&newsk->refcnt, 1);
+ atomic_inc(&inet_sock_nr);
+
+ spin_lock_init(&sk->timer_lock);
+ init_timer(&newsk->timer);
+ newsk->timer.function = &tcp_keepalive_timer;
+ newsk->timer.data = (unsigned long) newsk;
+ if (newsk->keepopen)
+ tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
+ newsk->socket = NULL;
+ newsk->sleep = NULL;
+
+ newtp->tstamp_ok = req->tstamp_ok;
+ if((newtp->sack_ok = req->sack_ok) != 0)
+ newtp->num_sacks = 0;
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_wnd = req->rcv_wnd;
+ newtp->wscale_ok = req->wscale_ok;
+ if (newtp->wscale_ok) {
+ newtp->snd_wscale = req->snd_wscale;
+ newtp->rcv_wscale = req->rcv_wscale;
+ } else {
+ newtp->snd_wscale = newtp->rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp,65535);
+ }
+ if (newtp->tstamp_ok) {
+ newtp->ts_recent = req->ts_recent;
+ newtp->ts_recent_stamp = xtime.tv_sec;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ newtp->mss_clamp = req->mss;
+ }
+ return newsk;
+}
+
+static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+ if (seq == s_win)
+ return 1;
+ if (after(end_seq, s_win) && before(seq, e_win))
+ return 1;
+ return (seq == e_win && seq == end_seq);
+}
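
tcp_in_window is the RFC793 acceptance test: a zero-length segment at the left edge is acceptable, as is anything overlapping (s_win, e_win). A quick standalone check (before/after modeled on the kernel's sequence-space macros):

    #include <assert.h>

    /* 32-bit wraparound-safe sequence comparison, as in the kernel. */
    static int before(unsigned int s1, unsigned int s2) { return (int)(s1 - s2) < 0; }
    static int after(unsigned int s1, unsigned int s2)  { return (int)(s2 - s1) < 0; }

    static int tcp_in_window(unsigned int seq, unsigned int end_seq,
                             unsigned int s_win, unsigned int e_win)
    {
        if (seq == s_win)
            return 1;
        if (after(end_seq, s_win) && before(seq, e_win))
            return 1;
        return seq == e_win && seq == end_seq;
    }

    int main(void)
    {
        assert(tcp_in_window(101, 101, 101, 201));  /* bare ACK at left edge */
        assert(tcp_in_window(150, 180, 101, 201));  /* data inside window    */
        assert(!tcp_in_window(250, 260, 101, 201)); /* beyond right edge     */
        return 0;
    }

In the SYN-RECV path below it is called with the window [rcv_isn+1, rcv_isn+1+rcv_wnd).
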
+
+
/*
- * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
- * as an open_request.
+ * Process an incoming packet for SYN_RECV sockets represented
+ * as an open_request.
*/
-struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
- struct open_request *req)
+struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
+ struct open_request *req,
+ struct open_request *prev)
{
+ struct tcphdr *th = skb->h.th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- u32 flg;
+ u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ int paws_reject = 0;
+ struct tcp_opt ttp;
- /* assumption: the socket is not in use.
- * as we checked the user count on tcp_rcv and we're
- * running from a soft interrupt.
+	/* If the socket has already been created, process the
+	   packet in its context.
+
+	   We get here only due to a race, when packets were enqueued
+	   to the backlog of the listening socket.
*/
+ if (req->sk)
+ return req->sk;
- /* Check for syn retransmission */
- flg = *(((u32 *)skb->h.th) + 3);
-
- flg &= __constant_htonl(0x00170000);
- /* Only SYN set? */
- if (flg == __constant_htonl(0x00020000)) {
- if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
- /* retransmited syn.
+ ttp.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr)>>2)) {
+
+ tcp_parse_options(NULL, th, &ttp, 0);
+
+ paws_reject = ttp.saw_tstamp &&
+ (s32)(ttp.rcv_tsval - req->ts_recent) < 0;
+ }
+
+	/* Check for a pure retransmitted SYN. */
+ if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
+ flg == TCP_FLAG_SYN &&
+ !paws_reject) {
+ /*
+ * RFC793 draws (Incorrectly! It was fixed in RFC1122)
+ * this case on figure 6 and figure 8, but formal
+ * protocol description says NOTHING.
+ * To be more exact, it says that we should send ACK,
+ * because this segment (at least, if it has no data)
+ * is out of window.
+ *
+ * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
+	 * describe the SYN-RECV state. All the description
+	 * is wrong, we cannot trust it and should
+ * rely only on common sense and implementation
+ * experience.
+ *
+ * Enforce "SYN-ACK" according to figure 8, figure 6
+ * of RFC793, fixed by RFC1122.
+ */
+ req->class->rtx_syn_ack(sk, req);
+ return NULL;
+ }
+
+	/* The following reproduces the section "SEGMENT ARRIVES"
+	   for the SYN-RECEIVED state of RFC793.
+	   It is broken; however, it fails only
+	   when SYNs are crossed, which is impossible in our
+	   case.
+
+	   But generally, we should (the RFC lies!) accept an ACK
+	   of our SYNACK both here and in tcp_rcv_state_process().
+	   tcp_rcv_state_process() does not; hence, neither do we.
+
+ Note that the case is absolutely generic:
+ we cannot optimize anything here without
+ violating protocol. All the checks must be made
+ before attempt to create socket.
+ */
+
+ /* RFC793: "first check sequence number". */
+
+ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
+ /* Out of window: send ACK and drop. */
+ if (!(flg & TCP_FLAG_RST))
+ req->class->send_ack(skb, req);
+ return NULL;
+ }
+
+ /* In sequence, PAWS is OK. */
+
+ if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
+ req->ts_recent = ttp.rcv_tsval;
+
+ if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
+ /* Truncate SYN, it is out of window starting
+ at req->rcv_isn+1. */
+ flg &= ~TCP_FLAG_SYN;
+ }
+
+ /* RFC793: "second check the RST bit" and
+ * "fourth, check the SYN bit"
+ */
+ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
+ goto embryonic_reset;
+
+ /* RFC793: "fifth check the ACK field" */
+
+ if (!(flg & TCP_FLAG_ACK))
+ return NULL;
+
+ /* Invalid ACK: reset will be sent by listening socket */
+ if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
+ return sk;
+
+	/* OK, the ACK is valid; create the big socket and
+	   feed this segment to it. It will repeat all
+	   the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
+	   ESTABLISHED STATE. If it gets dropped after the
+	   socket is created, expect trouble.
+ */
+ sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+ if (sk == NULL)
+ return NULL;
+
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ req->sk = sk;
+ return sk;
+
+embryonic_reset:
+ tcp_synq_unlink(tp, req, prev);
+ tp->syn_backlog--;
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+
+ net_statistics.EmbryonicRsts++;
+ if (!(flg & TCP_FLAG_RST))
+ req->class->send_reset(skb);
+
+ req->class->destructor(req);
+ tcp_openreq_free(req);
+ return NULL;
+}
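
The checks above deliberately follow RFC793's "SEGMENT ARRIVES" order: sequence first, then RST and SYN, then ACK validity; everything is decided before any socket is created. A condensed decision ladder, simplified (it omits the retransmitted-SYN resend and the SYN truncation at rcv_isn; names hypothetical):

    enum req_verdict { REQ_DROP, REQ_SEND_ACK, REQ_RESET, REQ_CREATE_SOCK };

    /* Decision ladder for a SYN-RECV open_request, mirroring the
     * order above: window first, then RST/SYN, then ACK validity.
     */
    static enum req_verdict check_req(int in_window, int paws_reject,
                                      int rst, int syn, int ack,
                                      int ack_matches_isn)
    {
        if (paws_reject || !in_window)
            return rst ? REQ_DROP : REQ_SEND_ACK;  /* out of window     */
        if (rst || syn)
            return REQ_RESET;                      /* embryonic reset   */
        if (!ack)
            return REQ_DROP;                       /* RFC793: drop      */
        if (!ack_matches_isn)
            return REQ_RESET;                      /* invalid ACK       */
        return REQ_CREATE_SOCK;                    /* build full socket */
    }
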
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ tcp_parse_options(sk, th, tp, 0);
+
+#ifdef CONFIG_TCP_TW_RECYCLE
+ if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst &&
+ (s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
+ xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) {
+		/* Old duplicate segment. We remember the last
+		   ts_recent from this host in the timewait bucket.
+
+		   Actually, we could implement a per-host cache
+		   to truncate timewait state after the RTO. The paranoid
+		   arguments of rfc1337 are not enough to close off this nice possibility.
+ */
+ if (net_ratelimit())
+ printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n");
+ if (th->ack)
+ return 1;
+ goto discard;
+ }
+#endif
+
+ if (th->ack) {
+ /* rfc793:
+ * "If the state is SYN-SENT then
+ * first check the ACK bit
+ * If the ACK bit is set
+ * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+ * a reset (unless the RST bit is set, if so drop
+ * the segment and return)"
+ *
+ * I cite this place to emphasize one essential
+		 * detail: this check is different from the one
+		 * in the established state: SND.UNA <= SEG.ACK <= SND.NXT.
+ * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
+ * because we have no previous data sent before SYN.
+ * --ANK(990513)
+ *
+ * We do not send data with SYN, so that RFC-correct
+ * test reduces to:
+ */
+ if (sk->zapped ||
+ TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ return 1;
+
+ /* Now ACK is acceptable.
+ *
+ * "If the RST bit is set
+ * If the ACK was acceptable then signal the user "error:
+ * connection reset", drop the segment, enter CLOSED state,
+ * delete TCB, and return."
+ */
+
+ if (th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ /* rfc793:
+ * "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ *
+ * See note below!
+ * --ANK(990513)
+ */
+ if (!th->syn)
+ goto discard;
+
+ /* rfc793:
+ * "If the SYN bit is on ...
+ * are acceptable then ...
+ * (our SYN has been ACKed), change the connection
+ * state to ESTABLISHED..."
+ *
+ * Do you see? SYN-less ACKs in SYN-SENT state are
+ * completely ignored.
+ *
+ * The bug causing stalled SYN-SENT sockets
+		 * was here: tcp_ack advanced snd_una and cancelled the
+		 * retransmit timer, so that a bare ACK received
+		 * in SYN-SENT state (even with an invalid ack==ISS,
+		 * because the tcp_ack check is too weak for SYN-SENT)
+		 * moved the socket to an invalid semi-SYN-SENT,
+		 * semi-ESTABLISHED state and the connection hung.
+ *
+ * There exist buggy stacks, which really send
+ * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
+ * Actually, if this host did not try to get something
+ * from ftp.inr.ac.ru I'd never find this bug 8)
+ *
+ * --ANK (990514)
+ *
+ * I was wrong, I apologize. Bare ACK is valid.
+		 * Actually, RFC793 requires sending such an ACK
+ * in reply to any out of window packet.
+ * It is wrong, but Linux also does it sometimes.
+ * --ANK (990724)
+ */
+
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len);
+
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
+ */
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
+ tp->fin_seq = TCP_SKB_CB(skb)->seq;
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (tp->wscale_ok == 0) {
+ tp->snd_wscale = tp->rcv_wscale = 0;
+ tp->window_clamp = min(tp->window_clamp,65535);
+ }
+
+ if (tp->tstamp_ok) {
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ if (tp->saw_tstamp) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = xtime.tv_sec;
+ }
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+ tcp_init_metrics(sk);
+
+ if (tp->write_pending) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+		 * How do we do this correctly?
*/
- req->class->rtx_syn_ack(sk, req);
- return NULL;
+ tp->delayed_acks++;
+ if (tp->ato == 0)
+ tp->ato = tp->rto;
+ tcp_send_delayed_ack(sk, tp->rto);
} else {
- return sk; /* Pass new SYN to the listen socket. */
+ tcp_send_ack(sk);
}
+
+ tp->copied_seq = tp->rcv_nxt;
+
+ if(!sk->dead) {
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket, 0);
+ }
+ return -1;
}
- /* We know it's an ACK here */
- if (req->sk) {
- /* socket already created but not
- * yet accepted()...
+ /* No ACK in the segment */
+
+ if (th->rst) {
+ /* rfc793:
+ * "If the RST bit is set
+ *
+ * Otherwise (no ACK) drop the segment and return."
*/
- sk = req->sk;
- } else {
- /* In theory the packet could be for a cookie, but
- * TIME_WAIT should guard us against this.
- * XXX: Nevertheless check for cookies?
- * This sequence number check is done again later,
- * but we do it here to prevent syn flood attackers
- * from creating big SYN_RECV sockets.
- */
- if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
- !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
- req->rcv_isn+1+req->rcv_wnd)) {
- req->class->send_reset(skb);
- return NULL;
+
+ goto discard;
+ }
+
+ if (th->syn) {
+		/* We see a SYN without an ACK. It is an attempt at
+		 * simultaneous connect with crossed SYNs.
+ *
+ * The previous version of the code
+ * checked for "connecting to self"
+ * here. that check is done now in
+ * tcp_connect.
+ *
+ * RED-PEN: BTW, it does not. 8)
+ */
+ tcp_set_state(sk, TCP_SYN_RECV);
+ if (tp->saw_tstamp) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = xtime.tv_sec;
}
-
- sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- if (sk == NULL)
- return NULL;
-
- req->expires = 0UL;
- req->sk = sk;
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+
+ tcp_send_synack(sk);
+#if 0
+ /* Note, we could accept data and URG from this segment.
+	 * There are no obstacles to doing so.
+	 *
+	 * However, since we ignore data in ACKless segments sometimes,
+	 * we have no reason to accept it here either.
+	 * Also, it seems the code doing it in step6 of tcp_rcv_state_process
+	 * is not flawless. So, discard the packet for sanity.
+ * Uncomment this return to process the data.
+ */
+ return -1;
+#endif
}
- skb_orphan(skb);
- skb_set_owner_r(skb, sk);
- return sk;
+ /* "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ */
+
+discard:
+ kfree_skb(skb);
+ return 0;
}
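
Since nothing is sent before our SYN, the RFC793 range test SND.UNA < SEG.ACK <= SND.NXT collapses in SYN-SENT to a single equality, which is exactly the check near the top of this function. As a one-line model:

    /* SYN-SENT ACK acceptability: only an ack of exactly snd_nxt
     * (our ISS+1) is valid; anything else earns a reset.
     */
    static int synsent_bad_ack(unsigned int ack_seq, unsigned int snd_nxt)
    {
        return ack_seq != snd_nxt;
    }
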
+
/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
@@ -2042,6 +2843,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int queued = 0;
+ tp->saw_tstamp = 0;
+
switch (sk->state) {
case TCP_CLOSE:
/* When state == CLOSED, hash lookup always fails.
@@ -2061,35 +2864,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* a TCP_CLOSE socket does not exist. Drop the frame
* and send a RST back to the other end.
*/
- return 1;
- case TCP_LISTEN:
- /* These use the socket TOS..
- * might want to be the received TOS
+	/* 1. The socket may be moved to TIME-WAIT state.
+	   2. While this socket was locked, another socket
+	      with the same identity could have been created.
+	   3. To continue?
+
+	   CONCLUSION: discard and only discard!
+
+	   The alternative would be to relookup and recurse into tcp_v?_rcv
+	   (not *_do_rcv) to work with the timewait and listen states
+	   correctly.
*/
- if(th->ack) {
- struct sock *realsk;
- int ret;
+ goto discard;
- realsk = tp->af_specific->get_sock(skb, th);
- if(realsk == sk)
- return 1;
+ case TCP_LISTEN:
+ if(th->ack)
+ return 1;
- bh_lock_sock(realsk);
- ret = 0;
- if(realsk->lock.users != 0) {
- skb_orphan(skb);
- sk_add_backlog(realsk, skb);
- } else {
- ret = tcp_rcv_state_process(realsk, skb,
- skb->h.th, skb->len);
- }
- bh_unlock_sock(realsk);
- return ret;
- }
-
if(th->syn) {
- if(tp->af_specific->conn_request(sk, skb, 0) < 0)
+ if(tp->af_specific->conn_request(sk, skb) < 0)
return 1;
/* Now we have several options: In theory there is
@@ -2110,172 +2904,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
goto discard;
}
-
goto discard;
- break;
case TCP_SYN_SENT:
- /* SYN sent means we have to look for a suitable ack and
- * either reset for bad matches or go to connected.
- * The SYN_SENT case is unusual and should
- * not be in line code. [AC]
- */
- if(th->ack) {
- /* rfc793:
- * "If the state is SYN-SENT then
- * first check the ACK bit
- * If the ACK bit is set
- * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
- * a reset (unless the RST bit is set, if so drop
- * the segment and return)"
- *
- * I cite this place to emphasize one essential
- * detail, this check is different of one
- * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
- * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
- * because we have no previous data sent before SYN.
- * --ANK(990513)
- *
- * We do not send data with SYN, so that RFC-correct
- * test reduces to:
- */
- if (sk->zapped ||
- TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
- return 1;
-
- /* Now ACK is acceptable.
- *
- * "If the RST bit is set
- * If the ACK was acceptable then signal the user "error:
- * connection reset", drop the segment, enter CLOSED state,
- * delete TCB, and return."
- */
-
- if (th->rst) {
- tcp_reset(sk);
- goto discard;
- }
-
- /* rfc793:
- * "fifth, if neither of the SYN or RST bits is set then
- * drop the segment and return."
- *
- * See note below!
- * --ANK(990513)
- */
-
- if (!th->syn)
- goto discard;
-
- /* rfc793:
- * "If the SYN bit is on ...
- * are acceptable then ...
- * (our SYN has been ACKed), change the connection
- * state to ESTABLISHED..."
- *
- * Do you see? SYN-less ACKs in SYN-SENT state are
- * completely ignored.
- *
- * The bug causing stalled SYN-SENT sockets
- * was here: tcp_ack advanced snd_una and canceled
- * retransmit timer, so that bare ACK received
- * in SYN-SENT state (even with invalid ack==ISS,
- * because tcp_ack check is too weak for SYN-SENT)
- * causes moving socket to invalid semi-SYN-SENT,
- * semi-ESTABLISHED state and connection hangs.
- *
- * There exist buggy stacks, which really send
- * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
- * Actually, if this host did not try to get something
- * from ftp.inr.ac.ru I'd never find this bug 8)
- *
- * --ANK (990514)
- */
-
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
- tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->ack_seq, len);
-
- /* Ok.. it's good. Set up sequence numbers and
- * move to established.
- */
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
- tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
-
- /* RFC1323: The window in SYN & SYN/ACK segments is
- * never scaled.
- */
- tp->snd_wnd = htons(th->window);
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
- tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
- tp->fin_seq = TCP_SKB_CB(skb)->seq;
-
- tcp_set_state(sk, TCP_ESTABLISHED);
- tcp_parse_options(sk, th, tp, 0);
-
- if (tp->wscale_ok == 0) {
- tp->snd_wscale = tp->rcv_wscale = 0;
- tp->window_clamp = min(tp->window_clamp,65535);
- }
-
- if (tp->tstamp_ok) {
- tp->tcp_header_len =
- sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
- } else
- tp->tcp_header_len = sizeof(struct tcphdr);
- if (tp->saw_tstamp) {
- tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = tcp_time_stamp;
- }
-
- /* Can't be earlier, doff would be wrong. */
- tcp_send_ack(sk);
-
- sk->dport = th->source;
- tp->copied_seq = tp->rcv_nxt;
-
- if(!sk->dead) {
- sk->state_change(sk);
- sock_wake_async(sk->socket, 0);
- }
- } else {
- if(th->syn && !th->rst) {
- /* The previous version of the code
- * checked for "connecting to self"
- * here. that check is done now in
- * tcp_connect.
- */
- tcp_set_state(sk, TCP_SYN_RECV);
- tcp_parse_options(sk, th, tp, 0);
- if (tp->saw_tstamp) {
- tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = tcp_time_stamp;
- }
-
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
-
- /* RFC1323: The window in SYN & SYN/ACK segments is
- * never scaled.
- */
- tp->snd_wnd = htons(th->window);
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
-
- tcp_send_synack(sk);
- } else
- break;
- }
-
- /* tp->tcp_header_len and tp->mss_clamp
- probably changed, synchronize mss.
- */
- tcp_sync_mss(sk, tp->pmtu_cookie);
- tp->rcv_mss = tp->mss_cache;
-
- if (sk->state == TCP_SYN_RECV)
- goto discard;
-
- goto step6;
+ queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ if (queued >= 0)
+ return queued;
+ queued = 0;
+ goto step6;
}
/* Parse the tcp_options present on this header.
@@ -2283,23 +2919,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* Note that this really has to be here and not later for PAWS
* (RFC1323) to work.
*/
- if (tcp_fast_parse_options(sk, th, tp)) {
- /* NOTE: assumes saw_tstamp is never set if we didn't
- * negotiate the option. tcp_fast_parse_options() must
- * guarantee this.
- */
- if (tp->saw_tstamp) {
- if (tcp_paws_discard(tp, th, len)) {
- tcp_statistics.TcpInErrs++;
- if (!th->rst) {
- tcp_send_ack(sk);
- goto discard;
- }
- }
- tcp_replace_ts_recent(sk, tp,
- TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
+ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
+ tcp_paws_discard(tp, skb)) {
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ goto discard;
}
+ /* Reset is accepted even if it did not pass PAWS. */
}
/* The silly FIN test here is necessary to see an advancing ACK in
@@ -2313,11 +2939,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* At this point the connection will deadlock with host1 believing
* that his FIN is never ACK'd, and thus it will retransmit it's FIN
* forever. The following fix is from Taral (taral@taral.net).
+ *
+	 * RED-PEN. It seems the above is not true.
+	 * If at least one end is RFC compliant, it will send an ACK to
+	 * an out of window FIN and, hence, move the peer to TIME-WAIT.
+	 * I comment out this line. --ANK
+	 *
+	 * RED-PEN. DANGER! The tcp_sequence check also rejects SYN-ACKs
+	 * received in SYN-RECV. The problem is that the description of
+	 * segment processing in SYN-RECV state in RFC793 is WRONG.
+	 * A correct check would accept the ACK from this SYN-ACK, see
+	 * figures 6 and 8 (fixed by RFC1122). Compare this
+	 * to the problem with FIN; they smell similar. --ANK
*/
/* step 1: check sequence number */
- if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) &&
- !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) {
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)
+#if 0
+ && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
+#endif
+ ) {
if (!th->rst) {
tcp_send_ack(sk);
}
@@ -2330,6 +2971,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
}
+ if (tp->saw_tstamp) {
+ tcp_replace_ts_recent(sk, tp,
+ TCP_SKB_CB(skb)->seq);
+ }
+
/* step 3: check security and precedence [ignored] */
/* step 4:
@@ -2357,22 +3003,36 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (th->ack) {
int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->ack_seq, len);
-
+
switch(sk->state) {
case TCP_SYN_RECV:
if (acceptable) {
tcp_set_state(sk, TCP_ESTABLISHED);
- sk->dport = th->source;
tp->copied_seq = tp->rcv_nxt;
- if(!sk->dead)
- sk->state_change(sk);
+			/* Note that this wakeup is only for the marginal
+			   crossed SYN case. Passively opened sockets
+			   are not woken up, because sk->sleep == NULL
+ and sk->socket == NULL.
+ */
+ if (!sk->dead && sk->sleep) {
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket, 1);
+ }
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = htons(th->window) << tp->snd_wscale;
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
+				/* tcp_ack considers this ACK a duplicate
+				 * and does not calculate the rtt, which is wrong.
+				 * Fix it, at least with timestamps.
+ */
+ if (tp->saw_tstamp && !tp->srtt)
+ tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED);
+
+ tcp_init_metrics(sk);
} else {
SOCK_DEBUG(sk, "bad ack\n");
return 1;
@@ -2386,7 +3046,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (!sk->dead)
sk->state_change(sk);
else
- tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
+ tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
+ dst_confirm(sk->dst_cache);
}
break;
@@ -2399,10 +3060,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
case TCP_LAST_ACK:
if (tp->snd_una == tp->write_seq) {
- sk->shutdown = SHUTDOWN_MASK;
tcp_set_state(sk,TCP_CLOSE);
- if (!sk->dead)
- sk->state_change(sk);
+ tcp_update_metrics(sk);
+ tcp_done(sk);
goto discard;
}
break;
@@ -2444,8 +3104,11 @@ step6:
break;
}
- tcp_data_snd_check(sk);
- tcp_ack_snd_check(sk);
+ /* tcp_data could move socket to TIME-WAIT */
+ if (sk->state != TCP_CLOSE) {
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ }
if (!queued) {
discard: