author | Ralf Baechle <ralf@linux-mips.org> | 1999-10-09 00:00:47 +0000
committer | Ralf Baechle <ralf@linux-mips.org> | 1999-10-09 00:00:47 +0000
commit | d6434e1042f3b0a6dfe1b1f615af369486f9b1fa (patch)
tree | e2be02f33984c48ec019c654051d27964e42c441 /net/ipv4/tcp_input.c
parent | 609d1e803baf519487233b765eb487f9ec227a18 (diff)
Merge with 2.3.19.
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 1617
1 file changed, 1140 insertions(+), 477 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3080bc201..f0711fccc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.170 1999/07/02 11:26:28 davem Exp $ + * Version: $Id: tcp_input.c,v 1.173 1999/09/07 02:31:27 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -61,6 +61,7 @@ #include <linux/mm.h> #include <linux/sysctl.h> #include <net/tcp.h> +#include <net/inet_common.h> #include <linux/ipsec.h> #ifdef CONFIG_SYSCTL @@ -70,6 +71,7 @@ #endif extern int sysctl_tcp_fin_timeout; +extern int sysctl_tcp_keepalive_time; /* These are on by default so the code paths get tested. * For the final 2.2 this may be undone at our discretion. -DaveM @@ -81,6 +83,7 @@ int sysctl_tcp_sack = 1; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; +int sysctl_tcp_tw_recycle; static int prune_queue(struct sock *sk); @@ -133,7 +136,7 @@ static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, /* Tiny-grams with PSH set artifically deflate our * ato measurement, but with a lower bound. */ - if(th->psh && (skb->len < (tp->mss_cache >> 1))) { + if(th->psh && (skb->len < (tp->rcv_mss >> 1))) { /* Preserve the quickack state. */ if((tp->ato & 0x7fffffff) > HZ/50) tp->ato = ((tp->ato & 0x80000000) | @@ -187,6 +190,9 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) static __inline__ void tcp_set_rto(struct tcp_opt *tp) { tp->rto = (tp->srtt >> 3) + tp->mdev; + /* I am not enough educated to understand this magic. + * However, it smells bad. snd_cwnd>31 is common case. + */ tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); } @@ -209,42 +215,196 @@ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) tp->rto = HZ/5; } -/* WARNING: this must not be called if tp->saw_timestamp was false. */ -extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, - __u32 start_seq, __u32 end_seq) +/* Save metrics learned by this TCP session. + This function is called only, when TCP finishes sucessfully + i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. + */ +static void tcp_update_metrics(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst) { + int m; + + if (tp->backoff || !tp->srtt) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. + * Reset our results. + */ + if (!(dst->mxlock&(1<<RTAX_RTT))) + dst->rtt = 0; + return; + } + + dst_confirm(dst); + + m = dst->rtt - tp->srtt; + + /* If newly calculated rtt larger than stored one, + * store new one. Otherwise, use EWMA. Remember, + * rtt overestimation is always better than underestimation. + */ + if (!(dst->mxlock&(1<<RTAX_RTT))) { + if (m <= 0) + dst->rtt = tp->srtt; + else + dst->rtt -= (m>>3); + } + + if (!(dst->mxlock&(1<<RTAX_RTTVAR))) { + if (m < 0) + m = -m; + + /* Scale deviation to rttvar fixed point */ + m >>= 1; + if (m < tp->mdev) + m = tp->mdev; + + if (m >= dst->rttvar) + dst->rttvar = m; + else + dst->rttvar -= (dst->rttvar - m)>>2; + } + + if (tp->snd_ssthresh == 0x7FFFFFFF) { + /* Slow start still did not finish. 
*/ + if (dst->ssthresh && + !(dst->mxlock&(1<<RTAX_SSTHRESH)) && + tp->snd_cwnd > dst->ssthresh) + dst->ssthresh = tp->snd_cwnd; + if (!(dst->mxlock&(1<<RTAX_CWND)) && + tp->snd_cwnd > dst->cwnd) + dst->cwnd = tp->snd_cwnd; + } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) { + /* Cong. avoidance phase, cwnd is reliable. */ + if (!(dst->mxlock&(1<<RTAX_SSTHRESH))) + dst->ssthresh = tp->snd_cwnd; + if (!(dst->mxlock&(1<<RTAX_CWND))) + dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1; + } else { + /* Else slow start did not finish, cwnd is non-sense, + ssthresh may be also invalid. + */ + if (!(dst->mxlock&(1<<RTAX_CWND))) + dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1; + if (dst->ssthresh && + !(dst->mxlock&(1<<RTAX_SSTHRESH)) && + tp->snd_ssthresh > dst->ssthresh) + dst->ssthresh = tp->snd_ssthresh; + } + } +} + +/* Initialize metrics on socket. */ + +static void tcp_init_metrics(struct sock *sk) { - /* From draft-ietf-tcplw-high-performance: the correct - * test is last_ack_sent <= end_seq. - * (RFC1323 stated last_ack_sent < end_seq.) + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst == NULL) + goto reset; + + dst_confirm(dst); + + if (dst->rtt == 0) + goto reset; + + if (!tp->srtt || !tp->saw_tstamp) + goto reset; + + /* Initial rtt is determined from SYN,SYN-ACK. + * The segment is small and rtt may appear much + * less than real one. Use per-dst memory + * to make it more realistic. * - * HOWEVER: The current check contradicts the draft statements. - * It has been done for good reasons. - * The implemented check improves security and eliminates - * unnecessary RTT overestimation. - * 1998/06/27 Andrey V. Savochkin <saw@msu.ru> + * A bit of theory. RTT is time passed after "normal" sized packet + * is sent until it is ACKed. In normal curcumstances sending small + * packets force peer to delay ACKs and calculation is correct too. + * The algorithm is adaptive and, provided we follow specs, it + * NEVER underestimate RTT. BUT! If peer tries to make some clever + * tricks sort of "quick acks" for time long enough to decrease RTT + * to low value, and then abruptly stops to do it and starts to delay + * ACKs, wait for troubles. */ - if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) && - !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) { + if (dst->rtt > tp->srtt) + tp->srtt = dst->rtt; + if (dst->rttvar > tp->mdev) + tp->mdev = dst->rttvar; + tcp_set_rto(tp); + tcp_bound_rto(tp); + + if (dst->mxlock&(1<<RTAX_CWND)) + tp->snd_cwnd_clamp = dst->cwnd; + if (dst->ssthresh) { + tp->snd_ssthresh = dst->ssthresh; + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + return; + + +reset: + /* Play conservative. If timestamps are not + * supported, TCP will fail to recalculate correct + * rtt, if initial rto is too small. FORGET ALL AND RESET! + */ + if (!tp->saw_tstamp && tp->srtt) { + tp->srtt = 0; + tp->mdev = TCP_TIMEOUT_INIT; + tp->rto = TCP_TIMEOUT_INIT; + } +} + +#define PAWS_24DAYS (60 * 60 * 24 * 24) + + +/* WARNING: this must not be called if tp->saw_tstamp was false. */ +extern __inline__ void +tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) +{ + if (!after(seq, tp->last_ack_sent)) { /* PAWS bug workaround wrt. ACK frames, the PAWS discard * extra check below makes sure this can only happen * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps + * and RSTs with bad timestamp option. 
--ANK */ - if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) { + + if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || + xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) { tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = tcp_time_stamp; + tp->ts_recent_stamp = xtime.tv_sec; } } } -#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) - -extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len) +extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) { - /* ts_recent must be younger than 24 days */ - return (((s32)(tcp_time_stamp - tp->ts_recent_stamp) >= PAWS_24DAYS) || - (((s32)(tp->rcv_tsval - tp->ts_recent) < 0) && - /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */ - (len != (th->doff * 4)))); + return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && + xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS + + /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM + + I cannot see quitely as all the idea behind PAWS + is destroyed 8) + + The problem is only in reordering duplicate ACKs. + Hence, we can check this rare case more carefully. + + 1. Check that it is really duplicate ACK (ack==snd_una) + 2. Give it some small "replay" window (~RTO) + + We do not know units of foreign ts values, but make conservative + assumption that they are >=1ms. It solves problem + noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK + */ + && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq || + TCP_SKB_CB(skb)->ack_seq != tp->snd_una || + !skb->h.th->ack || + (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ)); } @@ -283,13 +443,14 @@ static void tcp_reset(struct sock *sk) case TCP_CLOSE_WAIT: sk->err = EPIPE; break; + case TCP_CLOSE: + return; default: sk->err = ECONNRESET; }; tcp_set_state(sk, TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - if (!sk->dead) - sk->state_change(sk); + tcp_clear_xmit_timers(sk); + tcp_done(sk); } /* This tags the retransmission queue when SACKs arrive. */ @@ -345,7 +506,6 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); - int saw_mss = 0; ptr = (unsigned char *)(th + 1); tp->saw_tstamp = 0; @@ -370,11 +530,11 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i case TCPOPT_MSS: if(opsize==TCPOLEN_MSS && th->syn) { u16 in_mss = ntohs(*(__u16 *)ptr); - if (in_mss == 0) - in_mss = 536; - if (tp->mss_clamp > in_mss) + if (in_mss) { + if (tp->user_mss && tp->user_mss < in_mss) + in_mss = tp->user_mss; tp->mss_clamp = in_mss; - saw_mss = 1; + } } break; case TCPOPT_WINDOW: @@ -428,8 +588,6 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i length-=opsize; }; } - if(th->syn && saw_mss == 0) - tp->mss_clamp = 536; } /* Fast parse options. This hopes to only see timestamps. @@ -448,8 +606,10 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*++ptr); - tp->rcv_tsecr = ntohl(*++ptr); + ++ptr; + tp->rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rcv_tsecr = ntohl(*ptr); return 1; } } @@ -461,6 +621,7 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. 
*/ #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged new data. */ static __inline__ void clear_fast_retransmit(struct tcp_opt *tp) { @@ -498,6 +659,8 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; tp->snd_cwnd = (tp->snd_ssthresh + 3); tp->high_seq = tp->snd_nxt; if(!tp->fackets_out) @@ -595,11 +758,12 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; tp->snd_cwnd_cnt=0; } else tp->snd_cwnd_cnt++; - } + } } /* Remove acknowledged frames from the retransmission queue. */ @@ -645,9 +809,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, if(tp->fackets_out) tp->fackets_out--; } else { + acked |= FLAG_SYN_ACKED; /* This is pure paranoia. */ tp->retrans_head = NULL; - } + } tp->packets_out--; *seq = scb->seq; *seq_rtt = now - scb->when; @@ -721,7 +886,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, * See draft-ietf-tcplw-high-performance-00, section 3.3. * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> */ - if (!(flag & FLAG_DATA_ACKED)) + if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED))) return; seq_rtt = tcp_time_stamp - tp->rcv_tsecr; @@ -856,7 +1021,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, * where the network delay has increased suddenly. * I.e. Karn's algorithm. (SIGCOMM '87, p5.) */ - if (flag & FLAG_DATA_ACKED) { + if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) { if(!(flag & FLAG_RETRANS_DATA_ACKED)) { tp->backoff = 0; tcp_rtt_estimator(tp, seq_rtt); @@ -910,37 +1075,50 @@ uninteresting_ack: } /* New-style handling of TIME_WAIT sockets. */ -extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); -extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); -extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); /* Must be called only from BH context. */ void tcp_timewait_kill(struct tcp_tw_bucket *tw) { - struct tcp_bind_bucket *tb = tw->tb; - - SOCKHASH_LOCK_WRITE_BH(); - - /* Disassociate with bind bucket. */ - if(tw->bind_next) - tw->bind_next->bind_pprev = tw->bind_pprev; - *(tw->bind_pprev) = tw->bind_next; - if (tb->owners == NULL) { - if (tb->next) - tb->next->pprev = tb->pprev; - *(tb->pprev) = tb->next; - kmem_cache_free(tcp_bucket_cachep, tb); - } + struct tcp_ehash_bucket *ehead; + struct tcp_bind_hashbucket *bhead; + struct tcp_bind_bucket *tb; /* Unlink from established hashes. */ + ehead = &tcp_ehash[tw->hashent]; + write_lock(&ehead->lock); + if (!tw->pprev) { + write_unlock(&ehead->lock); + return; + } if(tw->next) tw->next->pprev = tw->pprev; - *tw->pprev = tw->next; + *(tw->pprev) = tw->next; + tw->pprev = NULL; + write_unlock(&ehead->lock); - SOCKHASH_UNLOCK_WRITE_BH(); + /* Disassociate with bind bucket. */ + bhead = &tcp_bhash[tcp_bhashfn(tw->num)]; + spin_lock(&bhead->lock); + if ((tb = tw->tb) != NULL) { + if(tw->bind_next) + tw->bind_next->bind_pprev = tw->bind_pprev; + *(tw->bind_pprev) = tw->bind_next; + tw->tb = NULL; + if (tb->owners == NULL) { + if (tb->next) + tb->next->pprev = tb->pprev; + *(tb->pprev) = tb->next; + kmem_cache_free(tcp_bucket_cachep, tb); + } + } + spin_unlock(&bhead->lock); - /* Ok, now free it up. 
*/ - kmem_cache_free(tcp_timewait_cachep, tw); +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&tw->refcnt) != 1) { + printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt)); + } +#endif + tcp_tw_put(tw); } /* We come here as a special case from the AF specific TCP input processing, @@ -949,9 +1127,36 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) * entire timeout period. The only special cases are for BSD TIME_WAIT * reconnects and SYN/RST bits being set in the TCP header. */ -int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, unsigned len) + +/* + * * Main purpose of TIME-WAIT state is to close connection gracefully, + * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN + * (and, probably, tail of data) and one or more our ACKs are lost. + * * What is TIME-WAIT timeout? It is associated with maximal packet + * lifetime in the internet, which results in wrong conclusion, that + * it is set to catch "old duplicate segments" wandering out of their path. + * It is not quite correct. This timeout is calculated so that it exceeds + * maximal retransmision timeout enough to allow to lose one (or more) + * segments sent by peer and our ACKs. This time may be calculated from RTO. + * * When TIME-WAIT socket receives RST, it means that another end + * finally closed and we are allowed to kill TIME-WAIT too. + * * Second purpose of TIME-WAIT is catching old duplicate segments. + * Well, certainly it is pure paranoia, but if we load TIME-WAIT + * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. + * * If we invented some more clever way to catch duplicates + * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. + * + * The algorithm below is based on FORMAL INTERPRETATION of RFCs. + * When you compare it to RFCs, please, read section SEGMENT ARRIVES + * from the very beginning. + */ +enum tcp_tw_status +tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, + struct tcphdr *th, unsigned len) { + struct tcp_opt tp; + int paws_reject = 0; + /* RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to @@ -965,58 +1170,101 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ - if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) { - struct sock *sk; - struct tcp_func *af_specific = tw->af_specific; - __u32 isn; - int ret; - - isn = tw->rcv_nxt + 128000; - if(isn == 0) - isn++; - tcp_tw_deschedule(tw); - tcp_timewait_kill(tw); - sk = af_specific->get_sock(skb, th); - if(sk == NULL || - !ipsec_sk_policy(sk,skb)) - return 0; - bh_lock_sock(sk); + tp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { + tcp_parse_options(NULL, th, &tp, 0); + + paws_reject = tp.saw_tstamp && + ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 && + xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS); + } + + if (!paws_reject && + (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) { + /* In window segment, it may be only reset or bare ack. */ - /* Default is to discard the frame. */ - ret = 0; + if (th->rst) { +#ifdef CONFIG_TCP_TW_RECYCLE + /* When recycling, always follow rfc1337, + * but mark bucket as ready to recycling immediately. + */ + if (sysctl_tcp_tw_recycle) { + /* May kill it now. 
*/ + tw->rto = 0; + tw->ttd = jiffies; + } else +#endif + /* This is TIME_WAIT assasination, in two flavors. + * Oh well... nobody has a sufficient solution to this + * protocol bug yet. + */ + if(sysctl_tcp_rfc1337 == 0) { + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + } + } else { + tcp_tw_reschedule(tw); + } + + if (tp.saw_tstamp) { + tw->ts_recent = tp.rcv_tsval; + tw->ts_recent_stamp = xtime.tv_sec; + } + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* Out of window segment. - if(sk->lock.users) - goto out_unlock; + All the segments are ACKed immediately. - skb_set_owner_r(skb, sk); - af_specific = sk->tp_pinfo.af_tcp.af_specific; + The only exception is new SYN. We accept it, if it is + not old duplicate and we are not in danger to be killed + by delayed old duplicates. RFC check is that it has + newer sequence number works at rates <40Mbit/sec. + However, if paws works, it is reliable AND even more, + we even may relax silly seq space cutoff. - if(af_specific->conn_request(sk, skb, isn) < 0) - ret = 1; /* Toss a reset back. */ - out_unlock: - bh_unlock_sock(sk); - return ret; + RED-PEN: we violate main RFC requirement, if this SYN will appear + old duplicate (i.e. we receive RST in reply to SYN-ACK), + we must return socket to time-wait state. It is not good, + but not fatal yet. + */ + + if (th->syn && !th->rst && !th->ack && !paws_reject && + (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) || + (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) { + u32 isn = tw->snd_nxt + 2; + if (isn == 0) + isn++; + TCP_SKB_CB(skb)->when = isn; + return TCP_TW_SYN; } - /* Check RST or SYN */ - if(th->rst || th->syn) { - /* This is TIME_WAIT assasination, in two flavors. - * Oh well... nobody has a sufficient solution to this - * protocol bug yet. + if(!th->rst) { + /* In this case we must reset the TIMEWAIT timer. + + If it is ACKless SYN it may be both old duplicate + and new good SYN with random sequence number <rcv_nxt. + Do not reschedule in the last case. */ - if(sysctl_tcp_rfc1337 == 0) { - tcp_tw_deschedule(tw); - tcp_timewait_kill(tw); - } - if(!th->rst) - return 1; /* toss a reset back */ - } else { - /* In this case we must reset the TIMEWAIT timer. */ - if(th->ack) + if (paws_reject || th->ack) { tcp_tw_reschedule(tw); +#ifdef CONFIG_TCP_TW_RECYCLE + tw->rto = min(120*HZ, tw->rto<<1); + tw->ttd = jiffies + tw->rto; +#endif + } + + /* Send ACK. Note, we do not put the bucket, + * it will be released by caller. + */ + return TCP_TW_ACK; } - return 0; /* Discard the frame. */ + tcp_tw_put(tw); + return TCP_TW_SUCCESS; } /* Enter the time wait state. This is always called from BH @@ -1024,37 +1272,54 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * relevant info into it from the SK, and mess with hash chains * and list linkage. */ -static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) +static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { + struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent]; + struct tcp_bind_hashbucket *bhead; struct sock **head, *sktw; - /* Step 1: Remove SK from established hash. */ - if(sk->next) - sk->next->pprev = sk->pprev; - *sk->pprev = sk->next; - sk->pprev = NULL; - tcp_reg_zap(sk); - - /* Step 2: Put TW into bind hash where SK was. 
*/ - tw->tb = (struct tcp_bind_bucket *)sk->prev; - if((tw->bind_next = sk->bind_next) != NULL) - sk->bind_next->bind_pprev = &tw->bind_next; - tw->bind_pprev = sk->bind_pprev; - *sk->bind_pprev = (struct sock *)tw; - sk->prev = NULL; + write_lock(&ehead->lock); - /* Step 3: Un-charge protocol socket in-use count. */ - sk->prot->inuse--; + /* Step 1: Remove SK from established hash. */ + if (sk->pprev) { + if(sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + } - /* Step 4: Hash TW into TIMEWAIT half of established hash table. */ - head = &tcp_ehash[sk->hashent + (tcp_ehash_size >> 1)]; + /* Step 2: Hash TW into TIMEWAIT half of established hash table. */ + head = &(ehead + tcp_ehash_size)->chain; sktw = (struct sock *)tw; if((sktw->next = *head) != NULL) (*head)->pprev = &sktw->next; *head = sktw; sktw->pprev = head; + atomic_inc(&tw->refcnt); + + write_unlock(&ehead->lock); + + /* Step 3: Put TW into bind hash. Original socket stays there too. + Note, that any socket with sk->num!=0 MUST be bound in binding + cache, even if it is closed. + */ + bhead = &tcp_bhash[tcp_bhashfn(sk->num)]; + spin_lock(&bhead->lock); + tw->tb = (struct tcp_bind_bucket *)sk->prev; + BUG_TRAP(sk->prev!=NULL); + if ((tw->bind_next = tw->tb->owners) != NULL) + tw->tb->owners->bind_pprev = &tw->bind_next; + tw->tb->owners = (struct sock*)tw; + tw->bind_pprev = &tw->tb->owners; + spin_unlock(&bhead->lock); + + /* Step 4: Un-charge protocol socket in-use count. */ + sk->prot->inuse--; } +/* + * Move a socket to time-wait. + */ void tcp_time_wait(struct sock *sk) { struct tcp_tw_bucket *tw; @@ -1071,8 +1336,16 @@ void tcp_time_wait(struct sock *sk) tw->dport = sk->dport; tw->family = sk->family; tw->reuse = sk->reuse; + tw->hashent = sk->hashent; tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; - tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; + tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; + tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent; + tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp; +#ifdef CONFIG_TCP_TW_RECYCLE + tw->rto = sk->tp_pinfo.af_tcp.rto; + tw->ttd = jiffies + 2*tw->rto; +#endif + atomic_set(&tw->refcnt, 0); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if(tw->family == PF_INET6) { @@ -1085,9 +1358,7 @@ void tcp_time_wait(struct sock *sk) } #endif /* Linkage updates. */ - SOCKHASH_LOCK_WRITE(); - tcp_tw_hashdance(sk, tw); - SOCKHASH_UNLOCK_WRITE(); + __tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ tcp_tw_schedule(tw); @@ -1096,8 +1367,6 @@ void tcp_time_wait(struct sock *sk) if(sk->state == TCP_ESTABLISHED) tcp_statistics.TcpCurrEstab--; sk->state = TCP_CLOSE; - net_reset_timer(sk, TIME_DONE, - min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME)); } else { /* Sorry, we're out of memory, just CLOSE this * socket up. We've got bigger problems than @@ -1106,10 +1375,9 @@ void tcp_time_wait(struct sock *sk) tcp_set_state(sk, TCP_CLOSE); } - /* Prevent rcvmsg/sndmsg calls, and wake people up. 
*/ - sk->shutdown = SHUTDOWN_MASK; - if(!sk->dead) - sk->state_change(sk); + tcp_update_metrics(sk); + tcp_clear_xmit_timers(sk); + tcp_done(sk); } /* @@ -1134,7 +1402,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) tcp_send_ack(sk); if (!sk->dead) { - sk->state_change(sk); + wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket, 1); } @@ -1143,8 +1411,6 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - if (th->rst) - sk->shutdown = SHUTDOWN_MASK; break; case TCP_CLOSE_WAIT: @@ -1161,12 +1427,6 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) /* This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. - * - * This causes a WRITE timeout, which will either - * move on to TIME_WAIT when we timeout, or resend - * the FIN properly (maybe we get rid of that annoying - * FIN lost hang). The TIME_WRITE code is already - * correct for handling this timeout. */ tcp_set_state(sk, TCP_CLOSING); break; @@ -1423,7 +1683,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* Turn on fast path. */ if (skb_queue_len(&tp->out_of_order_queue) == 0) tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | - (0x10 << 16) | + ntohl(TCP_FLAG_ACK) | tp->snd_wnd); return; } @@ -1545,8 +1805,8 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) * Now tell the user we may have some data. */ if (!sk->dead) { - SOCK_DEBUG(sk, "Data wakeup.\n"); - sk->data_ready(sk,0); + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1); } return(1); } @@ -1575,28 +1835,59 @@ static __inline__ void tcp_data_snd_check(struct sock *sk) /* * Adapt the MSS value used to make delayed ack decision to the - * real world. + * real world. + * + * The constant 536 hasn't any good meaning. In IPv4 world + * MTU may be smaller, though it contradicts to RFC1122, which + * states that MSS must be at least 536. + * We use the constant to do not ACK each second + * packet in a stream of tiny size packets. + * It means that super-low mtu links will be aggressively delacked. + * Seems, it is even good. If they have so low mtu, they are weirdly + * slow. + * + * AK: BTW it may be useful to add an option to lock the rcv_mss. + * this way the beowulf people wouldn't need ugly patches to get the + * ack frequencies they want and it would be an elegant way to tune delack. */ static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int len = skb->len, lss; + unsigned int len, lss; - if (len > tp->rcv_mss) - tp->rcv_mss = len; lss = tp->last_seg_size; tp->last_seg_size = 0; - if (len >= 536) { - if (len == lss) - tp->rcv_mss = len; - tp->last_seg_size = len; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->rcv_mss) { + tp->rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len = skb->tail - skb->h.raw; + if (len >= 536 + sizeof(struct tcphdr)) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. 
+ */ + len -= tp->tcp_header_len; + if (len == lss) + tp->rcv_mss = len; + tp->last_seg_size = len; + } } } /* * Check if sending an ack is needed. */ -static __inline__ void __tcp_ack_snd_check(struct sock *sk) +static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1621,12 +1912,12 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) /* We entered "quick ACK" mode or... */ tcp_in_quickack_mode(tp) || /* We have out of order data */ - (skb_peek(&tp->out_of_order_queue) != NULL)) { + (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) { /* Then ack it now */ tcp_send_ack(sk); } else { /* Else, send delayed ack. */ - tcp_send_delayed_ack(tp, HZ/2); + tcp_send_delayed_ack(sk, HZ/2); } } @@ -1637,7 +1928,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) /* We sent a data segment already. */ return; } - __tcp_ack_snd_check(sk); + __tcp_ack_snd_check(sk, 1); } @@ -1767,6 +2058,13 @@ static int prune_queue(struct sock *sk) * complex for anyones sanity. So we don't do it anymore. But * if we are really having our buffer space abused we stop accepting * new receive data. + * + * FIXME: it should recompute SACK state and only remove enough + * buffers to get into bounds again. The current scheme loses + * badly sometimes on links with large RTT, especially when + * the driver has high overhead per skb. + * (increasing the rcvbuf is not enough because it inflates the + * the window too, disabling flow control effectively) -AK */ if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) return 0; @@ -1782,7 +2080,7 @@ static int prune_queue(struct sock *sk) * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. - * - Out of order segments arrived. + * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received @@ -1790,6 +2088,7 @@ static int prune_queue(struct sock *sk) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) + * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. @@ -1801,12 +2100,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int queued; - u32 flg; /* * Header prediction. - * The code follows the one in the famous + * The code losely follows the one in the famous * "30 instruction TCP receive" Van Jacobson mail. * * Van's trick is to deposit buffers into socket queue @@ -1819,39 +2116,63 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - /* - * RFC1323: H1. Apply PAWS check first. - */ - if (tcp_fast_parse_options(sk, th, tp)) { - if (tp->saw_tstamp) { - if (tcp_paws_discard(tp, th, len)) { - tcp_statistics.TcpInErrs++; - if (!th->rst) { - tcp_send_ack(sk); - goto discard; - } - } - tcp_replace_ts_recent(sk, tp, - TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - } - } - flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16); + /* RED-PEN. Using static variables to pass function arguments + * cannot be good idea... 
+ */ + tp->saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_predition is to be made * 'S' will always be tp->tcp_header_len >> 2 - * '?' will be 0 else it will be !0 - * (when there are holes in the receive + * '?' will be 0 for the fast path, otherwise pred_flags is 0 to + * turn it off (when there are holes in the receive * space for instance) * PSH flag is ignored. - */ + */ + + if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags && + TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + int tcp_header_len = th->doff*4; + + /* Timestamp header prediction */ + + /* Non-standard header f.e. SACKs -> slow path */ + if (tcp_header_len != tp->tcp_header_len) + goto slow_path; + + /* Check timestamp */ + if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { + __u32 *ptr = (__u32 *)(th + 1); - if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (len <= th->doff*4) { + /* No? Slow path! */ + if (*ptr != __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) + goto slow_path; + + tp->saw_tstamp = 1; + ++ptr; + tp->rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rcv_tsecr = ntohl(*ptr); + + /* If PAWS failed, check it more carefully in slow path */ + if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0) + goto slow_path; + + /* Predicted packet is in window by definition. + seq == rcv_nxt and last_ack_sent <= rcv_nxt. + Hence, check seq<=last_ack_sent reduces to: + */ + if (tp->rcv_nxt == tp->last_ack_sent) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; + } + } + + if (len <= tcp_header_len) { /* Bulk data transfer: sender */ - if (len == th->doff*4) { + if (len == tcp_header_len) { tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); @@ -1864,12 +2185,14 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { /* Bulk data transfer: receiver */ - __skb_pull(skb,th->doff*4); + __skb_pull(skb,tcp_header_len); + /* Is it possible to simplify this? */ tcp_measure_rcv_mss(sk, skb); /* DO NOT notify forward progress here. * It saves dozen of CPU instructions in fast path. --ANK + * And where is it signaled then ? -AK */ __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -1877,14 +2200,37 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* FIN bit check is not done since if FIN is set in * this frame, the pred_flags won't match up. -DaveM */ - sk->data_ready(sk, 0); + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1); tcp_delack_estimator(tp); tcp_remember_ack(tp, th, skb); - __tcp_ack_snd_check(sk); + __tcp_ack_snd_check(sk, 0); return 0; } + /* Packet is in sequence, flags are trivial; + * only ACK is strange or we are tough on memory. + * Jump to step 5. + */ + goto step5; + } + +slow_path: + /* + * RFC1323: H1. Apply PAWS check first. + */ + if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && + tcp_paws_discard(tp, skb)) { + if (!th->rst) { + tcp_send_ack(sk); + goto discard; + } + /* Resets are accepted even if PAWS failed. + + ts_recent update must be made after we are sure + that the packet is in window. 
+ */ } /* @@ -1909,44 +2255,34 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto discard; } + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + if (tp->saw_tstamp) { + tcp_replace_ts_recent(sk, tp, + TCP_SKB_CB(skb)->seq); + } + if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); tcp_statistics.TcpInErrs++; tcp_reset(sk); return 1; } - - if(th->rst) { - tcp_reset(sk); - goto discard; - } +step5: if(th->ack) tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); /* Process urgent data. */ tcp_urg(sk, th, len); + { /* step 7: process the segment text */ - queued = tcp_data(skb, sk, len); + int queued = tcp_data(skb, sk, len); - /* This must be after tcp_data() does the skb_pull() to - * remove the header size from skb->len. - * - * Dave!!! Phrase above (and all about rcv_mss) has - * nothing to do with reality. rcv_mss must measure TOTAL - * size, including sacks, IP options etc. Hence, measure_rcv_mss - * must occure before pulling etc, otherwise it will flap - * like hell. Even putting it before tcp_data is wrong, - * it should use skb->tail - skb->nh.raw instead. - * --ANK (980805) - * - * BTW I broke it. Now all TCP options are handled equally - * in mss_clamp calculations (i.e. ignored, rfc1122), - * and mss_cache does include all of them (i.e. tstamps) - * except for sacks, to calulate effective mss faster. - * --ANK (980805) - */ tcp_measure_rcv_mss(sk, skb); /* Be careful, tcp_data() may have put this into TIME_WAIT. */ @@ -1959,76 +2295,541 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, discard: kfree_skb(skb); } + } return 0; } + +/* This is not only more efficient than what we used to do, it eliminates + * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM + * + * Actually, we could lots of memory writes here. tp of listening + * socket contains all necessary default parameters. + */ +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) +{ + struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0); + + if(newsk != NULL) { + struct tcp_opt *newtp; +#ifdef CONFIG_FILTER + struct sk_filter *filter; +#endif + + memcpy(newsk, sk, sizeof(*newsk)); + newsk->state = TCP_SYN_RECV; + + /* SANITY */ + newsk->pprev = NULL; + newsk->prev = NULL; + + /* Clone the TCP header template */ + newsk->dport = req->rmt_port; + + sock_lock_init(newsk); + + atomic_set(&newsk->rmem_alloc, 0); + skb_queue_head_init(&newsk->receive_queue); + atomic_set(&newsk->wmem_alloc, 0); + skb_queue_head_init(&newsk->write_queue); + atomic_set(&newsk->omem_alloc, 0); + + newsk->done = 0; + newsk->proc = 0; + newsk->backlog.head = newsk->backlog.tail = NULL; + skb_queue_head_init(&newsk->error_queue); + newsk->write_space = tcp_write_space; +#ifdef CONFIG_FILTER + if ((filter = newsk->filter) != NULL) + sk_filter_charge(newsk, filter); +#endif + + /* Now setup tcp_opt */ + newtp = &(newsk->tp_pinfo.af_tcp); + newtp->pred_flags = 0; + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->snd_nxt = req->snt_isn + 1; + newtp->snd_una = req->snt_isn + 1; + newtp->srtt = 0; + newtp->ato = 0; + newtp->snd_wl1 = req->rcv_isn; + newtp->snd_wl2 = req->snt_isn; + + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. 
+ */ + newtp->snd_wnd = ntohs(skb->h.th->window); + + newtp->max_window = newtp->snd_wnd; + newtp->pending = 0; + newtp->retransmits = 0; + newtp->last_ack_sent = req->rcv_isn + 1; + newtp->backoff = 0; + newtp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = 2; + + newtp->rto = TCP_TIMEOUT_INIT; + newtp->packets_out = 0; + newtp->fackets_out = 0; + newtp->retrans_out = 0; + newtp->high_seq = 0; + newtp->snd_ssthresh = 0x7fffffff; + newtp->snd_cwnd_cnt = 0; + newtp->dup_acks = 0; + newtp->delayed_acks = 0; + init_timer(&newtp->retransmit_timer); + newtp->retransmit_timer.function = &tcp_retransmit_timer; + newtp->retransmit_timer.data = (unsigned long) newsk; + init_timer(&newtp->delack_timer); + newtp->delack_timer.function = &tcp_delack_timer; + newtp->delack_timer.data = (unsigned long) newsk; + skb_queue_head_init(&newtp->out_of_order_queue); + newtp->send_head = newtp->retrans_head = NULL; + newtp->rcv_wup = req->rcv_isn + 1; + newtp->write_seq = req->snt_isn + 1; + newtp->copied_seq = req->rcv_isn + 1; + + newtp->saw_tstamp = 0; + + init_timer(&newtp->probe_timer); + newtp->probe_timer.function = &tcp_probe_timer; + newtp->probe_timer.data = (unsigned long) newsk; + newtp->probes_out = 0; + newtp->syn_seq = req->rcv_isn; + newtp->fin_seq = req->rcv_isn; + newtp->urg_data = 0; + tcp_synq_init(newtp); + newtp->syn_backlog = 0; + if (skb->len >= 536) + newtp->last_seg_size = skb->len; + + /* Back to base struct sock members. */ + newsk->err = 0; + newsk->ack_backlog = 0; + newsk->max_ack_backlog = SOMAXCONN; + newsk->priority = 0; + atomic_set(&newsk->refcnt, 1); + atomic_inc(&inet_sock_nr); + + spin_lock_init(&sk->timer_lock); + init_timer(&newsk->timer); + newsk->timer.function = &tcp_keepalive_timer; + newsk->timer.data = (unsigned long) newsk; + if (newsk->keepopen) + tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); + newsk->socket = NULL; + newsk->sleep = NULL; + + newtp->tstamp_ok = req->tstamp_ok; + if((newtp->sack_ok = req->sack_ok) != 0) + newtp->num_sacks = 0; + newtp->window_clamp = req->window_clamp; + newtp->rcv_wnd = req->rcv_wnd; + newtp->wscale_ok = req->wscale_ok; + if (newtp->wscale_ok) { + newtp->snd_wscale = req->snd_wscale; + newtp->rcv_wscale = req->rcv_wscale; + } else { + newtp->snd_wscale = newtp->rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp,65535); + } + if (newtp->tstamp_ok) { + newtp->ts_recent = req->ts_recent; + newtp->ts_recent_stamp = xtime.tv_sec; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + newtp->mss_clamp = req->mss; + } + return newsk; +} + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + + /* - * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented - * as an open_request. + * Process an incoming packet for SYN_RECV sockets represented + * as an open_request. 
*/ -struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, - struct open_request *req) +struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, + struct open_request *req, + struct open_request *prev) { + struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - u32 flg; + u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + int paws_reject = 0; + struct tcp_opt ttp; - /* assumption: the socket is not in use. - * as we checked the user count on tcp_rcv and we're - * running from a soft interrupt. + /* If socket has already been created, process + packet in its context. + + We fall here only due to race, when packets were enqueued + to backlog of listening socket. */ + if (req->sk) + return req->sk; - /* Check for syn retransmission */ - flg = *(((u32 *)skb->h.th) + 3); - - flg &= __constant_htonl(0x00170000); - /* Only SYN set? */ - if (flg == __constant_htonl(0x00020000)) { - if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) { - /* retransmited syn. + ttp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2)) { + + tcp_parse_options(NULL, th, &ttp, 0); + + paws_reject = ttp.saw_tstamp && + (s32)(ttp.rcv_tsval - req->ts_recent) < 0; + } + + /* Check for pure retransmited SYN. */ + if (TCP_SKB_CB(skb)->seq == req->rcv_isn && + flg == TCP_FLAG_SYN && + !paws_reject) { + /* + * RFC793 draws (Incorrectly! It was fixed in RFC1122) + * this case on figure 6 and figure 8, but formal + * protocol description says NOTHING. + * To be more exact, it says that we should send ACK, + * because this segment (at least, if it has no data) + * is out of window. + * + * CONCLUSION: RFC793 (even with RFC1122) DOES NOT + * describe SYN-RECV state. All the description + * is wrong, we cannot believe to it and should + * rely only on common sense and implementation + * experience. + * + * Enforce "SYN-ACK" according to figure 8, figure 6 + * of RFC793, fixed by RFC1122. + */ + req->class->rtx_syn_ack(sk, req); + return NULL; + } + + /* Further reproduces section "SEGMENT ARRIVES" + for state SYN-RECEIVED of RFC793. + It is broken, however, it does not work only + when SYNs are crossed, which is impossible in our + case. + + But generally, we should (RFC lies!) to accept ACK + from SYNACK both here and in tcp_rcv_state_process(). + tcp_rcv_state_process() does not, hence, we do not too. + + Note that the case is absolutely generic: + we cannot optimize anything here without + violating protocol. All the checks must be made + before attempt to create socket. + */ + + /* RFC793: "first check sequence number". */ + + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) { + /* Out of window: send ACK and drop. */ + if (!(flg & TCP_FLAG_RST)) + req->class->send_ack(skb, req); + return NULL; + } + + /* In sequence, PAWS is OK. */ + + if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) + req->ts_recent = ttp.rcv_tsval; + + if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { + /* Truncate SYN, it is out of window starting + at req->rcv_isn+1. 
*/ + flg &= ~TCP_FLAG_SYN; + } + + /* RFC793: "second check the RST bit" and + * "fourth, check the SYN bit" + */ + if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) + goto embryonic_reset; + + /* RFC793: "fifth check the ACK field" */ + + if (!(flg & TCP_FLAG_ACK)) + return NULL; + + /* Invalid ACK: reset will be sent by listening socket */ + if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1) + return sk; + + /* OK, ACK is valid, create big socket and + feed this segment to it. It will repeat all + the tests. THIS SEGMENT MUST MOVE SOCKET TO + ESTABLISHED STATE. If it will be dropped after + socket is created, wait for troubles. + */ + sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (sk == NULL) + return NULL; + + tcp_dec_slow_timer(TCP_SLT_SYNACK); + req->sk = sk; + return sk; + +embryonic_reset: + tcp_synq_unlink(tp, req, prev); + tp->syn_backlog--; + tcp_dec_slow_timer(TCP_SLT_SYNACK); + + net_statistics.EmbryonicRsts++; + if (!(flg & TCP_FLAG_RST)) + req->class->send_reset(skb); + + req->class->destructor(req); + tcp_openreq_free(req); + return NULL; +} + +static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tcp_parse_options(sk, th, tp, 0); + +#ifdef CONFIG_TCP_TW_RECYCLE + if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst && + (s32)(tp->rcv_tsval - tp->ts_recent) < 0 && + xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) { + /* Old duplicate segment. We remember last + ts_recent from this host in timewait bucket. + + Actually, we could implement per host cache + to truncate timewait state after RTO. Paranoidal arguments + of rfc1337 are not enough to close this nice possibility. + */ + if (net_ratelimit()) + printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n"); + if (th->ack) + return 1; + goto discard; + } +#endif + + if (th->ack) { + /* rfc793: + * "If the state is SYN-SENT then + * first check the ACK bit + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send + * a reset (unless the RST bit is set, if so drop + * the segment and return)" + * + * I cite this place to emphasize one essential + * detail, this check is different of one + * in established state: SND.UNA <= SEG.ACK <= SND.NXT. + * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT, + * because we have no previous data sent before SYN. + * --ANK(990513) + * + * We do not send data with SYN, so that RFC-correct + * test reduces to: + */ + if (sk->zapped || + TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + return 1; + + /* Now ACK is acceptable. + * + * "If the RST bit is set + * If the ACK was acceptable then signal the user "error: + * connection reset", drop the segment, enter CLOSED state, + * delete TCB, and return." + */ + + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* rfc793: + * "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + * + * See note below! + * --ANK(990513) + */ + if (!th->syn) + goto discard; + + /* rfc793: + * "If the SYN bit is on ... + * are acceptable then ... + * (our SYN has been ACKed), change the connection + * state to ESTABLISHED..." + * + * Do you see? SYN-less ACKs in SYN-SENT state are + * completely ignored. 
+ * + * The bug causing stalled SYN-SENT sockets + * was here: tcp_ack advanced snd_una and canceled + * retransmit timer, so that bare ACK received + * in SYN-SENT state (even with invalid ack==ISS, + * because tcp_ack check is too weak for SYN-SENT) + * causes moving socket to invalid semi-SYN-SENT, + * semi-ESTABLISHED state and connection hangs. + * + * There exist buggy stacks, which really send + * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) + * Actually, if this host did not try to get something + * from ftp.inr.ac.ru I'd never find this bug 8) + * + * --ANK (990514) + * + * I was wrong, I apologize. Bare ACK is valid. + * Actually, RFC793 requires to send such ACK + * in reply to any out of window packet. + * It is wrong, but Linux also does it sometimes. + * --ANK (990724) + */ + + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len); + + /* Ok.. it's good. Set up sequence numbers and + * move to established. + */ + tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq+1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; + tp->fin_seq = TCP_SKB_CB(skb)->seq; + + tcp_set_state(sk, TCP_ESTABLISHED); + + if (tp->wscale_ok == 0) { + tp->snd_wscale = tp->rcv_wscale = 0; + tp->window_clamp = min(tp->window_clamp,65535); + } + + if (tp->tstamp_ok) { + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else + tp->tcp_header_len = sizeof(struct tcphdr); + if (tp->saw_tstamp) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; + } + tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_initialize_rcv_mss(sk); + tcp_init_metrics(sk); + + if (tp->write_pending) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * How to make this correctly? */ - req->class->rtx_syn_ack(sk, req); - return NULL; + tp->delayed_acks++; + if (tp->ato == 0) + tp->ato = tp->rto; + tcp_send_delayed_ack(sk, tp->rto); } else { - return sk; /* Pass new SYN to the listen socket. */ + tcp_send_ack(sk); } + + tp->copied_seq = tp->rcv_nxt; + + if(!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket, 0); + } + return -1; } - /* We know it's an ACK here */ - if (req->sk) { - /* socket already created but not - * yet accepted()... + /* No ACK in the segment */ + + if (th->rst) { + /* rfc793: + * "If the RST bit is set + * + * Otherwise (no ACK) drop the segment and return." */ - sk = req->sk; - } else { - /* In theory the packet could be for a cookie, but - * TIME_WAIT should guard us against this. - * XXX: Nevertheless check for cookies? - * This sequence number check is done again later, - * but we do it here to prevent syn flood attackers - * from creating big SYN_RECV sockets. - */ - if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) || - !between(TCP_SKB_CB(skb)->seq, req->rcv_isn, - req->rcv_isn+1+req->rcv_wnd)) { - req->class->send_reset(skb); - return NULL; + + goto discard; + } + + if (th->syn) { + /* We see SYN without ACK. It is attempt of + * simultaneous connect with crossed SYNs. + * + * The previous version of the code + * checked for "connecting to self" + * here. that check is done now in + * tcp_connect. + * + * RED-PEN: BTW, it does not. 
8) + */ + tcp_set_state(sk, TCP_SYN_RECV); + if (tp->saw_tstamp) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; } - - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - if (sk == NULL) - return NULL; - - req->expires = 0UL; - req->sk = sk; + + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + + tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_initialize_rcv_mss(sk); + + tcp_send_synack(sk); +#if 0 + /* Note, we could accept data and URG from this segment. + * There are no obstacles to make this. + * + * However, if we ignore data in ACKless segments sometimes, + * we have no reasons to accept it sometimes. + * Also, seems the code doing it in step6 of tcp_rcv_state_process + * is not flawless. So, discard packet for sanity. + * Uncomment this return to process the data. + */ + return -1; +#endif } - skb_orphan(skb); - skb_set_owner_r(skb, sk); - return sk; + /* "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + */ + +discard: + kfree_skb(skb); + return 0; } + /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. @@ -2042,6 +2843,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; + tp->saw_tstamp = 0; + switch (sk->state) { case TCP_CLOSE: /* When state == CLOSED, hash lookup always fails. @@ -2061,35 +2864,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * a TCP_CLOSE socket does not exist. Drop the frame * and send a RST back to the other end. */ - return 1; - case TCP_LISTEN: - /* These use the socket TOS.. - * might want to be the received TOS + /* 1. The socket may be moved to TIME-WAIT state. + 2. While this socket was locked, another socket + with the same identity could be created. + 3. To continue? + + CONCLUSION: discard and only discard! + + Alternative would be relookup and recurse into tcp_v?_rcv + (not *_do_rcv) to work with timewait and listen states + correctly. */ - if(th->ack) { - struct sock *realsk; - int ret; + goto discard; - realsk = tp->af_specific->get_sock(skb, th); - if(realsk == sk) - return 1; + case TCP_LISTEN: + if(th->ack) + return 1; - bh_lock_sock(realsk); - ret = 0; - if(realsk->lock.users != 0) { - skb_orphan(skb); - sk_add_backlog(realsk, skb); - } else { - ret = tcp_rcv_state_process(realsk, skb, - skb->h.th, skb->len); - } - bh_unlock_sock(realsk); - return ret; - } - if(th->syn) { - if(tp->af_specific->conn_request(sk, skb, 0) < 0) + if(tp->af_specific->conn_request(sk, skb) < 0) return 1; /* Now we have several options: In theory there is @@ -2110,172 +2904,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ goto discard; } - goto discard; - break; case TCP_SYN_SENT: - /* SYN sent means we have to look for a suitable ack and - * either reset for bad matches or go to connected. - * The SYN_SENT case is unusual and should - * not be in line code. 
[AC] - */ - if(th->ack) { - /* rfc793: - * "If the state is SYN-SENT then - * first check the ACK bit - * If the ACK bit is set - * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send - * a reset (unless the RST bit is set, if so drop - * the segment and return)" - * - * I cite this place to emphasize one essential - * detail, this check is different of one - * in established state: SND.UNA <= SEG.ACK <= SND.NXT. - * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT, - * because we have no previous data sent before SYN. - * --ANK(990513) - * - * We do not send data with SYN, so that RFC-correct - * test reduces to: - */ - if (sk->zapped || - TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) - return 1; - - /* Now ACK is acceptable. - * - * "If the RST bit is set - * If the ACK was acceptable then signal the user "error: - * connection reset", drop the segment, enter CLOSED state, - * delete TCB, and return." - */ - - if (th->rst) { - tcp_reset(sk); - goto discard; - } - - /* rfc793: - * "fifth, if neither of the SYN or RST bits is set then - * drop the segment and return." - * - * See note below! - * --ANK(990513) - */ - - if (!th->syn) - goto discard; - - /* rfc793: - * "If the SYN bit is on ... - * are acceptable then ... - * (our SYN has been ACKed), change the connection - * state to ESTABLISHED..." - * - * Do you see? SYN-less ACKs in SYN-SENT state are - * completely ignored. - * - * The bug causing stalled SYN-SENT sockets - * was here: tcp_ack advanced snd_una and canceled - * retransmit timer, so that bare ACK received - * in SYN-SENT state (even with invalid ack==ISS, - * because tcp_ack check is too weak for SYN-SENT) - * causes moving socket to invalid semi-SYN-SENT, - * semi-ESTABLISHED state and connection hangs. - * - * There exist buggy stacks, which really send - * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) - * Actually, if this host did not try to get something - * from ftp.inr.ac.ru I'd never find this bug 8) - * - * --ANK (990514) - */ - - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len); - - /* Ok.. it's good. Set up sequence numbers and - * move to established. - */ - tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1; - tp->rcv_wup = TCP_SKB_CB(skb)->seq+1; - - /* RFC1323: The window in SYN & SYN/ACK segments is - * never scaled. - */ - tp->snd_wnd = htons(th->window); - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; - tp->fin_seq = TCP_SKB_CB(skb)->seq; - - tcp_set_state(sk, TCP_ESTABLISHED); - tcp_parse_options(sk, th, tp, 0); - - if (tp->wscale_ok == 0) { - tp->snd_wscale = tp->rcv_wscale = 0; - tp->window_clamp = min(tp->window_clamp,65535); - } - - if (tp->tstamp_ok) { - tp->tcp_header_len = - sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - } else - tp->tcp_header_len = sizeof(struct tcphdr); - if (tp->saw_tstamp) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = tcp_time_stamp; - } - - /* Can't be earlier, doff would be wrong. */ - tcp_send_ack(sk); - - sk->dport = th->source; - tp->copied_seq = tp->rcv_nxt; - - if(!sk->dead) { - sk->state_change(sk); - sock_wake_async(sk->socket, 0); - } - } else { - if(th->syn && !th->rst) { - /* The previous version of the code - * checked for "connecting to self" - * here. that check is done now in - * tcp_connect. 
- */ - tcp_set_state(sk, TCP_SYN_RECV); - tcp_parse_options(sk, th, tp, 0); - if (tp->saw_tstamp) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = tcp_time_stamp; - } - - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; - - /* RFC1323: The window in SYN & SYN/ACK segments is - * never scaled. - */ - tp->snd_wnd = htons(th->window); - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - - tcp_send_synack(sk); - } else - break; - } - - /* tp->tcp_header_len and tp->mss_clamp - probably changed, synchronize mss. - */ - tcp_sync_mss(sk, tp->pmtu_cookie); - tp->rcv_mss = tp->mss_cache; - - if (sk->state == TCP_SYN_RECV) - goto discard; - - goto step6; + queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + if (queued >= 0) + return queued; + queued = 0; + goto step6; } /* Parse the tcp_options present on this header. @@ -2283,23 +2919,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Note that this really has to be here and not later for PAWS * (RFC1323) to work. */ - if (tcp_fast_parse_options(sk, th, tp)) { - /* NOTE: assumes saw_tstamp is never set if we didn't - * negotiate the option. tcp_fast_parse_options() must - * guarantee this. - */ - if (tp->saw_tstamp) { - if (tcp_paws_discard(tp, th, len)) { - tcp_statistics.TcpInErrs++; - if (!th->rst) { - tcp_send_ack(sk); - goto discard; - } - } - tcp_replace_ts_recent(sk, tp, - TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); + if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && + tcp_paws_discard(tp, skb)) { + if (!th->rst) { + tcp_send_ack(sk); + goto discard; } + /* Reset is accepted even if it did not pass PAWS. */ } /* The silly FIN test here is necessary to see an advancing ACK in @@ -2313,11 +2939,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * At this point the connection will deadlock with host1 believing * that his FIN is never ACK'd, and thus it will retransmit it's FIN * forever. The following fix is from Taral (taral@taral.net). + * + * RED-PEN. Seems, the above is not true. + * If at least one end is RFC compliant, it will send ACK to + * out of window FIN and, hence, move peer to TIME-WAIT. + * I comment out this line. --ANK + * + * RED-PEN. DANGER! tcp_sequence check rejects also SYN-ACKs + * received in SYN-RECV. The problem is that description of + * segment processing in SYN-RECV state in RFC792 is WRONG. + * Correct check would accept ACK from this SYN-ACK, see + * figures 6 and 8 (fixed by RFC1122). Compare this + * to problem with FIN, they smell similarly. 
--ANK */ /* step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) && - !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) { + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) +#if 0 + && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) +#endif + ) { if (!th->rst) { tcp_send_ack(sk); } @@ -2330,6 +2971,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } + if (tp->saw_tstamp) { + tcp_replace_ts_recent(sk, tp, + TCP_SKB_CB(skb)->seq); + } + /* step 3: check security and precedence [ignored] */ /* step 4: @@ -2357,22 +3003,36 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (th->ack) { int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); - + switch(sk->state) { case TCP_SYN_RECV: if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); - sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; - if(!sk->dead) - sk->state_change(sk); + /* Note, that this wakeup is only for marginal + crossed SYN case. Passively open sockets + are not waked up, because sk->sleep == NULL + and sk->socket == NULL. + */ + if (!sk->dead && sk->sleep) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket, 1); + } tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = htons(th->window) << tp->snd_wscale; tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; + /* tcp_ack considers this ACK as duplicate + * and does not calculate rtt. It is wrong. + * Fix it at least with timestamps. + */ + if (tp->saw_tstamp && !tp->srtt) + tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED); + + tcp_init_metrics(sk); } else { SOCK_DEBUG(sk, "bad ack\n"); return 1; @@ -2386,7 +3046,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (!sk->dead) sk->state_change(sk); else - tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); + tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); + dst_confirm(sk->dst_cache); } break; @@ -2399,10 +3060,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { - sk->shutdown = SHUTDOWN_MASK; tcp_set_state(sk,TCP_CLOSE); - if (!sk->dead) - sk->state_change(sk); + tcp_update_metrics(sk); + tcp_done(sk); goto discard; } break; @@ -2444,8 +3104,11 @@ step6: break; } - tcp_data_snd_check(sk); - tcp_ack_snd_check(sk); + /* tcp_data could move socket to TIME-WAIT */ + if (sk->state != TCP_CLOSE) { + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + } if (!queued) { discard: |
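
A few of the techniques in this merge, restated as stand-alone sketches outside the patch.

The hunk touching tcp_set_rto() keeps the extra snd_cwnd-dependent term while its new in-line comment openly doubts it. Stripped of that term, the computation is just "smoothed RTT plus deviation, clamped". A minimal sketch, assuming a 100 Hz tick and the 120 s upper clamp (the 200 ms floor is visible in tcp_bound_rto() above; the cap is an assumption, though 120*HZ also appears in the TW-recycle hunk):

#include <stdio.h>

#define HZ 100                   /* assumed tick rate for the sketch */
#define TCP_RTO_MAX (120 * HZ)   /* assumed upper clamp              */

/* rto = srtt/8 + mdev: smoothed RTT (kept 8x scaled) plus the
 * deviation term, then bounded. The snd_cwnd-dependent term from
 * the hunk is left out, as the new in-line comment distrusts it.
 */
unsigned int rto_from_estimates(unsigned int srtt, unsigned int mdev)
{
    unsigned int rto = (srtt >> 3) + mdev;

    if (rto > TCP_RTO_MAX)
        rto = TCP_RTO_MAX;
    if (rto < HZ / 5)            /* 200 ms floor, as in tcp_bound_rto() */
        rto = HZ / 5;
    return rto;
}

int main(void)
{
    /* 300 ms smoothed RTT (8x scaled) and 50 ms deviation at HZ=100. */
    printf("rto = %u jiffies\n", rto_from_estimates(8 * 30, 5));
    return 0;
}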
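The new tcp_update_metrics() feeds the finished connection's RTT back into the route cache with a deliberately asymmetric EWMA: an increase is taken whole, a decrease only at gain 1/8 (1/4 for rttvar), so the cached value stays biased toward overestimation, the safe direction for RTO. A stand-alone restatement of the dst->rtt branch, with plain unsigned ints standing in for the dst and tcp_opt fields:

/* Asymmetric EWMA from tcp_update_metrics(): increases are taken
 * whole, decreases decay at gain 1/8, so the cached RTT errs high.
 * Both values are in the kernel's 8x fixed point.
 */
unsigned int ewma_dst_rtt(unsigned int dst_rtt, unsigned int srtt)
{
    int m = (int)dst_rtt - (int)srtt;

    return (m <= 0) ? srtt : dst_rtt - (unsigned int)(m >> 3);
}

Starting from a cached 800 and a sample of 1600, the cache jumps straight to 1600; subsequent samples of 800 then walk it back down by one eighth of the remaining gap per connection.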
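The reworked PAWS logic hinges on two idioms visible throughout the hunks: wrap-safe comparison of 32-bit timestamps via (s32)(a - b) < 0, and the 24-day freshness bound on ts_recent, now kept in xtime.tv_sec rather than tcp_time_stamp. A self-contained sketch of just that core predicate (the duplicate-ACK replay-window refinement from tcp_paws_discard() is omitted):

#include <stdint.h>
#include <time.h>

#define PAWS_24DAYS (60 * 60 * 24 * 24)

/* Wrap-safe "a precedes b" for 32-bit TCP timestamps: the
 * (s32)(a - b) < 0 idiom used throughout the patch.
 */
int ts_before(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b) < 0;
}

/* Core PAWS test: reject only if the arriving timestamp is older
 * than ts_recent AND ts_recent itself is fresh (under 24 days),
 * so a long-idle connection cannot be wedged by a stale value.
 */
int paws_reject(uint32_t rcv_tsval, uint32_t ts_recent,
                time_t ts_recent_stamp, time_t now)
{
    return ts_before(rcv_tsval, ts_recent) &&
           now < ts_recent_stamp + PAWS_24DAYS;
}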
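tcp_timewait_state_process() now returns a verdict (TCP_TW_SUCCESS, TCP_TW_ACK, or TCP_TW_SYN) and lets the caller send the ACK or pass the SYN on, instead of acting itself. A toy classifier mirroring the order of its checks; in_window, paws_ok, and new_syn abstract the real sequence, PAWS, and duplicate-SYN tests, and the bucket-lifetime side effects (reschedule, kill, recycle) are ignored:

/* Verdicts as introduced by the patch; the caller acts on them. */
enum tcp_tw_status { TCP_TW_SUCCESS, TCP_TW_ACK, TCP_TW_SYN };

/* Toy classifier following the order of checks in
 * tcp_timewait_state_process(). syn, rst, ack are header flags.
 */
enum tcp_tw_status tw_verdict(int syn, int rst, int ack,
                              int in_window, int paws_ok, int new_syn)
{
    if (paws_ok && in_window)
        return TCP_TW_SUCCESS;  /* bare ACK or RST: absorbed here      */
    if (syn && !rst && !ack && paws_ok && new_syn)
        return TCP_TW_SYN;      /* acceptable fresh SYN: reopen        */
    if (!rst)
        return TCP_TW_ACK;      /* out of window: caller re-ACKs       */
    return TCP_TW_SUCCESS;      /* out-of-window RST: dropped          */
}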
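Finally, the rewritten fast path in tcp_rcv_established() compares the fourth 32-bit word of the TCP header against tp->pred_flags in a single masked test, per the "0xS?10 << 16 + snd_wnd" comment. A sketch of that layout, written in host byte order for clarity (the kernel builds and compares pred_flags in network order):

#include <stdint.h>

/* Fourth 32-bit word of the TCP header, host byte order for the
 * sketch: doff in the top nibble, flags in bits 16-23, window in
 * the low 16 bits.
 */
#define TCP_FLAG_ACK      0x00100000u
#define TCP_FLAG_PSH      0x00080000u
#define TCP_RESERVED_BITS 0x0f000000u

/* pred_flags: header length in 32-bit words in the top nibble,
 * ACK set, expected window in the low 16 bits.
 */
uint32_t make_pred_flags(unsigned int header_len_bytes, uint16_t snd_wnd)
{
    return ((uint32_t)(header_len_bytes >> 2) << 28)
           | TCP_FLAG_ACK | snd_wnd;
}

/* One masked compare replaces per-flag tests on the fast path;
 * PSH and the reserved bits are deliberately ignored, and any
 * other flag (SYN, FIN, RST, URG) forces a mismatch into the
 * slow path.
 */
int header_predicted(uint32_t flag_word, uint32_t pred_flags)
{
    return (flag_word & ~(TCP_RESERVED_BITS | TCP_FLAG_PSH)) == pred_flags;
}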