author    | Ralf Baechle <ralf@linux-mips.org> | 2000-02-18 00:24:27 +0000
committer | Ralf Baechle <ralf@linux-mips.org> | 2000-02-18 00:24:27 +0000
commit    | b9558d5f86c471a125abf1fb3a3882fb053b1f8c (patch)
tree      | 707b53ec64e740a7da87d5f36485e3cd9b1c794e /net/ipv4/tcp_input.c
parent    | b3ac367c7a3e6047abe74817db27e34e759f279f (diff)
Merge with Linux 2.3.41.
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 1370 |
1 file changed, 902 insertions, 468 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b4ae64a2..d61a5df02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.177 2000/01/09 02:19:39 davem Exp $ + * Version: $Id: tcp_input.c,v 1.183 2000/01/24 18:40:33 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -70,9 +70,6 @@ #define SYNC_INIT 1 #endif -extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_keepalive_time; - /* These are on by default so the code paths get tested. * For the final 2.2 this may be undone at our discretion. -DaveM */ @@ -83,10 +80,108 @@ int sysctl_tcp_sack = 1; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; -int sysctl_tcp_tw_recycle; +int sysctl_tcp_tw_recycle = 1; +int sysctl_tcp_abort_on_overflow = 0; +int sysctl_tcp_max_orphans = NR_FILE; +int sysctl_tcp_max_tw_buckets = NR_FILE*2; static int prune_queue(struct sock *sk); +/* + * Adapt the MSS value used to make delayed ack decision to the + * real world. + * + * The constant 536 hasn't any good meaning. In IPv4 world + * MTU may be smaller, though it contradicts to RFC1122, which + * states that MSS must be at least 536. + * We use the constant to do not ACK each second + * packet in a stream of tiny size packets. + * It means that super-low mtu links will be aggressively delacked. + * Seems, it is even good. If they have so low mtu, they are weirdly + * slow. + * + * AK: BTW it may be useful to add an option to lock the rcv_mss. + * this way the beowulf people wouldn't need ugly patches to get the + * ack frequencies they want and it would be an elegant way to tune delack. + */ +static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb) +{ + unsigned int len, lss; + + lss = tp->ack.last_seg_size; + tp->ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->ack.rcv_mss) { + tp->ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len = skb->tail - skb->h.raw; + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr)) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. + */ + len -= tp->tcp_header_len; + if (len == lss) + tp->ack.rcv_mss = len; + tp->ack.last_seg_size = len; + } + +#if 0 + /* Tiny-grams with PSH set artifically deflate our + * ato measurement. + * + * Mmm... I copied this test from tcp_remember_ack(), but + * I did not understand this. Is it to speedup nagling sender? + * It does not because classic (non-Minshall) sender nagles + * guided by not-acked frames not depending on size. + * And it does not help NODELAY sender, because latency + * is too high in any case. The only result is timer trashing + * and redundant ACKs. Grr... Seems, I missed something. --ANK + * + * Let me to comment out this yet... TCP should work + * perfectly without this. 
--ANK + */ + if (len < (tp->ack.rcv_mss >> 1) && skb->h.th->psh) + tp->ack.ato = TCP_ATO_MIN; +#endif + } +} + + +static __inline__ void tcp_enter_quickack_mode(struct tcp_opt *tp) +{ + unsigned quickacks = tcp_receive_window(tp)/(2*tp->ack.rcv_mss); + + tp->ack.quick = max(min(quickacks, 127), 1); + + if (!tp->tstamp_ok && tp->ack.quick>2) { + /* Quick ACKs are _dangerous_, if RTTM is not used. + * See comment in tcp_init_metrics(). We still help + * them to overcome the most difficult, initial + * phase of slow start. + */ + tp->ack.quick = 2; + } +} + +/* Send ACKs quickly, if "quick" count is not ehausted + * and the session is not interactive. + */ + +static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) +{ + return (tp->ack.quick && !tp->ack.pingpong); +} + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The @@ -97,53 +192,52 @@ static int prune_queue(struct sock *sk); * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_delack_estimator(struct tcp_opt *tp) +static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb) { - if(tp->ato == 0) { - tp->lrcvtime = tcp_time_stamp; + u32 now; - /* Help sender leave slow start quickly, - * and also makes sure we do not take this - * branch ever again for this connection. + tcp_measure_rcv_mss(tp, skb); + + tp->ack.pending = 1; + + now = tcp_time_stamp; + + if (!tp->ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. */ - tp->ato = 1; + + /* Help sender leave slow start quickly. */ tcp_enter_quickack_mode(tp); + + /* Pingpong is off, session is not interactive by default */ + tp->ack.pingpong = 0; + + /* ATO is minimal */ + tp->ack.ato = TCP_ATO_MIN; } else { - int m = tcp_time_stamp - tp->lrcvtime; - - tp->lrcvtime = tcp_time_stamp; - if(m <= 0) - m = 1; - if(m > tp->rto) - tp->ato = tp->rto; - else { - /* This funny shift makes sure we - * clear the "quick ack mode" bit. + int m = now - tp->ack.lrcvtime; + + if (m > TCP_ATO_MAX/2) { + /* Do not touch ATO, if interval is out of bounds. + * It will be deflated by delack timer, if our peer + * really sends too rarely. */ - tp->ato = ((tp->ato << 1) >> 2) + m; + if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. + */ + tcp_enter_quickack_mode(tp); + } + } else { + if (m <= 0) + m = TCP_ATO_MIN/2; + tp->ack.ato = (tp->ack.ato >> 1) + m; } } + tp->ack.lrcvtime = now; } -/* - * Remember to send an ACK later. - */ -static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, - struct sk_buff *skb) -{ - tp->delayed_acks++; - - /* Tiny-grams with PSH set artifically deflate our - * ato measurement, but with a lower bound. - */ - if(th->psh && (skb->len < (tp->rcv_mss >> 1))) { - /* Preserve the quickack state. */ - if((tp->ato & 0x7fffffff) > HZ/50) - tp->ato = ((tp->ato & 0x80000000) | - (HZ/50)); - } -} - /* Called to compute a smoothed rtt estimate. 
The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -209,10 +303,10 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp) */ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { - if (tp->rto > 120*HZ) - tp->rto = 120*HZ; - if (tp->rto < HZ/5) - tp->rto = HZ/5; + if (tp->rto < TCP_RTO_MIN) + tp->rto = TCP_RTO_MIN; + else if (tp->rto > TCP_RTO_MAX) + tp->rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. @@ -224,7 +318,9 @@ static void tcp_update_metrics(struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - if (dst) { + dst_confirm(dst); + + if (dst && (dst->flags&DST_HOST)) { int m; if (tp->backoff || !tp->srtt) { @@ -237,8 +333,6 @@ static void tcp_update_metrics(struct sock *sk) return; } - dst_confirm(dst); - m = dst->rtt - tp->srtt; /* If newly calculated rtt larger than stored one, @@ -308,10 +402,18 @@ static void tcp_init_metrics(struct sock *sk) dst_confirm(dst); + if (dst->mxlock&(1<<RTAX_CWND)) + tp->snd_cwnd_clamp = dst->cwnd; + if (dst->ssthresh) { + tp->snd_ssthresh = dst->ssthresh; + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + if (dst->rtt == 0) goto reset; - if (!tp->srtt || !tp->saw_tstamp) + if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -334,14 +436,9 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev = dst->rttvar; tcp_set_rto(tp); tcp_bound_rto(tp); - - if (dst->mxlock&(1<<RTAX_CWND)) - tp->snd_cwnd_clamp = dst->cwnd; - if (dst->ssthresh) { - tp->snd_ssthresh = dst->ssthresh; - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } + if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp) + goto reset; + tp->snd_cwnd = tcp_init_cwnd(tp); return; @@ -357,9 +454,6 @@ reset: } } -#define PAWS_24DAYS (60 * 60 * 24 * 24) - - /* WARNING: this must not be called if tp->saw_tstamp was false. */ extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) @@ -374,7 +468,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) */ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || - xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) { + xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = xtime.tv_sec; } @@ -384,7 +478,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) { return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS + xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM @@ -411,8 +505,13 @@ extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { u32 end_window = tp->rcv_wup + tp->rcv_wnd; +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif - if (tp->rcv_wnd && + if (rcv_wnd && after(end_seq, tp->rcv_nxt) && before(seq, end_window)) return 1; @@ -424,8 +523,13 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* This functions checks to see if the tcp header is actually acceptable. 
*/ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif if (seq == tp->rcv_nxt) - return (tp->rcv_wnd || (end_seq == seq)); + return (rcv_wnd || (end_seq == seq)); return __tcp_sequence(tp, seq, end_seq); } @@ -433,8 +537,6 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ static void tcp_reset(struct sock *sk) { - sk->zapped = 1; - /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->state) { case TCP_SYN_SENT: @@ -447,9 +549,8 @@ static void tcp_reset(struct sock *sk) return; default: sk->err = ECONNRESET; - }; - tcp_set_state(sk, TCP_CLOSE); - tcp_clear_xmit_timers(sk); + } + tcp_done(sk); } @@ -658,17 +759,18 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - tp->snd_cwnd = (tp->snd_ssthresh + 3); - tp->high_seq = tp->snd_nxt; + __tcp_enter_cong_avoid(tp); + /* ... and account for 3 ACKs, which are + * already received to this time. + */ + tp->snd_cwnd += 3; + if(!tp->fackets_out) tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); else tcp_fack_retransmit(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else if (++tp->dup_acks > 3) { /* 2. Each time another duplicate ACK arrives, increment @@ -733,7 +835,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (ack != tp->snd_una && before(ack, tp->high_seq)) { tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else { /* FACK style, fill any remaining holes in @@ -752,7 +854,8 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) { if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; } else { /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd @@ -826,23 +929,23 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* Our probe was answered. */ - tp->probes_out = 0; - /* Was it a usable window open? */ - /* should always be non-null */ - if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { - tp->backoff = 0; - tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - } else { - tcp_reset_xmit_timer(sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); + if (tp->send_head != NULL) { + if (!after(TCP_SKB_CB(tp->send_head)->end_seq, ack + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* If packets_out==0, socket must be waked up by + * subsequent tcp_data_snd_check(). This function is + * not for random using! + */ + } else if (!tp->packets_out) { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } } } - + /* Should we open up the congestion window? 
*/ static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) { @@ -914,18 +1017,30 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); +#ifdef TCP_DEBUG + /* It occured in 2.3, because of racy timers. Namely, + * retransmit timer did not check packets_out and retransmitted + * send_head sometimes and, hence, messed all the write_queue. + * Now it is impossible, I bet. --ANK + */ + if (skb == NULL) { + printk("Sucks! packets_out=%d, sk=%p, %d\n", tp->packets_out, sk, sk->state); + return; + } +#endif + /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. The * congestion window is handled properly by that code. */ if (tp->retransmits) { tcp_xmit_retransmit_queue(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } else { __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); if ((__s32)when < 0) when = 1; - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when); } } @@ -938,13 +1053,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 seq = 0; u32 seq_rtt = 0; - if(sk->zapped) - return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - tp->probes_out = 0; - - tp->rcv_tstamp = tcp_time_stamp; + if(sk->state == TCP_CLOSE) + return 1; /* Dead, can't ack any more so why bother */ /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -953,10 +1063,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, goto uninteresting_ack; /* If there is data set flag 1 */ - if (len != th->doff*4) { + if (len != th->doff*4) flag |= FLAG_DATA; - tcp_delack_estimator(tp); - } /* Update our send window. */ @@ -970,31 +1078,53 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { flag |= FLAG_WIN_UPDATE; - tp->snd_wnd = nwin; + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } tp->snd_wl1 = ack_seq; tp->snd_wl2 = ack; - - if (nwin > tp->max_window) - tp->max_window = nwin; } } + /* BEWARE! From this place and until return from this function + * snd_nxt and snd_wnd are out of sync. All the routines, called + * from here must get "ack" as argument or they should not depend + * on right edge of window. It is _UGLY_. It cries to be fixed. --ANK + */ + /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->err_soft = 0; + tp->probes_out = 0; + tp->rcv_tstamp = tcp_time_stamp; + + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tp->pending == TIME_PROBE0) + if (tcp_timer_is_set(sk, TCP_TIME_PROBE0)) tcp_ack_probe(sk, ack); - /* See if we can take anything off of the retransmit queue. 
*/ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); - /* We must do this here, before code below clears out important * state contained in tp->fackets_out and tp->retransmits. -DaveM */ @@ -1036,7 +1166,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (flag & FLAG_DATA_ACKED) tcp_ack_packets_out(sk, tp); } else { - tcp_clear_xmit_timer(sk, TIME_RETRANS); + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); } flag &= (FLAG_DATA | FLAG_WIN_UPDATE); @@ -1074,9 +1204,42 @@ uninteresting_ack: return 0; } +int tcp_paws_check(struct tcp_opt *tp, int rst) +{ + if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) + return 0; + if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) + return 0; + + /* RST segments are not recommended to carry timestamp, + and, if they do, it is recommended to ignore PAWS because + "their cleanup function should take precedence over timestamps." + Certainly, it is mistake. It is necessary to understand the reasons + of this constraint to relax it: if peer reboots, clock may go + out-of-sync and half-open connections will not be reset. + Actually, the problem would be not existing if all + the implementations followed draft about maintaining clock + via reboots. Linux-2.2 DOES NOT! + + However, we can relax time bounds for RST segments to MSL. + */ + if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL) + return 0; + return 1; +} + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + /* New-style handling of TIME_WAIT sockets. */ -/* Must be called only from BH context. */ +/* Must be called with locally disabled BHs. */ void tcp_timewait_kill(struct tcp_tw_bucket *tw) { struct tcp_ehash_bucket *ehead; @@ -1121,13 +1284,6 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) tcp_tw_put(tw); } -/* We come here as a special case from the AF specific TCP input processing, - * and the SKB has no owner. Essentially handling this is very simple, - * we just keep silently eating rx'd packets until none show up for the - * entire timeout period. The only special cases are for BSD TIME_WAIT - * reconnects and SYN/RST bits being set in the TCP header. - */ - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -1149,6 +1305,12 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. 
--ANK */ enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, @@ -1157,7 +1319,75 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct tcp_opt tp; int paws_reject = 0; - /* RFC 1122: + tp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { + tcp_parse_options(NULL, th, &tp, 0); + + if (tp.saw_tstamp) { + tp.ts_recent = tw->ts_recent; + tp.ts_recent_stamp = tw->ts_recent_stamp; + paws_reject = tcp_paws_check(&tp, th->rst); + } + } + + if (tw->substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq) + goto kill_with_rst; + + /* Dup ACK? */ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt)) { + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) { +kill_with_rst: + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->substate = TCP_TIME_WAIT; + tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp.saw_tstamp) { + tw->ts_recent_stamp = xtime.tv_sec; + tw->ts_recent = tp.rcv_tsval; + } + + /* I am shamed, but failed to make it more elegant. + * Yes, it is direct reference to IP, which is impossible + * to generalize to IPv6. Taking into account that IPv6 + * do not undertsnad recycling in any case, it not + * a big problem in practice. --ANK */ + if (tw->family == AF_INET && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp && + tcp_v4_tw_remember_stamp(tw)) + tcp_tw_schedule(tw, tw->timeout); + else + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. + * + * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: @@ -1171,47 +1401,31 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * to be an old duplicate". */ - tp.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { - tcp_parse_options(NULL, th, &tp, 0); - - paws_reject = tp.saw_tstamp && - ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 && - xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS); - } - if (!paws_reject && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { -#ifdef CONFIG_TCP_TW_RECYCLE - /* When recycling, always follow rfc1337, - * but mark bucket as ready to recycling immediately. - */ - if (sysctl_tcp_tw_recycle) { - /* May kill it now. */ - tw->rto = 0; - tw->ttd = jiffies; - } else -#endif /* This is TIME_WAIT assasination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. 
*/ - if(sysctl_tcp_rfc1337 == 0) { + if (sysctl_tcp_rfc1337 == 0) { +kill: tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_SUCCESS; } - } else { - tcp_tw_reschedule(tw); } + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); if (tp.saw_tstamp) { tw->ts_recent = tp.rcv_tsval; tw->ts_recent_stamp = xtime.tv_sec; } + tcp_tw_put(tw); return TCP_TW_SUCCESS; } @@ -1235,7 +1449,7 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) || - (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) { + (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) { u32 isn = tw->snd_nxt + 2; if (isn == 0) isn++; @@ -1243,20 +1457,18 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SYN; } + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); + if(!th->rst) { /* In this case we must reset the TIMEWAIT timer. - - If it is ACKless SYN it may be both old duplicate - and new good SYN with random sequence number <rcv_nxt. - Do not reschedule in the last case. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number <rcv_nxt. + * Do not reschedule in the last case. */ - if (paws_reject || th->ack) { - tcp_tw_reschedule(tw); -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = min(120*HZ, tw->rto<<1); - tw->ttd = jiffies + tw->rto; -#endif - } + if (paws_reject || th->ack) + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -1267,8 +1479,8 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is always called from BH - * context. Essentially we whip up a timewait bucket, copy the +/* Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the * relevant info into it from the SK, and mess with hash chains * and list linkage. */ @@ -1286,6 +1498,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; + sock_prot_dec_use(sk->prot); } /* Step 2: Hash TW into TIMEWAIT half of established hash table. */ @@ -1312,41 +1525,49 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) tw->tb->owners = (struct sock*)tw; tw->bind_pprev = &tw->tb->owners; spin_unlock(&bhead->lock); - - /* Step 4: Un-charge protocol socket in-use count. */ - sock_prot_dec_use(sk->prot); } /* - * Move a socket to time-wait. + * Move a socket to time-wait or dead fin-wait-2 state. */ -void tcp_time_wait(struct sock *sk) +void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw; + struct tcp_tw_bucket *tw = NULL; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int recycle_ok = 0; + + if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp) + recycle_ok = tp->af_specific->remember_stamp(sk); + + if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); if(tw != NULL) { + int rto = (tp->rto<<2) - (tp->rto>>1); + /* Give us an identity. 
*/ tw->daddr = sk->daddr; tw->rcv_saddr = sk->rcv_saddr; tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; + tw->substate = state; tw->sport = sk->sport; tw->dport = sk->dport; tw->family = sk->family; tw->reuse = sk->reuse; - tw->hashent = sk->hashent; - tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; - tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; - tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent; - tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp; -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = sk->tp_pinfo.af_tcp.rto; - tw->ttd = jiffies + 2*tw->rto; -#endif + tw->rcv_wscale = tp->rcv_wscale; atomic_set(&tw->refcnt, 0); + tw->hashent = sk->hashent; + tw->rcv_nxt = tp->rcv_nxt; + tw->snd_nxt = tp->snd_nxt; + tw->rcv_wnd = tcp_receive_window(tp); + tw->syn_seq = tp->syn_seq; + tw->ts_recent = tp->ts_recent; + tw->ts_recent_stamp= tp->ts_recent_stamp; + tw->pprev_death = NULL; + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if(tw->family == PF_INET6) { memcpy(&tw->v6_daddr, @@ -1361,22 +1582,28 @@ void tcp_time_wait(struct sock *sk) __tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ - tcp_tw_schedule(tw); + if (timeo < rto) + timeo = rto; - /* CLOSE the SK. */ - if(sk->state == TCP_ESTABLISHED) - tcp_statistics[smp_processor_id()*2].TcpCurrEstab--; - sk->state = TCP_CLOSE; + if (recycle_ok) { + tw->timeout = rto; + } else { + tw->timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + tcp_tw_schedule(tw, timeo); } else { - /* Sorry, we're out of memory, just CLOSE this + /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ - tcp_set_state(sk, TCP_CLOSE); + if (net_ratelimit()) + printk(KERN_INFO "TCP: time wait bucket table overflow\n"); } tcp_update_metrics(sk); - tcp_clear_xmit_timers(sk); tcp_done(sk); } @@ -1397,10 +1624,13 @@ void tcp_time_wait(struct sock *sk) static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->fin_seq = TCP_SKB_CB(skb)->end_seq; tcp_send_ack(sk); + sk->shutdown |= RCV_SHUTDOWN; + switch(sk->state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: @@ -1427,7 +1657,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these @@ -1435,9 +1665,17 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) */ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); break; - } + }; + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tp->sack_ok) + tp->num_sacks = 0; + if (!sk->dead) { - wake_up_interruptible(sk->sleep); + sk->state_change(sk); sock_wake_async(sk->socket, 1, POLL_HUP); } } @@ -1622,6 +1860,7 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; } + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. 
*/ @@ -1658,6 +1897,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct sk_buff *skb1; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int eaten = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. @@ -1665,33 +1905,68 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { /* Ok. In sequence. */ - queue_and_out: + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + tp->ucopy.len && + sk->lock.users && + !tp->urg_data) { + int chunk = min(skb->len, tp->ucopy.len); + + local_bh_enable(); + if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) { + sk->err = EFAULT; + sk->error_report(sk); + } + local_bh_disable(); + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len && !skb->h.th->fin); + } + + if (!eaten) { +queue_and_out: + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->receive_queue, skb); + } dst_confirm(sk->dst_cache); - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) { + if(skb->len) + tcp_event_data_recv(tp, skb); + if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); - } else { - tcp_remember_ack(tp, skb->h.th, skb); - } + /* This may have eaten into a SACK block. */ if(tp->sack_ok && tp->num_sacks) tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); /* Turn on fast path. */ - if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | - ntohl(TCP_FLAG_ACK) | - tp->snd_wnd); + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (eaten) + kfree_skb(skb); + + if (!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1, POLL_IN); + } return; } - + /* An old packet, either a retransmit or some packet got lost. */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - /* A retransmit, 2nd most common case. Force an imediate ack. */ - SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); + /* A retransmit, 2nd most common case. Force an imediate ack. + * + * It is impossible, seq is checked by top level. + */ + NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq)); tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; kfree_skb(skb); return; } @@ -1706,15 +1981,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks++; - tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; /* Disable header prediction. */ tp->pred_flags = 0; + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + skb_set_owner_r(skb, sk); + if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. */ if(tp->sack_ok) { @@ -1758,6 +2035,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } } } + return; } @@ -1767,7 +2045,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * room, then we will just have to discard the packet. 
*/ -static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) +static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1777,11 +2055,11 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) - return(0); + goto drop; /* * If our receive queue has grown past its limits shrink it. - * Make sure to do this before moving snd_nxt, otherwise + * Make sure to do this before moving rcv_nxt, otherwise * data might be acked for that we don't have enough room. */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { @@ -1789,7 +2067,7 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) /* Still not enough room. That can happen when * skb->true_size differs significantly from skb->len. */ - return 0; + goto drop; } } @@ -1799,29 +2077,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = tp->copied_seq; } + return; - /* Above, tcp_data_queue() increments delayed_acks appropriately. - * Now tell the user we may have some data. - */ - if (!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,1, POLL_IN); - } - return(1); +drop: + kfree_skb(skb); } static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && - tcp_packets_in_flight(tp) < tp->snd_cwnd) { - /* Put more data onto the wire. */ - tcp_write_xmit(sk); - } else if (tp->packets_out == 0 && !tp->pending) { - /* Start probing the receivers window. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk)) + tcp_check_probe_timer(sk, tp); } static __inline__ void tcp_data_snd_check(struct sock *sk) @@ -1832,57 +2101,6 @@ static __inline__ void tcp_data_snd_check(struct sock *sk) __tcp_data_snd_check(sk, skb); } -/* - * Adapt the MSS value used to make delayed ack decision to the - * real world. - * - * The constant 536 hasn't any good meaning. In IPv4 world - * MTU may be smaller, though it contradicts to RFC1122, which - * states that MSS must be at least 536. - * We use the constant to do not ACK each second - * packet in a stream of tiny size packets. - * It means that super-low mtu links will be aggressively delacked. - * Seems, it is even good. If they have so low mtu, they are weirdly - * slow. - * - * AK: BTW it may be useful to add an option to lock the rcv_mss. - * this way the beowulf people wouldn't need ugly patches to get the - * ack frequencies they want and it would be an elegant way to tune delack. - */ -static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int len, lss; - - lss = tp->last_seg_size; - tp->last_seg_size = 0; - - /* skb->len may jitter because of SACKs, even if peer - * sends good full-sized frames. - */ - len = skb->len; - if (len >= tp->rcv_mss) { - tp->rcv_mss = len; - } else { - /* Otherwise, we make more careful check taking into account, - * that SACKs block is variable. - * - * "len" is invariant segment length, including TCP header. 
- */ - len = skb->tail - skb->h.raw; - if (len >= 536 + sizeof(struct tcphdr)) { - /* Subtract also invariant (if peer is RFC compliant), - * tcp header plus fixed timestamp option length. - * Resulting "len" is MSS free of SACK jitter. - */ - len -= tp->tcp_header_len; - if (len == lss) - tp->rcv_mss = len; - tp->last_seg_size = len; - } - } -} - /* * Check if sending an ack is needed. */ @@ -1904,26 +2122,25 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * start in an expediant manner. */ - /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || - /* We will update the window "significantly" or... */ - tcp_raise_window(sk) || - /* We entered "quick ACK" mode or... */ + /* More than one full frame received or... */ + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) || + /* We ACK each frame or... */ tcp_in_quickack_mode(tp) || - /* We have out of order data */ - (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) { + /* We have out of order data or */ + (ofo_possible && + skb_peek(&tp->out_of_order_queue) != NULL)) { /* Then ack it now */ tcp_send_ack(sk); } else { /* Else, send delayed ack. */ - tcp_send_delayed_ack(sk, HZ/2); + tcp_send_delayed_ack(sk); } } static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->delayed_acks == 0) { + if (tp->ack.pending == 0) { /* We sent a data segment already. */ return; } @@ -1975,7 +2192,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) */ if (tp->urg_seq == tp->copied_seq) tp->copied_seq++; /* Move the copied sequence on correctly */ - tp->urg_data = URG_NOTYET; + tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; /* Disable header prediction. */ @@ -1992,12 +2209,12 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == URG_NOTYET) { + if (tp->urg_data == TCP_URG_NOTYET) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { - tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } @@ -2014,7 +2231,8 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len static int prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct sk_buff * skb; + struct sk_buff *skb; + int pruned = 0; SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); @@ -2024,7 +2242,9 @@ static int prune_queue(struct sock *sk) skb = __skb_dequeue_tail(&tp->out_of_order_queue); if(skb != NULL) { /* Free it all. */ - do { net_statistics[smp_processor_id()*2].OfoPruned += skb->len; + do { + pruned += skb->len; + net_statistics[smp_processor_id()*2].OfoPruned += skb->len; kfree_skb(skb); skb = __skb_dequeue_tail(&tp->out_of_order_queue); } while(skb != NULL); @@ -2059,13 +2279,47 @@ static int prune_queue(struct sock *sk) * if we are really having our buffer space abused we stop accepting * new receive data. * + * 8) The arguments are interesting, but I even cannot imagine + * what kind of arguments could force us to drop NICE, ALREADY + * RECEIVED DATA only to get one more packet? --ANK + * * FIXME: it should recompute SACK state and only remove enough * buffers to get into bounds again. 
The current scheme loses - * badly sometimes on links with large RTT, especially when - * the driver has high overhead per skb. - * (increasing the rcvbuf is not enough because it inflates the - * the window too, disabling flow control effectively) -AK + * badly sometimes on links with large RTT, especially when + * the driver has high overhead per skb. + * (increasing the rcvbuf is not enough because it inflates the + * the window too, disabling flow control effectively) -AK + * + * Mmm... Why not to scale it seprately then? Just replace + * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale + * and adjust it dynamically, when TCP window flow control + * fails? -ANK + */ + + /* F.e. one possible tactics is: */ + do { + u32 new_clamp = (tp->rcv_nxt-tp->copied_seq) + pruned; + + /* This guy is not a good guy. I bet, he martirized cats, + * when was child and grew up to finished sadist. Clamp him! + */ + if (new_clamp > 3*tp->ack.rcv_mss) + new_clamp -= tp->ack.rcv_mss; + else + new_clamp = 2*tp->ack.rcv_mss; + tp->window_clamp = min(tp->window_clamp, new_clamp); + } while (0); + /* Though it should be made earlier, when we are still not + * congested. This header prediction logic sucks + * without true implementation of VJ algorithm. + * I am really anxious. How was it possible to combine + * header prediction and sending ACKs outside of recvmsg() context? + * They _are_ incompatible. We should not advance window so + * brainlessly and we should not advertise so huge window from the very + * beginning. BTW window "prediction" does not speedup anything! + * SIlly, silly, silly. */ + if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) return 0; @@ -2073,6 +2327,57 @@ static int prune_queue(struct sock *sk) return -1; } +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb->ip_summed==CHECKSUM_UNNECESSARY) + err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk); + else + err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen); + + if (!err) { +update: + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + local_bh_disable(); + return 0; + } + + if (err == -EFAULT) { + sk->err = EFAULT; + sk->error_report(sk); + goto update; + } + + local_bh_disable(); + return err; +} + +static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + int result; + + if (sk->lock.users) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +} + +static __inline__ int +tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete_user(sk, skb); +} + /* * TCP receive function for the ESTABLISHED state. * @@ -2080,7 +2385,33 @@ static int prune_queue(struct sock *sk) * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. - * - Out of order segments arrived. + * [ NOTE: actually, it was made incorrectly and nobody ever noticed + * this! Reason is clear: 1. Correct senders do not send + * to zero window. 2. Even if a sender sends to zero window, + * nothing terrible occurs. + * + * For now I cleaned this and fast path is really always disabled, + * when window is zero, but I would be more happy to remove these + * checks. Code will be only cleaner and _faster_. 
--ANK + * + * Later note. I've just found that slow path also accepts + * out of window segments, look at tcp_sequence(). So... + * it is the last argument: I repair all and comment out + * repaired code by TCP_FORMAL_WINDOW. + * [ I remember one rhyme from a chidren's book. (I apologize, + * the trasnlation is not rhymed 8)): people in one (jewish) village + * decided to build sauna, but divided to two parties. + * The first one insisted that battens should not be dubbed, + * another objected that foots will suffer of splinters, + * the first fended that dubbed wet battens are too slippy + * and people will fall and it is much more serious! + * Certaiinly, all they went to rabbi. + * After some thinking, he judged: "Do not be lazy! + * Certainly, dub the battens! But put them by dubbed surface down." + * ] + * ] + * + * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received @@ -2088,7 +2419,7 @@ static int prune_queue(struct sock *sk) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) - * - Unexpected TCP option. + * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. @@ -2116,7 +2447,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - /* RED-PEN. Using static variables to pass function arguments * cannot be good idea... */ @@ -2133,13 +2463,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - int tcp_header_len = th->doff*4; - - /* Timestamp header prediction */ + int tcp_header_len = tp->tcp_header_len; - /* Non-standard header f.e. SACKs -> slow path */ - if (tcp_header_len != tp->tcp_header_len) - goto slow_path; + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { @@ -2161,8 +2490,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto slow_path; /* Predicted packet is in window by definition. - seq == rcv_nxt and last_ack_sent <= rcv_nxt. - Hence, check seq<=last_ack_sent reduces to: + * seq == rcv_nxt and last_ack_sent <= rcv_nxt. + * Hence, check seq<=last_ack_sent reduces to: */ if (tp->rcv_nxt == tp->last_ack_sent) { tp->ts_recent = tp->rcv_tsval; @@ -2173,6 +2502,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (len <= tcp_header_len) { /* Bulk data transfer: sender */ if (len == tcp_header_len) { + /* We know that such packets are checksummed + * on entry. + */ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); @@ -2182,19 +2514,42 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, TCP_INC_STATS_BH(TcpInErrs); goto discard; } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && - atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { - /* Bulk data transfer: receiver */ - __skb_pull(skb,tcp_header_len); + } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { + int eaten = 0; - /* Is it possible to simplify this? 
*/ - tcp_measure_rcv_mss(sk, skb); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sk->lock.users) { + eaten = 1; + + NET_INC_STATS_BH(TCPHPHitsToUser); + + if (tcp_copy_to_iovec(sk, skb, tcp_header_len)) + goto csum_error; + + __skb_pull(skb,tcp_header_len); + } else { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) + goto step5; + + NET_INC_STATS_BH(TCPHPHits); + + /* Bulk data transfer: receiver */ + __skb_pull(skb,tcp_header_len); + + /* DO NOT notify forward progress here. + * It saves dozen of CPU instructions in fast path. --ANK + * And where is it signaled then ? -AK + * Nowhere. 8) --ANK + */ + __skb_queue_tail(&sk->receive_queue, skb); + skb_set_owner_r(skb, sk); + } - /* DO NOT notify forward progress here. - * It saves dozen of CPU instructions in fast path. --ANK - * And where is it signaled then ? -AK - */ - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; /* FIN bit check is not done since if FIN is set in @@ -2202,27 +2557,43 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket,1, POLL_IN); - tcp_delack_estimator(tp); - tcp_remember_ack(tp, th, skb); + tcp_event_data_recv(tp, skb); +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (eaten) { + if (tcp_in_quickack_mode(tp)) { + tcp_send_ack(sk); + } else { + tcp_send_delayed_ack(sk); + } + } else +#endif __tcp_ack_snd_check(sk, 0); + + if (eaten) + kfree_skb(skb); return 0; } /* Packet is in sequence, flags are trivial; - * only ACK is strange or we are tough on memory. - * Jump to step 5. + * only ACK is strange. Jump to step 5. */ + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; goto step5; } slow_path: + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + /* * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && tcp_paws_discard(tp, skb)) { if (!th->rst) { + NET_INC_STATS_BH(PAWSEstabRejected); tcp_send_ack(sk); goto discard; } @@ -2251,7 +2622,9 @@ slow_path: TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_wup, tp->rcv_wnd); } + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKLost); goto discard; } @@ -2279,11 +2652,8 @@ step5: /* Process urgent data. */ tcp_urg(sk, th, len); - { /* step 7: process the segment text */ - int queued = tcp_data(skb, sk, len); - - tcp_measure_rcv_mss(sk, skb); + tcp_data(skb, sk, len); /* Be careful, tcp_data() may have put this into TIME_WAIT. 
*/ if(sk->state != TCP_CLOSE) { @@ -2291,12 +2661,13 @@ step5: tcp_ack_snd_check(sk); } - if (!queued) { - discard: - kfree_skb(skb); - } - } + return 0; + +csum_error: + TCP_INC_STATS_BH(TcpInErrs); +discard: + kfree_skb(skb); return 0; } @@ -2328,6 +2699,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->dport = req->rmt_port; sock_lock_init(newsk); + bh_lock_sock(newsk); atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); @@ -2351,22 +2723,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->rcv_nxt = req->rcv_isn + 1; newtp->snd_nxt = req->snt_isn + 1; newtp->snd_una = req->snt_isn + 1; - newtp->srtt = 0; - newtp->ato = 0; + newtp->snd_sml = req->snt_isn + 1; + + tcp_delack_init(newtp); + if (skb->len >= 536) + newtp->ack.last_seg_size = skb->len; + + tcp_prequeue_init(newtp); + newtp->snd_wl1 = req->rcv_isn; newtp->snd_wl2 = req->snt_isn; - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. - */ - newtp->snd_wnd = ntohs(skb->h.th->window); - - newtp->max_window = newtp->snd_wnd; - newtp->pending = 0; newtp->retransmits = 0; - newtp->last_ack_sent = req->rcv_isn + 1; newtp->backoff = 0; + newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; + newtp->rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->fackets_out = 0; + newtp->retrans_out = 0; + newtp->snd_ssthresh = 0x7fffffff; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -2374,22 +2751,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, * efficiently to them. -DaveM */ newtp->snd_cwnd = 2; - - newtp->rto = TCP_TIMEOUT_INIT; - newtp->packets_out = 0; - newtp->fackets_out = 0; - newtp->retrans_out = 0; - newtp->high_seq = 0; - newtp->snd_ssthresh = 0x7fffffff; newtp->snd_cwnd_cnt = 0; + newtp->high_seq = 0; + newtp->dup_acks = 0; - newtp->delayed_acks = 0; - init_timer(&newtp->retransmit_timer); - newtp->retransmit_timer.function = &tcp_retransmit_timer; - newtp->retransmit_timer.data = (unsigned long) newsk; - init_timer(&newtp->delack_timer); - newtp->delack_timer.function = &tcp_delack_timer; - newtp->delack_timer.data = (unsigned long) newsk; + tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->send_head = newtp->retrans_head = NULL; newtp->rcv_wup = req->rcv_isn + 1; @@ -2397,31 +2763,25 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->copied_seq = req->rcv_isn + 1; newtp->saw_tstamp = 0; + newtp->last_ack_sent = req->rcv_isn + 1; - init_timer(&newtp->probe_timer); - newtp->probe_timer.function = &tcp_probe_timer; - newtp->probe_timer.data = (unsigned long) newsk; newtp->probes_out = 0; newtp->syn_seq = req->rcv_isn; newtp->fin_seq = req->rcv_isn; newtp->urg_data = 0; - tcp_synq_init(newtp); - newtp->syn_backlog = 0; - if (skb->len >= 536) - newtp->last_seg_size = skb->len; + newtp->listen_opt = NULL; + newtp->accept_queue = NULL; + /* Deinitialize syn_wait_lock to trap illegal accesses. */ + memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); /* Back to base struct sock members. 
*/ newsk->err = 0; - newsk->ack_backlog = 0; - newsk->max_ack_backlog = SOMAXCONN; newsk->priority = 0; atomic_set(&newsk->refcnt, 1); +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); +#endif - spin_lock_init(&sk->timer_lock); - init_timer(&newsk->timer); - newsk->timer.function = &tcp_keepalive_timer; - newsk->timer.data = (unsigned long) newsk; if (newsk->keepopen) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newsk->socket = NULL; @@ -2440,6 +2800,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->snd_wscale = newtp->rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp,65535); } + newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale; + newtp->max_window = newtp->snd_wnd; + if (newtp->tstamp_ok) { newtp->ts_recent = req->ts_recent; newtp->ts_recent_stamp = xtime.tv_sec; @@ -2453,16 +2816,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, return newsk; } -static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) -{ - if (seq == s_win) - return 1; - if (after(end_seq, s_win) && before(seq, e_win)) - return 1; - return (seq == e_win && seq == end_seq); -} - - /* * Process an incoming packet for SYN_RECV sockets represented * as an open_request. @@ -2470,30 +2823,28 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct open_request *req, - struct open_request *prev) + struct open_request **prev) { struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_opt ttp; - - /* If socket has already been created, process - packet in its context. - - We fall here only due to race, when packets were enqueued - to backlog of listening socket. - */ - if (req->sk) - return req->sk; + struct sock *child; ttp.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(NULL, th, &ttp, 0); - paws_reject = ttp.saw_tstamp && - (s32)(ttp.rcv_tsval - req->ts_recent) < 0; + if (ttp.saw_tstamp) { + ttp.ts_recent = req->ts_recent; + /* We do not store true stamp, but it is not required, + * it can be estimated (approximately) + * from another data. + */ + ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + paws_reject = tcp_paws_check(&ttp, th->rst); + } } /* Check for pure retransmited SYN. */ @@ -2517,7 +2868,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->class->rtx_syn_ack(sk, req); + req->class->rtx_syn_ack(sk, req, NULL); return NULL; } @@ -2544,6 +2895,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->class->send_ack(skb, req); + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); return NULL; } @@ -2572,35 +2925,78 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Invalid ACK: reset will be sent by listening socket */ if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1) return sk; - - /* OK, ACK is valid, create big socket and - feed this segment to it. It will repeat all - the tests. THIS SEGMENT MUST MOVE SOCKET TO - ESTABLISHED STATE. If it will be dropped after - socket is created, wait for troubles. 
+ /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. */ - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - if (sk == NULL) + + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { + req->acked = 1; return NULL; + } - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->sk = sk; - return sk; + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; -embryonic_reset: tcp_synq_unlink(tp, req, prev); - tp->syn_backlog--; - tcp_dec_slow_timer(TCP_SLT_SYNACK); + tcp_synq_removed(sk, req); + + tcp_acceptq_queue(sk, req, child); + return child; +listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + req->acked = 1; + return NULL; + } + +embryonic_reset: NET_INC_STATS_BH(EmbryonicRsts); if (!(flg & TCP_FLAG_RST)) req->class->send_reset(skb); - req->class->destructor(req); - tcp_openreq_free(req); + tcp_synq_drop(sk, req, prev); return NULL; } +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->state; + + if (child->lock.users == 0) { + ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->state != state) + parent->data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + return ret; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { @@ -2608,25 +3004,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sk, th, tp, 0); -#ifdef CONFIG_TCP_TW_RECYCLE - if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst && - (s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) { - /* Old duplicate segment. We remember last - ts_recent from this host in timewait bucket. - - Actually, we could implement per host cache - to truncate timewait state after RTO. Paranoidal arguments - of rfc1337 are not enough to close this nice possibility. - */ - if (net_ratelimit()) - printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n"); - if (th->ack) - return 1; - goto discard; - } -#endif - if (th->ack) { /* rfc793: * "If the state is SYN-SENT then @@ -2646,10 +3023,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * We do not send data with SYN, so that RFC-correct * test reduces to: */ - if (sk->zapped || - TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) return 1; + /* Check not from any RFC, but it is evident consequence + * of combining PAWS and usual SYN-SENT logic: ACK _is_ + * checked in SYN-SENT unlike another states, hence + * echoed tstamp must be checked too. 
+ */ + if (tp->saw_tstamp) { + if (tp->rcv_tsecr == 0) { + /* Workaround for bug in linux-2.1 and early + * 2.2 kernels. Let's pretend that we did not + * see such timestamp to avoid bogus rtt value, + * calculated by tcp_ack(). + */ + tp->saw_tstamp = 0; + + /* But do not forget to store peer's timestamp! */ + if (th->syn) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; + } + } else if ((__s32)(tp->rcv_tsecr - tcp_time_stamp) > 0 || + (__s32)(tp->rcv_tsecr - tp->syn_stamp) < 0) { + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: synsent reject.\n")); + NET_INC_STATS_BH(PAWSActiveRejected); + return 1; + } + } + /* Now ACK is acceptable. * * "If the RST bit is set @@ -2689,18 +3092,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * because tcp_ack check is too weak for SYN-SENT) * causes moving socket to invalid semi-SYN-SENT, * semi-ESTABLISHED state and connection hangs. - * - * There exist buggy stacks, which really send - * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) - * Actually, if this host did not try to get something - * from ftp.inr.ac.ru I'd never find this bug 8) - * * --ANK (990514) * - * I was wrong, I apologize. Bare ACK is valid. + * Bare ACK is valid, however. * Actually, RFC793 requires to send such ACK * in reply to any out of window packet. - * It is wrong, but Linux also does it sometimes. + * It is wrong, but Linux also send such + * useless ACKs sometimes. * --ANK (990724) */ @@ -2717,7 +3115,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; tp->fin_seq = TCP_SKB_CB(skb)->seq; @@ -2742,26 +3140,35 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(sk); tcp_init_metrics(sk); + if (sk->keepopen) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + tp->copied_seq = tp->rcv_nxt; + __tcp_fast_path_on(tp, tp->snd_wnd); + + if(!sk->dead) { + sk->state_change(sk); + sock_wake_async(sk->socket, 0, POLL_OUT); + } + if (tp->write_pending) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * - * How to make this correctly? + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK */ - tp->delayed_acks++; - if (tp->ato == 0) - tp->ato = tp->rto; - tcp_send_delayed_ack(sk, tp->rto); + tp->ack.pending = 1; + tp->ack.lrcvtime = tcp_time_stamp; + tcp_enter_quickack_mode(tp); + tp->ack.pingpong = 1; + tp->ack.ato = TCP_ATO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); + goto discard; } else { tcp_send_ack(sk); } - - tp->copied_seq = tp->rcv_nxt; - - if(!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket, 0, POLL_OUT); - } return -1; } @@ -2777,6 +3184,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } + /* PAWS check. */ + if (tp->ts_recent_stamp && tp->saw_tstamp && tcp_paws_check(tp, 0)) + goto discard; + if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. @@ -2800,8 +3211,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. 
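The SYN-SENT path above now sanity-checks the echoed timestamp: a zero echo is treated as the known linux-2.1/early-2.2 bug, while an echo newer than our clock or older than the stamp carried in our SYN is rejected. A minimal restatement of that acceptance test follows; the *_example parameters stand in for the kernel's tcp_time_stamp and tp->syn_stamp.

/* Non-zero means the echoed timestamp is acceptable in SYN-SENT. */
static int tsecr_acceptable_example(unsigned int rcv_tsecr,
				    unsigned int syn_stamp_example,
				    unsigned int tcp_time_stamp_example)
{
	if (rcv_tsecr == 0)
		return 1;	/* buggy peers echo 0: treat as "no usable echo" */

	/* Signed differences cope with 32-bit wrap-around. */
	if ((int)(rcv_tsecr - tcp_time_stamp_example) > 0)
		return 0;	/* echo lies in the future */
	if ((int)(rcv_tsecr - syn_stamp_example) < 0)
		return 0;	/* echo predates the timestamp sent in our SYN */
	return 1;
}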
*/ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -2960,6 +3372,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, #endif ) { if (!th->rst) { + NET_INC_STATS_BH(DelayedACKLost); + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); } goto discard; @@ -3011,28 +3425,29 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->copied_seq = tp->rcv_nxt; /* Note, that this wakeup is only for marginal - crossed SYN case. Passively open sockets - are not waked up, because sk->sleep == NULL - and sk->socket == NULL. + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sleep == NULL + * and sk->socket == NULL. */ - if (!sk->dead && sk->sleep) { - wake_up_interruptible(sk->sleep); + if (!sk->dead) { + sk->state_change(sk); sock_wake_async(sk->socket,0,POLL_OUT); } tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = htons(th->window) << tp->snd_wscale; + tp->snd_wnd = ntohs(th->window) << tp->snd_wscale; tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; /* tcp_ack considers this ACK as duplicate - * and does not calculate rtt. It is wrong. + * and does not calculate rtt. * Fix it at least with timestamps. */ if (tp->saw_tstamp && !tp->srtt) tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED); tcp_init_metrics(sk); + tcp_fast_path_on(tp); } else { SOCK_DEBUG(sk, "bad ack\n"); return 1; @@ -3041,26 +3456,50 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { - sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); - if (!sk->dead) - sk->state_change(sk); - else - tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); + sk->shutdown |= SEND_SHUTDOWN; dst_confirm(sk->dst_cache); + + if (!sk->dead) { + /* Wake up lingering close() */ + sk->state_change(sk); + } else { + int tmo; + + if (tp->linger2 < 0 || + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + tcp_done(sk); + return 1; + } + + tmo = tcp_fin_time(tp); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sk->lock.users) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + tcp_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } } break; - case TCP_CLOSING: + case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); goto discard; } break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { - tcp_set_state(sk,TCP_CLOSE); tcp_update_metrics(sk); tcp_done(sk); goto discard; @@ -3080,27 +3519,22 @@ step6: case TCP_CLOSING: if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq)) break; - case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. 
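For an orphaned socket reaching FIN_WAIT2 above, the patch chooses between keeping the full socket alive on a timer, handing it to the lightweight FIN_WAIT2 time-wait machinery, or dropping it outright when linger2 is negative (it also drops immediately if data arrived past rcv_nxt, which the condensed sketch below leaves out). The constants and helper name here are assumed stand-ins, not the kernel's.

#define TCP_TIMEWAIT_LEN_EXAMPLE (60 * 100)	/* 60 s in ticks at an assumed HZ=100 */

enum fin_wait2_action { DROP_NOW, ARM_TIMER, ENTER_TIMEWAIT };

static enum fin_wait2_action
fin_wait2_decide_example(int linger2, int tmo, int fin_seen, int lock_users)
{
	if (linger2 < 0)
		return DROP_NOW;		/* application opted out of lingering  */
	if (tmo > TCP_TIMEWAIT_LEN_EXAMPLE)
		return ARM_TIMER;		/* sleep off the excess, then re-check */
	if (fin_seen || lock_users)
		return ARM_TIMER;		/* don't risk losing a FIN in flight   */
	return ENTER_TIMEWAIT;			/* cheap FIN_WAIT2 time-wait bucket    */
}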
 		 */
-		if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
+		if (sk->shutdown & RCV_SHUTDOWN) {
 			if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
 				tcp_reset(sk);
 				return 1;
 			}
 		}
-
+		/* Fall through */
 	case TCP_ESTABLISHED:
-		queued = tcp_data(skb, sk, len);
-
-		/* This must be after tcp_data() does the skb_pull() to
-		 * remove the header size from skb->len.
-		 */
-		tcp_measure_rcv_mss(sk, skb);
+		tcp_data(skb, sk, len);
+		queued = 1;
 		break;
 	}
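The half-close and fall-through handling above leans on the stack's modular sequence-number comparisons (before()/after()). For readers following the arithmetic, a self-contained restatement with hypothetical *_example names:

/* True if seq1 is strictly earlier than seq2, modulo 2^32. */
static inline int before_example(unsigned int seq1, unsigned int seq2)
{
	return (int)(seq1 - seq2) < 0;
}

static inline int after_example(unsigned int seq1, unsigned int seq2)
{
	return before_example(seq2, seq1);
}

/* "New data arrived beyond what will ever be read" is then
 *	after_example(end_seq - fin_flag, rcv_nxt)
 * which is exactly the condition that triggers the reset once
 * RCV_SHUTDOWN is set in the hunk above.
 */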