author | Ralf Baechle <ralf@linux-mips.org> | 1997-04-29 21:13:14 +0000 |
---|---|---|
committer | <ralf@linux-mips.org> | 1997-04-29 21:13:14 +0000 |
commit | 19c9bba94152148523ba0f7ef7cffe3d45656b11 (patch) | |
tree | 40b1cb534496a7f1ca0f5c314a523c69f1fee464 /net/ipv4/tcp_input.c | |
parent | 7206675c40394c78a90e74812bbdbf8cf3cca1be (diff) | |
Import of Linux/MIPS 2.1.36
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 1706 |
1 files changed, 818 insertions, 888 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 076568961..ab2b1ef82 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: @(#)tcp_input.c 1.0.16 05/25/93 + * Version: $Id: tcp_input.c,v 1.50 1997/04/22 02:53:12 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -21,12 +21,6 @@ */ /* - * TODO - * - A better sock cache - * - */ - -/* * Changes: * Pedro Roque : Fast Retransmit/Recovery. * Two receive queues. @@ -42,14 +36,16 @@ * Eric Schenk : Yet another double ACK bug. * Eric Schenk : Delayed ACK bug fixes. * Eric Schenk : Floyd style fast retrans war avoidance. + * David S. Miller : Don't allow zero congestion window. + * Eric Schenk : Fix retransmitter so that it sends + * next packet on ack of previous packet. */ #include <linux/config.h> #include <linux/mm.h> #include <linux/sysctl.h> #include <net/tcp.h> - - +#include <linux/ipsec.h> typedef void (*tcp_sys_cong_ctl_t)(struct sock *sk, u32 seq, u32 ack, @@ -61,6 +57,12 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt); int sysctl_tcp_cong_avoidance = 0; +int sysctl_tcp_hoe_retransmits = 0; +int sysctl_tcp_sack = 0; +int sysctl_tcp_tsack = 0; +int sysctl_tcp_timestamps = 0; +int sysctl_tcp_window_scaling = 0; + static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; @@ -76,9 +78,7 @@ static void tcp_delack_estimator(struct tcp_opt *tp) { int m; - /* - * Delayed ACK time estimator. - */ + /* Delayed ACK time estimator. */ m = jiffies - tp->lrcvtime; @@ -87,12 +87,10 @@ static void tcp_delack_estimator(struct tcp_opt *tp) if (m < 0) return; - /* - * if the mesured value is bigger than + /* if the mesured value is bigger than * twice the round trip time ignore it. */ - if ((m << 2) <= tp->srtt) - { + if ((m << 2) <= tp->srtt) { m -= (tp->iat >> 3); tp->iat += m; @@ -106,18 +104,21 @@ static void tcp_delack_estimator(struct tcp_opt *tp) if (tp->ato < HZ/50) tp->ato = HZ/50; - } - else + } else tp->ato = 0; } -/* - * Called on frames that were known _not_ to have been - * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. - * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. +/* Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. + * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. -- erics */ -extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) +static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) { long m; /* @@ -126,8 +127,7 @@ extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". - */ - /* + * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev */ @@ -144,55 +144,94 @@ extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) m -= (tp->mdev >> 2); /* similar update on mdev */ tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ } else { - /* no previous measure. */ + /* no previous measure. 
*/ tp->srtt = m<<3; /* take the measured time to be rtt */ tp->mdev = m<<2; /* make sure rto = 3*rtt */ } +} +/* Calculate rto without backoff. This is the second half of Van Jacobsons + * routine refered to above. + */ - /* - * Now update timeout. Note that this removes any backoff. - */ - +static __inline__ void tcp_set_rto(struct tcp_opt *tp) +{ tp->rto = (tp->srtt >> 3) + tp->mdev; + tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); +} + + +/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound + * on packet lifetime in the internet. We need the HZ/5 lower + * bound to behave correctly against BSD stacks with a fixed + * delayed ack. + * FIXME: It's not entirely clear this lower bound is the best + * way to avoid the problem. Is it possible to drop the lower + * bound and still avoid trouble with BSD stacks? Perhaps + * some modification to the RTO calculation that takes delayed + * ack bais into account? This needs serious thought. -- erics + */ +static __inline__ void tcp_bound_rto(struct tcp_opt *tp) +{ if (tp->rto > 120*HZ) tp->rto = 120*HZ; - - /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ if (tp->rto < HZ/5) tp->rto = HZ/5; +} + +/* WARNING: this must not be called if tp->saw_timestamp was false. */ - tp->backoff = 0; +extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq) +{ + /* From draft-ietf-tcplw-high-performance: the correct + * test is last_ack_sent <= end_seq. + * (RFC1323 stated last_ack_sent < end_seq.) + */ + if (!before(end_seq,tp->last_ack_sent)) { + tp->ts_recent = tp->rcv_tsval; + /* FIXME: need a corse timestamp. Days uptime + * would be good. + */ + tp->ts_recent_stamp = jiffies; + } +} + +extern __inline__ int tcp_paws_discard(struct tcp_opt *tp) +{ + /* FIXME: must check that ts_recent is not + * more than 24 days old here. Yuck. + */ + return (tp->rcv_tsval-tp->ts_recent < 0); +} + + +static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) +{ + u32 end_window = tp->rcv_wup + tp->rcv_wnd; + + if (tp->rcv_wnd) { + if (!before(seq, tp->rcv_nxt) && before(seq, end_window)) + return 1; + + if ((end_seq - seq) && after(end_seq, tp->rcv_nxt) && + !after(end_seq, end_window)) + return 1; + } + + return 0; } - /* * This functions checks to see if the tcp header is actually acceptable. */ -extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 seg_nxt) +extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { - u32 end_window = tp->rcv_wup + tp->rcv_wnd; - u32 end_seq = seg_nxt; - - /* - * When the window is open (most common case) - * we want to accept segments if they have yet unseen data - * or in the case of a dataless segment if seg.seq == rcv.nxt - * this means: - * - * if (seq == end_seq) - * end_seq >= rcv.nxt - * else - * end_seq > rcv.nxt - */ - - if (seq == end_seq) - end_seq++; + if (seq == tp->rcv_nxt) + return (tp->rcv_wnd || (end_seq == seq)); - return ((before(seq, end_window) && after(end_seq, tp->rcv_nxt)) || - (seq == end_window && seq == end_seq)); + return __tcp_sequence(tp, seq, end_seq); } /* @@ -203,9 +242,8 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 seg_nxt) static int tcp_reset(struct sock *sk, struct sk_buff *skb) { sk->zapped = 1; - /* - * We want the right error as BSD sees it (and indeed as we do). - */ + + /* We want the right error as BSD sees it (and indeed as we do). 
*/ switch (sk->state) { case TCP_TIME_WAIT: break; @@ -217,7 +255,7 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) break; default: sk->err = ECONNRESET; - } + }; #ifdef CONFIG_TCP_RFC1337 /* * Time wait assassination protection [RFC1337] @@ -227,8 +265,7 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) * Ian Heavens has since shown this is an inadequate fix for the protocol * bug in question. */ - if(sk->state!=TCP_TIME_WAIT) - { + if(sk->state!=TCP_TIME_WAIT) { tcp_set_state(sk,TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; } @@ -242,34 +279,30 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) return(0); } - /* - * Look for tcp options. Parses everything but only knows about MSS. - * This routine is always called with the packet containing the SYN. - * However it may also be called with the ack to the SYN. So you - * can't assume this is always the SYN. It's always called after - * we have set up sk->mtu to our own MTU. - * - * We need at minimum to add PAWS support here. Possibly large windows - * as Linux gets deployed on 100Mb/sec networks. + * Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. + * FIXME: surely this can be more efficient. -- erics */ -int tcp_parse_options(struct tcphdr *th) +void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); - int mss = 0; - + ptr = (unsigned char *)(th + 1); - - while(length>0) - { + tp->sacks = 0; + tp->saw_tstamp = 0; + + while(length>0) { int opcode=*ptr++; int opsize=*ptr++; - switch(opcode) - { + if (length - opsize < 0) /* Don't parse partial options */ + break; + switch(opcode) { case TCPOPT_EOL: - return 0; + return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; ptr--; /* the opsize=*ptr++ above was a mistake */ @@ -277,25 +310,86 @@ int tcp_parse_options(struct tcphdr *th) default: if(opsize<=2) /* Avoid silly options looping forever */ - return 0; - switch(opcode) - { + return; + switch(opcode) { case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn) - { - mss = ntohs(*(unsigned short *)ptr); - } + if(opsize==TCPOLEN_MSS && th->syn) { + tp->in_mss = ntohs(*(__u16 *)ptr); + if (tp->in_mss == 0) + tp->in_mss = 536; + } break; - /* Add other options here as people feel the urge to implement stuff like large windows */ + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn) + if (sysctl_tcp_window_scaling) + tp->snd_wscale = *(__u8 *)ptr; + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn) + if (sysctl_tcp_sack) + tp->sack_ok = 1; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + /* Cheaper to set again then to + * test syn. Optimize this? + */ + if (sysctl_tcp_timestamps) + tp->tstamp_ok = 1; + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } + break; + case TCPOPT_SACK: + tp->sacks = (opsize-2)>>3; + if (tp->sacks<<3 == opsize-2) { + int i; + for (i = 0; i < tp->sacks; i++) { + tp->left_sack[i] = ntohl(((__u32 *)ptr)[2*i]); + tp->right_sack[i] = ntohl(((__u32 *)ptr)[2*i+1]); + } + } else + tp->sacks = 0; } ptr+=opsize-2; length-=opsize; - } + }; } +} - return mss; +/* Fast parse options. This hopes to only see timestamps. + * If it is wrong it falls back on tcp_parse_option(). + * This should probably get extended for timestamps + SACK as well. + * Assembly code anyone? 
-- erics + */ +static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp) +{ + if (tp->tcp_header_len == sizeof(struct tcphdr)) + return 0; + if (th->doff == sizeof(struct tcphdr)>>2) { + tp->saw_tstamp = 0; + tp->sacks = 0; + return 0; + } else if (th->doff == (sizeof(struct tcphdr)>>2)+3) { + __u32 *ptr = (__u32 *)(th + 1); + if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->saw_tstamp = 1; + tp->sacks = 0; + tp->rcv_tsval = ntohl(*++ptr); + tp->rcv_tsecr = ntohl(*++ptr); + return 1; + } + } + tcp_parse_options(th,tp); + return 1; } +#if 0 + +/* + * This is the old fast retransmit code. It will go away eventually. -- erics + */ /* * See draft-stevens-tcpca-spec-01 for documentation. @@ -305,6 +399,15 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) { struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + /* FIXME: if we are already retransmitting should this code + * be skipped? [Floyd high_seq check sort of does this] + * The case I'm worried about is falling into a fast + * retransmit on a link with a congestion window of 1 or 2. + * There was some evidence in 2.0.x that this was problem + * on really slow links (1200 or 2400 baud). I need to + * try this situation again and see what happens. + */ + /* * An ACK is a duplicate if: * (1) it has the same sequence number as the largest number we've @@ -316,55 +419,169 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * The packet acked data after high_seq; */ - if (ack == tp->snd_una && sk->packets_out && (not_dup == 0) && - after(ack, tp->high_seq)) - { - - sk->dup_acks++; - - - /* - * 1. When the third duplicate ack is received, set ssthresh - * to one half the current congestion window, but no less - * than two segments. Retransmit the missing segment. + if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { + /* 1. When the third duplicate ack is received, set ssthresh + * to one half the current congestion window, but no less + * than two segments. Retransmit the missing segment. */ - - if (sk->dup_acks == 3) - { - sk->ssthresh = max(sk->cong_window >> 1, 2); - sk->cong_window = sk->ssthresh + 3; - tcp_do_retransmit(sk, 0); + if (tp->high_seq == 0 || after(ack, tp->high_seq)) { + tp->dup_acks++; + + if (tp->dup_acks == 3) { + tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); + tp->snd_cwnd = tp->snd_ssthresh + 3; + tcp_do_retransmit(sk, 0); + + /* Careful not to timeout just after fast + * retransmit! + */ + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } } - /* - * 2. Each time another duplicate ACK arrives, increment - * cwnd by the segment size. [...] Transmit a packet... + /* 2. Each time another duplicate ACK arrives, increment + * cwnd by the segment size. [...] Transmit a packet... * - * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode" + * Packet transmission will be done on normal flow processing + * since we're not in "retransmit mode". */ - - if (sk->dup_acks > 3) - { - sk->cong_window++; + if (tp->dup_acks >= 3) { + tp->dup_acks++; + tp->snd_cwnd++; + } + } else { + /* 3. When the next ACK arrives that acknowledges new data, + * set cwnd to ssthresh. + */ + if (tp->dup_acks >= 3) { + tp->retrans_head = NULL; + tp->snd_cwnd = max(tp->snd_ssthresh, 1); + tp->retransmits = 0; } + tp->dup_acks = 0; + + /* FIXME: This is wrong if the new ack that arrives + * is below the value for high_seq. + */ + tp->high_seq = 0; } - else - { - /* - * 3. 
When the next ACK arrives that acknowledges new data, - * set cwnd to ssthresh +} +#endif + +#define FLAG_DATA 0x01 +#define FLAG_WIN_UPDATE 0x02 +#define FLAG_DATA_ACKED 0x04 + +static __inline__ void clear_fast_retransmit(struct sock *sk) { + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + if (tp->dup_acks > 3) { + tp->retrans_head = NULL; + tp->snd_cwnd = max(tp->snd_ssthresh, 1); + } + tp->dup_acks = 0; +} + +/* + * NOTE: This code assumes that tp->dup_acks gets cleared when a + * retransmit timer fires. + */ + +static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) +{ + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + /* + * Note: If not_dup is set this implies we got a + * data carrying packet or a window update. + * This carries no new information about possible + * lost packets, so we have to ignore it for the purposes + * of counting duplicate acks. Ideally this does not imply we + * should stop our fast retransmit phase, more acks may come + * later without data to help us. Unfortunately this would make + * the code below much more complex. For now if I see such + * a packet I clear the fast retransmit phase. + */ + + if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { + /* This is the standard reno style fast retransmit branch. */ + + /* 1. When the third duplicate ack is received, set ssthresh + * to one half the current congestion window, but no less + * than two segments. Retransmit the missing segment. + */ + if (tp->high_seq == 0 || after(ack, tp->high_seq)) { + tp->dup_acks++; + if (tp->dup_acks == 3) { + tp->dup_acks++; + tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); + tp->snd_cwnd = tp->snd_ssthresh + 3; + tp->high_seq = tp->snd_nxt; + tcp_do_retransmit(sk, 0); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } + } + + /* 2. Each time another duplicate ACK arrives, increment + * cwnd by the segment size. [...] Transmit a packet... + * + * Packet transmission will be done on normal flow processing + * since we're not in "retransmit mode" + */ + if (tp->dup_acks > 3) + tp->snd_cwnd++; + } else if (tp->high_seq != 0) { + /* In this branch we deal with clearing the Floyd style + * block on duplicate fast retransmits, and if requested + * we do Hoe style secondary fast retransmits. */ + if (!before(ack,tp->high_seq) || (not_dup&FLAG_DATA) != 0) { + /* Once we have acked all the packets up to high_seq + * we are done this fast retransmit phase. + * Alternatively data arrived. In this case we + * Have to abort the fast retransmit attempt. + * Note that we do want to accept a window + * update since this is expected with Hoe's algorithm. + */ + clear_fast_retransmit(sk); - if (sk->dup_acks >= 3) - { - sk->tp_pinfo.af_tcp.retrans_head = NULL; - sk->cong_window = sk->ssthresh; - sk->retransmits = 0; + /* After we have cleared up to high_seq we can + * clear the Floyd style block. + */ + if (after(ack,tp->high_seq)) + tp->high_seq = 0; + } else if (tp->dup_acks >= 3) { + if (sysctl_tcp_hoe_retransmits) { + /* Hoe Style. We didn't ack the whole + * window. Take this as a cue that + * another packet was lost and retransmit it. + * Don't muck with the congestion window here. + * Note that we have to be careful not to + * act if this was a window update and it + * didn't ack new data, since this does + * not indicate a packet left the system. + * We can test this by just checking + * if ack changed from snd_una, since + * the only way to get here without changing + * advancing from snd_una is if this was a + * window update. 
+ */ + if (ack != tp->snd_una && before(ack,tp->high_seq)) { + tcp_do_retransmit(sk, 0); + tcp_reset_xmit_timer(sk, TIME_RETRANS, + tp->rto); + } + } else { + /* Reno style. We didn't ack the whole + * window, now we have to drop out of + * fast retransmit and wait for a timeout. + */ + clear_fast_retransmit(sk); + } } - sk->dup_acks = 0; + } else { + /* Clear any aborted fast retransmit starts. */ + tp->dup_acks = 0; } - } /* @@ -379,148 +596,114 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) { - /* - * From: + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned int actual, expected; + unsigned int inv_rtt, inv_basertt, inv_basebd; + u32 snt_bytes; + + /* From: * TCP Vegas: New Techniques for Congestion * Detection and Avoidance. - * * * Warning: This code is a scratch implementation taken * from the paper only. The code they distribute seams * to have improved several things over the initial spec. */ - struct tcp_opt * tp; - unsigned int Actual, Expected; - unsigned int inv_rtt, inv_basertt; - u32 snt_bytes; - - - tp = &(sk->tp_pinfo.af_tcp); - if (!seq_rtt) seq_rtt = 1; - + if (tp->basertt) tp->basertt = min(seq_rtt, tp->basertt); else tp->basertt = seq_rtt; - /* - * - * Actual = throughput for this segment. - * Expected = number_of_bytes in transit / BaseRTT - * + /* actual = throughput for this segment. + * expected = number_of_bytes in transit / BaseRTT */ - snt_bytes = (ack - seq) << SHIFT_FACTOR; - inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt; - - Actual = snt_bytes * inv_rtt; + snt_bytes = ack - seq; + inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt; inv_basertt = (1 << SHIFT_FACTOR) / tp->basertt; - Expected = ((tp->snd_nxt - tp->snd_una) << SHIFT_FACTOR) * inv_basertt; - /* - * Slow Start - */ - - if (sk->cong_window < sk->ssthresh && + actual = snt_bytes * inv_rtt; + + expected = (tp->snd_nxt - tp->snd_una) * inv_basertt; + + /* XXX sk->mss should move into tcp_opt as well -DaveM */ + inv_basebd = sk->mss * inv_basertt; + + /* Slow Start */ + if (tp->snd_cwnd < tp->snd_ssthresh && (seq == tp->snd_nxt || - (((Expected - Actual) <= - ((TCP_VEGAS_GAMMA << SHIFT_FACTOR) * sk->mss * inv_basertt)) - ) - )) - { - /* - * "Vegas allows exponential growth only every other - * RTT" - */ - - if (!(sk->cong_count++)) - { - sk->cong_window++; - sk->cong_count = 0; + (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd))) { + /* "Vegas allows exponential growth only every other RTT" */ + if (tp->snd_cwnd_cnt++) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; } - } - else - { - /* - * Congestion Avoidance - */ - - if (Expected - Actual <= - ((TCP_VEGAS_ALPHA << SHIFT_FACTOR) * sk->mss * inv_basertt)) - { + } else { + /* Congestion Avoidance */ + if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd) { /* Increase Linearly */ - - if (sk->cong_count++ >= sk->cong_window) - { - sk->cong_window++; - sk->cong_count = 0; + if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; } } - - if (Expected - Actual >= - ((TCP_VEGAS_BETA << SHIFT_FACTOR) * sk->mss * inv_basertt)) - { + + if (expected - actual >= TCP_VEGAS_BETA * inv_basebd) { /* Decrease Linearly */ - - if (sk->cong_count++ >= sk->cong_window) - { - sk->cong_window--; - sk->cong_count = 0; + if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { + tp->snd_cwnd--; + tp->snd_cwnd_cnt = 0; } - - /* Never less than 2 segments */ - if (sk->cong_window < 2) - sk->cong_window = 2; + + /* Never less than 2 segments. 
*/ + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; } } } static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * This is Jacobson's slow start and congestion avoidance. + /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. Because we keep cong_window in * integral mss's, we can't do cwnd += 1 / cwnd. * Instead, maintain a counter and increment it once every * cwnd times. + * FIXME: Check to be sure the mathematics works out right + * on this trick when we have to reduce the congestion window. + * The snd_cwnd_cnt has to be reset properly when reduction events + * happen. + * FIXME: What happens when the congestion window gets larger + * than the maximum receiver window by some large factor + * Suppose the pipeline never looses packets for a long + * period of time, then traffic increases causing packet loss. + * The congestion window should be reduced, but what it should + * be reduced to is not clear, since 1/2 the old window may + * still be larger than the maximum sending rate we ever achieved. */ - - if (sk->cong_window <= sk->ssthresh) - { - /* - * In "safe" area, increase - */ - - sk->cong_window++; - } - else - { - /* - * In dangerous area, increase slowly. - * In theory this is - * sk->cong_window += 1 / sk->cong_window + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. In theory this is + * tp->snd_cwnd += 1 / tp->snd_cwnd */ - - if (sk->cong_count >= sk->cong_window) { - - sk->cong_window++; - sk->cong_count = 0; - } - else - sk->cong_count++; + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; } } -#define FLAG_DATA 0x01 -#define FLAG_WIN_UPDATE 0x02 -#define FLAG_DATA_ACKED 0x04 - static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, __u32 *seq_rtt) { @@ -528,52 +711,46 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, struct sk_buff *skb; unsigned long now = jiffies; int acked = 0; - - while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) - { + while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { #ifdef TCP_DEBUG /* Check for a bug. */ - if (skb->next != (struct sk_buff*) &sk->write_queue && - after(skb->end_seq, skb->next->seq)) - printk("INET: tcp_input.c: *** " + after(skb->end_seq, skb->next->seq)) + printk(KERN_DEBUG "INET: tcp_input.c: *** " "bug send_list out of order.\n"); #endif - /* - * If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived the - * other end. + /* If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived the + * other end. */ - if (after(skb->end_seq, ack)) break; - - if (sk->debug) - { - printk(KERN_DEBUG "removing seg %x-%x from " - "retransmit queue\n", skb->seq, skb->end_seq); - } - + + SOCK_DEBUG(sk, "removing seg %x-%x from retransmit queue\n", + skb->seq, skb->end_seq); + acked = FLAG_DATA_ACKED; - atomic_dec(&sk->packets_out); + /* FIXME: packet counting may break if we have to + * do packet "repackaging" for stacks that don't + * like overlapping packets. 
+ */ + tp->packets_out--; *seq = skb->seq; *seq_rtt = now - skb->when; - - skb_unlink(skb); - skb->free = 1; + + skb_unlink(skb); kfree_skb(skb, FREE_WRITE); } - if (acked && !sk->dead) - { + if (acked) { tp->retrans_head = NULL; - sk->write_space(sk); + if (!sk->dead) + sk->write_space(sk); } - return acked; } @@ -581,27 +758,18 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * Our probe was answered - */ + /* Our probe was answered. */ tp->probes_out = 0; - /* - * Was it a usable window open ? - */ + /* Was it a usable window open? */ /* should always be non-null */ if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, tp->send_head->end_seq)) - { + !before (ack + tp->snd_wnd, tp->send_head->end_seq)) { tp->backoff = 0; tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - - } - else - { + } else { tcp_reset_xmit_timer(sk, TIME_PROBE0, min(tp->rto << tp->backoff, 120*HZ)); } @@ -614,192 +782,164 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack_seq, u32 ack, int len) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int flag = 0; u32 seq = 0; u32 seq_rtt = 0; struct sk_buff *skb; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if(sk->zapped) return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - { + if (tp->pending == TIME_KEEPOPEN) tp->probes_out = 0; - } tp->rcv_tstamp = jiffies; - - /* - * If the ack is newer than sent or older than previous acks - * then we can probably ignore it. + + /* If the ack is newer than sent or older than previous acks + * then we can probably ignore it. */ - if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) goto uninteresting_ack; - /* - * If there is data set flag 1 - */ - - if (len != th->doff*4) - { + /* If there is data set flag 1 */ + if (len != th->doff*4) { flag |= FLAG_DATA; tcp_delack_estimator(tp); } - /* - * Update our send window - */ + /* Update our send window. */ - /* - * This is the window update code as per RFC 793 - * snd_wl{1,2} are used to prevent unordered - * segments from shrinking the window + /* This is the window update code as per RFC 793 + * snd_wl{1,2} are used to prevent unordered + * segments from shrinking the window */ + if (before(tp->snd_wl1, ack_seq) || + (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { + unsigned long nwin = ntohs(th->window); - if ((tp->snd_wl1 == 0) || before(tp->snd_wl1, ack_seq) || - (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) - { - tp->snd_wnd = ntohs(th->window); - tp->snd_wl1 = ack_seq; - tp->snd_wl2 = ack; + if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { + flag |= FLAG_WIN_UPDATE; + tp->snd_wnd = nwin; - flag |= FLAG_WIN_UPDATE; + tp->snd_wl1 = ack_seq; + tp->snd_wl2 = ack; - if (tp->snd_wnd > sk->max_window) - { - sk->max_window = tp->snd_wnd; + if (nwin > tp->max_window) + tp->max_window = nwin; } } - - /* - * We passed data and got it acked, remove any soft error - * log. Something worked... + /* We passed data and got it acked, remove any soft error + * log. Something worked... */ - sk->err_soft = 0; - /* - * If this ack opens up a zero window, clear backoff. It was - * being used to time the probes, and is probably far higher than - * it needs to be for normal retransmission. + /* If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. 
*/ - if (tp->pending == TIME_PROBE0) - { tcp_ack_probe(sk, ack); - } - - /* - * See if we can take anything off of the retransmit queue. - */ + /* See if we can take anything off of the retransmit queue. */ if (tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt)) flag |= FLAG_DATA_ACKED; - - /* - * if we where retransmiting don't count rtt estimate - */ - - if (sk->retransmits) - { - if (sk->packets_out == 0) - sk->retransmits = 0; - } - else - { - /* - * Note that we only reset backoff and rto in the - * rtt recomputation code. And that doesn't happen - * if there were retransmissions in effect. So the - * first new packet after the retransmissions is - * sent with the backoff still in effect. Not until - * we get an ack from a non-retransmitted packet do - * we reset the backoff and rto. This allows us to deal - * with a situation where the network delay has increased - * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) + /* If we have a timestamp, we always do rtt estimates. */ + if (tp->saw_tstamp) { + /* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Superceeds RFC1323) */ - - if (flag & FLAG_DATA_ACKED) - { - tcp_rtt_estimator(tp, seq_rtt); - - (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + seq_rtt = (jiffies-tp->rcv_tsecr); + tcp_rtt_estimator(tp, seq_rtt); + if (tp->retransmits) { + if (tp->packets_out == 0) { + tp->retransmits = 0; + tp->backoff = 0; + tcp_set_rto(tp); + } else { + /* Still retransmitting, use backoff */ + tcp_set_rto(tp); + tp->rto = tp->rto << tp->backoff; + } + } else { + tcp_set_rto(tp); + if (flag && FLAG_DATA_ACKED) + (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + } + /* NOTE: safe here so long as cong_ctl doesn't use rto */ + tcp_bound_rto(tp); + } else { + /* If we were retransmiting don't count rtt estimate. */ + if (tp->retransmits) { + if (tp->packets_out == 0) + tp->retransmits = 0; + } else { + /* We don't have a timestamp. Can only use + * packets that are not retransmitted to determine + * rtt estimates. Also, we must not reset the + * backoff for rto until we get a non-retransmitted + * packet. This allows us to deal with a situation + * where the network delay has increased suddenly. + * I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + if (flag & FLAG_DATA_ACKED) { + tp->backoff = 0; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tcp_bound_rto(tp); + (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + } } } - -#ifdef TCP_DEBUG - - /* Sanity check out packets_out counter */ - if (skb_queue_len(&sk->write_queue) == 0 || - ack == tp->snd_nxt ) - { - if (sk->packets_out) - { - printk(KERN_DEBUG "tcp_ack: packets_out %d\n", - sk->packets_out); - sk->packets_out = 0; - } - } -#endif - - if (sk->packets_out) - { - if (flag & FLAG_DATA_ACKED) - { + if (tp->packets_out) { + if (flag & FLAG_DATA_ACKED) { long when; - + skb = skb_peek(&sk->write_queue); - when = tp->rto - (jiffies - skb->when); - - if (when <= 0) - { + + /* FIXME: This assumes that when we are retransmitting + * we should only ever respond with one packet. + * This means congestion windows should not grow + * during recovery. In 2.0.X we allow the congestion + * window to grow. It is not clear to me which + * decision is correct. The RFCs should be double + * checked as should the behavior of other stacks. + * Also note that if we do want to allow the + * congestion window to grow during retransmits + * we have to fix the call to congestion window + * updates so that it works during retransmission. 
+ */ + if (tp->retransmits) { tp->retrans_head = NULL; - /* - * This is tricky. We are retransmiting a + + /* This is tricky. We are retransmiting a * segment of a window when congestion occured. */ tcp_do_retransmit(sk, 0); - tcp_reset_xmit_timer(sk, TIME_RETRANS, - tp->rto); - } - else + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } else tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } - } - else + } else tcp_clear_xmit_timer(sk, TIME_RETRANS); - - - /* - * Remember the highest ack received. - */ - - tp->snd_una = ack; tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE))); + /* Remember the highest ack received. */ + tp->snd_una = ack; return 1; uninteresting_ack: - tcp_fast_retrans(sk, ack, 0); - - if(sk->debug) - printk("Ack ignored %u %u\n",ack,tp->snd_nxt); - + SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); return 0; } - /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence @@ -813,53 +953,46 @@ uninteresting_ack: * close and we go into CLOSING (and later onto TIME-WAIT) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. - * */ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - sk->fin_seq = skb->end_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* XXX This fin_seq thing should disappear... -DaveM */ + tp->fin_seq = skb->end_seq; tcp_send_ack(sk); - if (!sk->dead) - { + if (!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 1); } - switch(sk->state) - { + switch(sk->state) { case TCP_SYN_RECV: case TCP_SYN_SENT: case TCP_ESTABLISHED: - /* - * move to CLOSE_WAIT - */ - + /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - if (th->rst) sk->shutdown = SHUTDOWN_MASK; break; case TCP_CLOSE_WAIT: case TCP_CLOSING: - /* - * received a retransmission of the FIN, do + /* Received a retransmission of the FIN, do * nothing. */ break; case TCP_TIME_WAIT: - /* - * received a retransmission of the FIN, + /* Received a retransmission of the FIN, * restart the TIME_WAIT timer. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); return(0); case TCP_FIN_WAIT1: - /* - * This case occurs when a simultaneous close + /* This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. * @@ -869,202 +1002,128 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * FIN lost hang). The TIME_WRITE code is already * correct for handling this timeout. */ - tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: - /* - * received a FIN -- send ACK and enter TIME_WAIT - */ + /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - sk->shutdown|=SHUTDOWN_MASK; + sk->shutdown |= SHUTDOWN_MASK; tcp_set_state(sk,TCP_TIME_WAIT); break; case TCP_CLOSE: - /* - * already in CLOSE - */ + /* Already in CLOSE. */ break; default: + /* FIXME: Document whats happening in this case. -DaveM */ tcp_set_state(sk,TCP_LAST_ACK); /* Start the timers. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); return(0); - } + }; return(0); } - - - /* - * This one checks to see if we can put data from the - * out_of_order queue into the receive_queue - */ - -static void tcp_ofo_queue(struct sock *sk) +/* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. 
+ */ +static void tcp_ofo_queue(struct sock *sk) { - struct sk_buff * skb; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + /* FIXME: out_of_order_queue is a strong tcp_opt candidate... -DaveM */ while ((skb = skb_peek(&sk->out_of_order_queue))) { - if (after(skb->seq, tp->rcv_nxt)) break; if (!after(skb->end_seq, tp->rcv_nxt)) { - - if (sk->debug) - printk("ofo packet was allready received \n"); - + SOCK_DEBUG(sk, "ofo packet was allready received \n"); skb_unlink(skb); kfree_skb(skb, FREE_READ); - continue; } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); - if (sk->debug) - printk("ofo requeuing : rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); - skb_unlink(skb); - - skb_queue_tail(&sk->receive_queue, skb); - - tp->rcv_nxt = skb->end_seq; } } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct sk_buff * skb1; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + struct sk_buff *skb1; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * Queue data for delivery to the user - * Packets in sequence go to the receive queue - * Out of sequence packets to out_of_order_queue + /* Queue data for delivery to the user. + * Packets in sequence go to the receive queue. + * Out of sequence packets to out_of_order_queue. */ - - if (skb->seq == tp->rcv_nxt) { - - /* - * Ok. In sequence. - */ - - + /* Ok. In sequence. */ +queue_and_out: skb_queue_tail(&sk->receive_queue, skb); - - tp->rcv_nxt = skb->end_seq; - tcp_ofo_queue(sk); - if (skb_queue_len(&sk->out_of_order_queue) == 0) tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); - return; } - /* - * Not in sequence - * either a retransmit or some packet got lost - */ - + /* Not in sequence, either a retransmit or some packet got lost. */ if (!after(skb->end_seq, tp->rcv_nxt)) { - - /* - * A retransmit. - * 2nd most common case. - * force an imediate ack - */ + /* A retransmit, 2nd most common case. Force an imediate ack. */ + SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq); - if (sk->debug) - printk("retransmit received: seq %X\n", skb->seq); - - sk->delayed_acks = MAX_DELAY_ACK; + tp->delayed_acks = MAX_DELAY_ACK; kfree_skb(skb, FREE_READ); - return; } - if (before(skb->seq, tp->rcv_nxt)) { + /* Partial packet, seq < rcv_next < end_seq */ + SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); - /* - * Partial packet - * seq < rcv_next < end_seq - */ - - if (sk->debug) - printk("partial packet: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); - - skb_queue_tail(&sk->receive_queue, skb); - - - tp->rcv_nxt = skb->end_seq; - - tcp_ofo_queue(sk); - - if (skb_queue_len(&sk->out_of_order_queue) == 0) - tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); - - return; + goto queue_and_out; } - /* - * Ok. This is an out_of_order segment - */ - - /* Force an ack */ - - sk->delayed_acks = MAX_DELAY_ACK; - - /* - * disable header predition - */ + /* Ok. This is an out_of_order segment, force an ack. */ + tp->delayed_acks = MAX_DELAY_ACK; + /* Disable header predition. 
*/ tp->pred_flags = 0; - if (sk->debug) - printk("out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); if (skb_peek(&sk->out_of_order_queue) == NULL) { skb_queue_head(&sk->out_of_order_queue,skb); - } - else + } else { for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) { - - /* allready there */ - if (skb->seq==skb1->seq && skb->len>=skb1->len) - { - skb_append(skb1,skb); + /* Already there. */ + if (skb->seq == skb1->seq && skb->len >= skb1->len) { + skb_append(skb1, skb); skb_unlink(skb1); - kfree_skb(skb1,FREE_READ); + kfree_skb(skb1, FREE_READ); break; } - if (after(skb->seq, skb1->seq)) - { + if (after(skb->seq, skb1->seq)) { skb_append(skb1,skb); break; } - - /* - * See if we've hit the start. If so insert. - */ + + /* See if we've hit the start. If so insert. */ if (skb1 == skb_peek(&sk->out_of_order_queue)) { skb_queue_head(&sk->out_of_order_queue,skb); break; } } - + } } @@ -1077,53 +1136,36 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); th = skb->h.th; - skb_pull(skb,th->doff*4); - skb_trim(skb,len-(th->doff*4)); + skb_pull(skb, th->doff*4); + skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) - { return(0); - } - - /* - * FIXME: don't accept data after the receved fin - */ - /* - * The bytes in the receive read/assembly queue has increased. - * Needed for the low memory discard algorithm - */ - - sk->bytes_rcv += skb->len; - - /* - * We no longer have anyone receiving data on this connection. + /* FIXME: don't accept data after the received fin. + * + * Would checking snd_seq against fin_seq be enough? + * If so, how do we handle that case exactly? -DaveM */ + /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); - if (before(tp->rcv_nxt, sk->copied_seq)) - { - printk("*** tcp.c:tcp_data bug acked < copied\n"); + if (before(tp->rcv_nxt, sk->copied_seq)) { + printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = sk->copied_seq; } - sk->delayed_acks++; - + tp->delayed_acks++; - /* - * Now tell the user we may have some data. - */ - - if (!sk->dead) - { - if(sk->debug) - printk("Data wakeup.\n"); + /* Now tell the user we may have some data. */ + if (!sk->dead) { + SOCK_DEBUG(sk, "Data wakeup.\n"); sk->data_ready(sk,0); - } + } return(1); } @@ -1132,51 +1174,52 @@ static void tcp_data_snd_check(struct sock *sk) struct sk_buff *skb; struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); - if ((skb = tp->send_head)) - { + if ((skb = tp->send_head)) { if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && - sk->packets_out < sk->cong_window ) - { - /* - * Add more data to the send queue. - */ + tp->packets_out < tp->snd_cwnd ) { + /* Add more data to the send queue. */ + /* FIXME: the congestion window is checked + * again in tcp_write_xmit anyway?! -- erics + * + * I think it must, it bumps tp->packets_out for + * each packet it fires onto the wire. -DaveM + */ tcp_write_xmit(sk); - wake_up_interruptible(sk->sleep); - } - else if (sk->packets_out == 0 && !tp->pending) - { - /* - * Data to queue but no room. - */ + if(!sk->dead) + sk->write_space(sk); + } else if (tp->packets_out == 0 && !tp->pending) { + /* Data to queue but no room. 
*/ + + /* FIXME: Is it right to do a zero window probe into + * a congestion window limited window??? -- erics + */ tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + } } -} +} static __inline__ void tcp_ack_snd_check(struct sock *sk) { - /* - * This also takes care of updating the window. - * This if statement needs to be simplified. + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* This also takes care of updating the window. + * This if statement needs to be simplified. * - * rules for delaying an ack: + * Rules for delaying an ack: * - delay time <= 0.5 HZ * - we don't have a window update to send * - must send at least every 2 full sized packets */ - - if (sk->delayed_acks == 0) + if (tp->delayed_acks == 0) { + /* We sent a data segment already. */ return; + } - if (sk->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) - { + if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) tcp_send_ack(sk); - } - else - { - tcp_send_delayed_ack(sk, HZ/2); - } + else + tcp_send_delayed_ack(sk, HZ/2); } /* @@ -1198,62 +1241,49 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) ptr--; ptr += ntohl(th->seq); - /* ignore urgent data that we've already seen and read */ + /* Ignore urgent data that we've already seen and read. */ if (after(sk->copied_seq, ptr)) return; - /* do we already have a newer (or duplicate) urgent pointer? */ + /* Do we already have a newer (or duplicate) urgent pointer? */ if (sk->urg_data && !after(ptr, sk->urg_seq)) return; - /* tell the world about our new urgent pointer */ + /* Tell the world about our new urgent pointer. */ if (sk->proc != 0) { - if (sk->proc > 0) { + if (sk->proc > 0) kill_proc(sk->proc, SIGURG, 1); - } else { + else kill_pg(-sk->proc, SIGURG, 1); - } } - /* - * We may be adding urgent data when the last byte read was - * urgent. To do this requires some care. We cannot just ignore - * sk->copied_seq since we would read the last urgent byte again - * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) + + /* We may be adding urgent data when the last byte read was + * urgent. To do this requires some care. We cannot just ignore + * sk->copied_seq since we would read the last urgent byte again + * as data, nor can we alter copied_seq until this data arrives + * or we break the sematics of SIOCATMARK (and thus sockatmark()) */ if (sk->urg_seq == sk->copied_seq) sk->copied_seq++; /* Move the copied sequence on correctly */ sk->urg_data = URG_NOTYET; sk->urg_seq = ptr; - /* disable header prediction */ + /* Disable header prediction. */ tp->pred_flags = 0; } -/* - * This is the 'fast' part of urgent handling. - */ - +/* This is the 'fast' part of urgent handling. */ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) { - /* - * Check if we get a new urgent pointer - normally not - */ - + /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk,th); - /* - * Do we wait for any urgent data? - normally not - */ - + /* Do we wait for any urgent data? - normally not... */ if (sk->urg_data == URG_NOTYET) { - u32 ptr; + u32 ptr = sk->urg_seq - ntohl(th->seq) + (th->doff*4); - /* - * Is the urgent pointer pointing into this packet? - */ - ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4; + /* Is the urgent pointer pointing into this packet? 
*/ if (ptr < len) { sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) @@ -1262,26 +1292,19 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len } } - static void prune_queue(struct sock *sk) { struct sk_buff * skb; - /* - * clean the out_of_order queue - */ - + /* Clean the out_of_order queue. */ while ((skb = skb_dequeue(&sk->out_of_order_queue))) - { kfree_skb(skb, FREE_READ); - } } - -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, __u16 len) +int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, __u16 len) { - struct tcp_opt *tp; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; u32 flg; @@ -1301,6 +1324,23 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tp = &(sk->tp_pinfo.af_tcp); + + /* + * RFC1323: H1. Apply PAWS check first. + */ + if (tcp_fast_parse_options(th,tp)) { + if (tp->saw_tstamp) { + if (tcp_paws_discard(tp)) { + if (!th->rst) { + tcp_send_ack(sk); + kfree_skb(skb, FREE_READ); + return 0; + } + } + tcp_replace_ts_recent(tp,skb->end_seq); + } + } + flg = *(((u32 *)th) + 3); /* @@ -1311,127 +1351,88 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * space for instance) */ - if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) - { - if (len <= sizeof(struct tcphdr)) - { - if (len == sizeof(struct tcphdr)) - { + if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) { + if (len <= th->doff*4) { + /* Bulk data transfer: sender */ + if (len == th->doff*4) { tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + tcp_data_snd_check(sk); } - tcp_data_snd_check(sk); - kfree_skb(skb, FREE_READ); - return; - - } - else if (skb->ack_seq == tp->snd_una) - { - /* - * Bulk data transfer: receiver - */ + return 0; + } else if (skb->ack_seq == tp->snd_una) { + /* Bulk data transfer: receiver */ - skb_pull(skb,sizeof(struct tcphdr)); + skb_pull(skb,th->doff*4); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; - sk->bytes_rcv += len - sizeof(struct tcphdr); - + sk->data_ready(sk, 0); tcp_delack_estimator(tp); - if (sk->delayed_acks++) - { + if (tp->delayed_acks++ == 0) tcp_send_delayed_ack(sk, HZ/2); - } else tcp_send_ack(sk); - return; + return 0; } } - if (!tcp_sequence(tp, skb->seq, skb->end_seq)) - { - if (!th->rst) - { - if (after(skb->seq, tp->rcv_nxt)) - { - printk(KERN_DEBUG "->seq:%d end:%d " - "wup:%d wnd:%d\n", - skb->seq, skb->end_seq, - tp->rcv_wup, tp->rcv_wnd); + if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { + if (!th->rst) { + if (after(skb->seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", + skb->seq, skb->end_seq, + tp->rcv_wup, tp->rcv_wnd); } tcp_send_ack(sk); kfree_skb(skb, FREE_READ); - return; + return 0; } } - if(th->syn && skb->seq != sk->syn_seq) - { + if(th->syn && skb->seq != sk->syn_seq) { printk(KERN_DEBUG "syn in established state\n"); tcp_reset(sk, skb); kfree_skb(skb, FREE_READ); - return; + return 1; } - if(th->rst) - { + if(th->rst) { tcp_reset(sk,skb); kfree_skb(skb, FREE_READ); - return; + return 0; } if(th->ack) - { tcp_ack(sk, th, skb->seq, skb->ack_seq, len); - } - - /* - * Process urgent data - */ - + /* Process urgent data. 
*/ tcp_urg(sk, th, len); - /* - * step 7: process the segment text - */ - - + /* step 7: process the segment text */ queued = tcp_data(skb, sk, len); - /* - * step 8: check the FIN bit - */ - + /* step 8: check the FIN bit */ if (th->fin) - { tcp_fin(skb, sk, th); - } tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - /* - * If our receive queue has grown past its limits, - * try to prune away duplicates etc.. + /* If our receive queue has grown past its limits, + * try to prune away duplicates etc.. */ - if (sk->rmem_alloc > sk->rcvbuf) + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) prune_queue(sk); - /* - * And done - */ - - if (queued) - return; - - kfree_skb(skb, FREE_READ); + if (!queued) + kfree_skb(skb, FREE_READ); + return 0; } - /* * This function implements the receiving procedure of RFC 793. @@ -1444,49 +1445,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; - int rcv_mss; - - /* - * state == CLOSED - * tested in tcp_v{4,6}_rcv - */ + /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */ switch (sk->state) { - - case TCP_LISTEN: - if (th->rst) goto discard; - /* - * These use the socket TOS.. + /* These use the socket TOS.. * might want to be the received TOS */ - if(th->ack) - { - /* - * send reset - */ - - return 1; - } - + return 1; /* send reset */ - if(th->syn) - { - int err; - __u32 isn; - - isn = tp->af_specific->init_sequence(sk, skb); - err = tp->af_specific->conn_request(sk, skb, opt, isn); + if(th->syn) { + __u32 isn = tp->af_specific->init_sequence(sk, skb); - if (err < 0) + if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0) return 1; - /* - * Now we have several options: In theory there is + /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the * syn up to the [to be] advertised window and @@ -1498,7 +1476,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Now that TTCP is starting to be used we ought to * queue this data. */ - return 0; } @@ -1506,45 +1483,36 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_SYN_SENT: - - /* - * SYN sent means we have to look for a suitable ack and - * either reset for bad matches or go to connected. - * The SYN_SENT case is unusual and should - * not be in line code. [AC] + /* SYN sent means we have to look for a suitable ack and + * either reset for bad matches or go to connected. + * The SYN_SENT case is unusual and should + * not be in line code. [AC] */ - - if(th->ack) - { - /* We got an ack, but it's not a good ack */ - if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) - { + if(th->ack) { + tp->snd_wl1 = skb->seq; + + /* We got an ack, but it's not a good ack. */ + if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) { tcp_statistics.TcpAttemptFails++; return 1; } - if(th->rst) - { + if(th->rst) { tcp_reset(sk,skb); goto discard; } - if(!th->syn) - { - /* - * A valid ack from a different connection - * start. Shouldn't happen but cover it + if(!th->syn) { + /* A valid ack from a different connection + * start. Shouldn't happen but cover it. */ tcp_statistics.TcpAttemptFails++; return 1; } - /* - * Ok.. it's good. Set up sequence - * numbers and - * move to established. + /* Ok.. it's good. Set up sequence numbers and + * move to established. 
*/ - tp->rcv_nxt = skb->seq+1; tp->rcv_wnd = 0; tp->rcv_wup = skb->seq+1; @@ -1553,43 +1521,53 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; - sk->fin_seq = skb->seq; - tcp_send_ack(sk); + tp->fin_seq = skb->seq; tcp_set_state(sk, TCP_ESTABLISHED); - rcv_mss = tcp_parse_options(th); - - if (rcv_mss == 0) - { - rcv_mss = 536; + tcp_parse_options(th,tp); + /* FIXME: need to make room for SACK still */ + if (tp->tstamp_ok) { + tp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: Define constant! */ + sk->dummy_th.doff += 3; /* reserve space of options */ + } else + tp->tcp_header_len = sizeof(struct tcphdr); + if (tp->saw_tstamp) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = jiffies; } - sk->mss = min(sk->mss, rcv_mss); + /* Can't be earlier, doff would be wrong. */ + tcp_send_ack(sk); + + if (tp->in_mss) + sk->mss = min(sk->mss, tp->in_mss); + + /* Take out space for tcp options. */ + sk->mss -= tp->tcp_header_len - sizeof(struct tcphdr); sk->dummy_th.dest = th->source; sk->copied_seq = tp->rcv_nxt; - if(!sk->dead) - { + if(!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 0); } /* Drop through step 6 */ goto step6; - } - else - { - if(th->syn && !th->rst) - { - /* - * the previous version of the code + } else { + if(th->syn && !th->rst) { + /* The previous version of the code * checked for "connecting to self" * here. that check is done now in - * tcp_connect + * tcp_connect. */ - tcp_set_state(sk, TCP_SYN_RECV); + tcp_parse_options(th,tp); + if (tp->saw_tstamp) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = jiffies; + } tp->rcv_nxt = skb->seq + 1; tp->rcv_wup = skb->seq + 1; @@ -1605,8 +1583,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_TIME_WAIT: - /* - * RFC 1122: + /* RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: @@ -1619,14 +1596,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ - - if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) - { + if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) { __u32 isn; - int err; - atomic_sub(skb->truesize, &sk->rmem_alloc); - skb->sk = NULL; + skb_orphan(skb); sk->err = ECONNRESET; tcp_set_state(sk, TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; @@ -1635,56 +1608,58 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, sk = tp->af_specific->get_sock(skb, th); - if (sk == NULL) + if (sk == NULL || !ipsec_sk_policy(sk,skb)) goto discard; - skb->sk = sk; + skb_set_owner_r(skb, sk); tp = &sk->tp_pinfo.af_tcp; - atomic_add(skb->truesize, &sk->rmem_alloc); - - err = tp->af_specific->conn_request(sk, skb, opt, isn); - if (err < 0) + if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0) return 1; - return 0; } break; - } - /* - * step 1: check sequence number + /* Parse the tcp_options present on this header. + * By this point we really only expect timestamps and SACKs. + * Note that this really has to be here and not later for PAWS + * (RFC1323) to work. */ + if (tcp_fast_parse_options(th,tp)) { + /* NOTE: assumes saw_tstamp is never set if we didn't + * negotiate the option. tcp_fast_parse_options() must + * guarantee this. 
+ */ + if (tp->saw_tstamp) { + if (tcp_paws_discard(tp)) { + if (!th->rst) { + tcp_send_ack(sk); + goto discard; + } + } + tcp_replace_ts_recent(tp,skb->end_seq); + } + } - if (!tcp_sequence(tp, skb->seq, skb->end_seq)) - { - if (!th->rst) - { + /* step 1: check sequence number */ + if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { + if (!th->rst) { tcp_send_ack(sk); goto discard; } } - - /* - * step 2: check RST bit - */ - - if(th->rst) - { + /* step 2: check RST bit */ + if(th->rst) { tcp_reset(sk,skb); goto discard; } - /* - * step 3: check security and precedence - * [ignored] - */ + /* step 3: check security and precedence [ignored] */ - /* - * step 4: + /* step 4: * * Check for a SYN, and ensure it matches the SYN we were * first sent. We have to handle the rather unusual (but valid) @@ -1700,24 +1675,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * original syn. */ - if (th->syn && skb->seq!=sk->syn_seq) - { + if (th->syn && skb->seq!=sk->syn_seq) { tcp_reset(sk, skb); return 1; } - /* - * step 5: check the ACK field - */ - - if (th->ack) - { + /* step 5: check the ACK field */ + if (th->ack) { int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len); switch(sk->state) { case TCP_SYN_RECV: - if (acceptable) - { + if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); sk->dummy_th.dest=th->source; sk->copied_seq = tp->rcv_nxt; @@ -1730,36 +1699,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; - } - else + } else return 1; break; case TCP_FIN_WAIT1: - - if (tp->snd_una == sk->write_seq) - { + if (tp->snd_una == sk->write_seq) { sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); - if (!sk->dead) + if (!sk->dead) sk->state_change(sk); } break; - case TCP_CLOSING: - - if (tp->snd_una == sk->write_seq) - { + case TCP_CLOSING: + if (tp->snd_una == sk->write_seq) tcp_time_wait(sk); - if (!sk->dead) - sk->state_change(sk); - } break; case TCP_LAST_ACK: - - if (tp->snd_una == sk->write_seq) - { + if (tp->snd_una == sk->write_seq) { sk->shutdown = SHUTDOWN_MASK; tcp_set_state(sk,TCP_CLOSE); if (!sk->dead) @@ -1769,49 +1728,34 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_TIME_WAIT: - /* - * keep us in TIME_WAIT until we stop getting + /* Keep us in TIME_WAIT until we stop getting * packets, reset the timeout. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); break; - } - } - else + } else goto discard; - step6: - - /* - * step 6: check the URG bit - */ - +step6: + /* step 6: check the URG bit */ tcp_urg(sk, th, len); - /* - * step 7: process the segment text - */ - + /* step 7: process the segment text */ switch (sk->state) { case TCP_CLOSE_WAIT: case TCP_CLOSING: - if (!before(skb->seq, sk->fin_seq)) + if (!before(skb->seq, tp->fin_seq)) break; case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: - - /* - * RFC 793 says to queue data in this states, - * RFC 1122 says we MUST send a reset. - * BSD 4.4 also does reset. + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. 
*/ - - if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) - { - if (after(skb->end_seq - th->fin, tp->rcv_nxt)) - { + if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { + if (after(skb->end_seq - th->fin, tp->rcv_nxt)) { tcp_reset(sk, skb); return 1; } @@ -1819,25 +1763,19 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_ESTABLISHED: queued = tcp_data(skb, sk, len); - break; + break; } - /* - * step 8: check the FIN bit - */ - + /* step 8: check the FIN bit */ if (th->fin) - { tcp_fin(skb, sk, th); - } tcp_data_snd_check(sk); tcp_ack_snd_check(sk); if (queued) return 0; - discard: - +discard: kfree_skb(skb, FREE_READ); return 0; } @@ -1847,30 +1785,22 @@ int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, { int val = sysctl_tcp_cong_avoidance; int retv; - - retv = proc_dointvec(ctl, write, filp, buffer, lenp); - - if (write) - { + + retv = proc_dointvec(ctl, write, filp, buffer, lenp); + + if (write) { switch (sysctl_tcp_cong_avoidance) { - case 0: - tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; - break; - case 1: - tcp_sys_cong_ctl_f = &tcp_cong_avoid_vegas; - break; - default: - retv = -EINVAL; - sysctl_tcp_cong_avoidance = val; - } + case 0: + tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; + break; + case 1: + tcp_sys_cong_ctl_f = &tcp_cong_avoid_vegas; + break; + default: + retv = -EINVAL; + sysctl_tcp_cong_avoidance = val; + }; } - + return retv; } - -/* - * Local variables: - * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp_input.o tcp_input.c" - * c-file-style: "Linux" - * End: - */ |
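
Below are a few standalone sketches of the algorithms this import reworks. First, the RTT machinery: the patch splits the old estimator into tcp_rtt_estimator(), tcp_set_rto() and tcp_bound_rto() so the RFC 1323 timestamp path can reuse the pieces. The following is a minimal model of that arithmetic, assuming an illustrative struct, HZ value and driver loop; it keeps only the classic rto = srtt + 4*mdev step and leaves out the extra congestion-window-dependent term the patch adds in tcp_set_rto().

```c
/* Standalone sketch of the SRTT/MDEV estimator and RTO clamping that the
 * patch splits into tcp_rtt_estimator()/tcp_set_rto()/tcp_bound_rto().
 * The struct, HZ value and main() below are illustrative, not the kernel's. */
#include <stdio.h>

#define HZ 100                      /* assumed tick rate for this example */

struct rtt_state {
	long srtt;                  /* smoothed RTT, scaled by 8 */
	long mdev;                  /* mean deviation, scaled by 4 */
	long rto;                   /* retransmission timeout, in ticks */
};

/* Van Jacobson, SIGCOMM '88: srtt += 1/8 err, mdev += 1/4 (|err| - mdev). */
static void rtt_estimator(struct rtt_state *tp, long mrtt)
{
	long m = mrtt ? mrtt : 1;   /* never feed a zero sample */

	if (tp->srtt != 0) {
		m -= (tp->srtt >> 3);   /* m is now the error in the estimate */
		tp->srtt += m;          /* srtt = 7/8 srtt + 1/8 new */
		if (m < 0)
			m = -m;
		m -= (tp->mdev >> 2);   /* similar update on mdev */
		tp->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
	} else {
		tp->srtt = m << 3;      /* first sample: take it as the rtt */
		tp->mdev = m << 2;      /* seed mdev so the first rto is generous */
	}
}

/* RTO = srtt + 4*mdev in unscaled units (backoff is handled elsewhere). */
static void set_rto(struct rtt_state *tp)
{
	tp->rto = (tp->srtt >> 3) + tp->mdev;
}

/* Clamp to [HZ/5, 120*HZ]: the floor works around fixed BSD delayed acks,
 * the ceiling matches the assumed maximum packet lifetime in the patch. */
static void bound_rto(struct rtt_state *tp)
{
	if (tp->rto > 120 * HZ)
		tp->rto = 120 * HZ;
	if (tp->rto < HZ / 5)
		tp->rto = HZ / 5;
}

int main(void)
{
	struct rtt_state tp = { 0, 0, 0 };
	long samples[] = { 30, 35, 28, 90, 32 };   /* RTT samples in ticks */

	for (int i = 0; i < 5; i++) {
		rtt_estimator(&tp, samples[i]);
		set_rto(&tp);
		bound_rto(&tp);
		printf("sample=%ld srtt=%ld mdev=%ld rto=%ld\n",
		       samples[i], tp.srtt >> 3, tp.mdev >> 2, tp.rto);
	}
	return 0;
}
```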
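Second, the RFC 1323 support added here includes tcp_fast_parse_options(), which short-circuits the common case of a header whose only option is a timestamp encoded as NOP, NOP, TIMESTAMP, length 10 in a single aligned 32-bit word. A hedged sketch of that test follows, using the standard RFC 793/1323 option codepoints; the struct and function names are placeholders, not the kernel's.

```c
/* Sketch of the aligned-timestamp fast path used by the new
 * tcp_fast_parse_options(): if the header carries exactly three extra
 * 32-bit words and the first is NOP,NOP,TIMESTAMP,10, the next two words
 * are tsval/tsecr. Anything else falls back to the slow option parser. */
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

struct ts_opt {
	int      saw_tstamp;
	uint32_t rcv_tsval;
	uint32_t rcv_tsecr;
};

/* 'words' points at the option area, 'doff' is the data offset in 32-bit
 * words (as carried in the TCP header). Returns 1 on the fast path. */
static int fast_parse_timestamp(const uint32_t *words, int doff,
				struct ts_opt *tp)
{
	tp->saw_tstamp = 0;

	if (doff == 5)              /* bare 20-byte header, no options */
		return 0;

	if (doff == 5 + 3 &&
	    words[0] == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			      (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
		tp->saw_tstamp = 1;
		tp->rcv_tsval = ntohl(words[1]);
		tp->rcv_tsecr = ntohl(words[2]);
		return 1;
	}

	return 0;                   /* caller falls back to the slow parser */
}

int main(void)
{
	struct ts_opt tp;
	uint32_t opts[3];

	opts[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
	opts[1] = htonl(123456);    /* tsval */
	opts[2] = htonl(654321);    /* tsecr */

	return fast_parse_timestamp(opts, 8, &tp) ? 0 : 1;
}
```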
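Third, the rewritten tcp_fast_retrans() follows the Reno rules from draft-stevens-tcpca-spec-01 and, per the changelog, never lets the congestion window drop to zero. Below is a simplified model of just the duplicate-ACK window arithmetic; the Floyd-style high_seq block and the optional Hoe-style secondary retransmits from the patch are omitted, and all names are illustrative.

```c
/* Reno-style duplicate-ACK handling, modeled on the rewritten
 * tcp_fast_retrans(): on the 3rd dup ack halve the window into ssthresh,
 * inflate cwnd by 3 and retransmit; inflate by 1 per further dup ack;
 * deflate back to ssthresh (never below 1) once new data is acked. */
struct cwnd_state {
	unsigned int snd_cwnd;
	unsigned int snd_ssthresh;
	unsigned int dup_acks;
};

static unsigned int umax(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

/* new_data_acked != 0 when the ack advances snd_una or carries data or a
 * window update; retransmit() stands in for tcp_do_retransmit(sk, 0). */
static void fast_retrans(struct cwnd_state *tp, int new_data_acked,
			 void (*retransmit)(void))
{
	if (!new_data_acked) {
		tp->dup_acks++;
		if (tp->dup_acks == 3) {
			/* ssthresh = max(cwnd/2, 2), then cwnd = ssthresh + 3 */
			tp->snd_ssthresh = umax(tp->snd_cwnd >> 1, 2);
			tp->snd_cwnd = tp->snd_ssthresh + 3;
			retransmit();
		} else if (tp->dup_acks > 3) {
			tp->snd_cwnd++;   /* one segment per further dup ack */
		}
	} else {
		if (tp->dup_acks >= 3)
			tp->snd_cwnd = umax(tp->snd_ssthresh, 1); /* never 0 */
		tp->dup_acks = 0;
	}
}

static void noop_retransmit(void) { }

int main(void)
{
	struct cwnd_state tp = { 10, 0, 0 };

	for (int i = 0; i < 4; i++)            /* four duplicate acks in a row */
		fast_retrans(&tp, 0, noop_retransmit);
	fast_retrans(&tp, 1, noop_retransmit); /* new data finally acked */
	return tp.snd_cwnd == 5 ? 0 : 1;       /* deflated back to ssthresh */
}
```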