/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_input.c,v 1.98 1998/03/23 22:54:48 davem Exp $
 *
 * Authors:	Ross Biro,
 *		Fred N. van Kempen,
 *		Mark Evans,
 *		Corey Minyard
 *		Florian La Roche,
 *		Charles Hedrick,
 *		Linus Torvalds,
 *		Alan Cox,
 *		Matthew Dillon,
 *		Arnt Gulbrandsen,
 *		Jorge Cwik,
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
#include <linux/ipsec.h>

typedef void (*tcp_sys_cong_ctl_t)(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt);

static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt);
static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt);

#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

extern int sysctl_tcp_fin_timeout;

/* These are on by default so the code paths get tested.
 * For the final 2.2 this may be undone at our discretion. -DaveM
 */
int sysctl_tcp_timestamps = 1;
int sysctl_tcp_window_scaling = 1;
int sysctl_tcp_sack = 1;
int sysctl_tcp_hoe_retransmits = 1;

int sysctl_tcp_cong_avoidance;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
int sysctl_tcp_rfc1337;

static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_delack_estimator(struct tcp_opt *tp)
{
	if(tp->ato == 0) {
		tp->lrcvtime = jiffies;

		/* Help sender leave slow start quickly,
		 * this sets our initial ato value.
		 */
		tcp_enter_quickack_mode(tp);
	} else {
		int m = jiffies - tp->lrcvtime;

		tp->lrcvtime = jiffies;
		if(m <= 0)
			m = 1;
		if(m > tp->rto)
			tp->ato = tp->rto;
		else
			tp->ato = (tp->ato >> 1) + m;

		/* We are not in "quick ack" mode. */
		if(tp->ato <= (HZ/100))
			tp->ato = ((HZ/100)*2);
	}
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
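 * (They are tcp_rtt_estimator(), tcp_set_rto() and tcp_bound_rto(), below.)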
* To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) { long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev */ if(m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) m = -m; /* m is now abs(error) */ m -= (tp->mdev >> 2); /* similar update on mdev */ tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ } else { /* no previous measure. */ tp->srtt = m<<3; /* take the measured time to be rtt */ tp->mdev = m<<2; /* make sure rto = 3*rtt */ } } /* Calculate rto without backoff. This is the second half of Van Jacobsons * routine refered to above. */ static __inline__ void tcp_set_rto(struct tcp_opt *tp) { tp->rto = (tp->srtt >> 3) + tp->mdev; tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); } /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound * on packet lifetime in the internet. We need the HZ/5 lower * bound to behave correctly against BSD stacks with a fixed * delayed ack. * FIXME: It's not entirely clear this lower bound is the best * way to avoid the problem. Is it possible to drop the lower * bound and still avoid trouble with BSD stacks? Perhaps * some modification to the RTO calculation that takes delayed * ack bais into account? This needs serious thought. -- erics */ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { if (tp->rto > 120*HZ) tp->rto = 120*HZ; if (tp->rto < HZ/5) tp->rto = HZ/5; } /* WARNING: this must not be called if tp->saw_timestamp was false. */ extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq) { /* From draft-ietf-tcplw-high-performance: the correct * test is last_ack_sent <= end_seq. * (RFC1323 stated last_ack_sent < end_seq.) */ if (!before(end_seq,tp->last_ack_sent)) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = jiffies; } } #define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) extern __inline__ int tcp_paws_discard(struct tcp_opt *tp) { /* ts_recent must be younger than 24 days */ return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) || ((s32)(tp->rcv_tsval-tp->ts_recent) < 0)); } static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { u32 end_window = tp->rcv_wup + tp->rcv_wnd; if (tp->rcv_wnd) { if (!before(seq, tp->rcv_nxt) && before(seq, end_window)) return 1; if ((end_seq - seq) && after(end_seq, tp->rcv_nxt) && !after(end_seq, end_window)) return 1; } return 0; } /* This functions checks to see if the tcp header is actually acceptable. */ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { if (seq == tp->rcv_nxt) return (tp->rcv_wnd || (end_seq == seq)); return __tcp_sequence(tp, seq, end_seq); } /* When we get a reset we do this. */ static void tcp_reset(struct sock *sk, struct sk_buff *skb) { sk->zapped = 1; /* We want the right error as BSD sees it (and indeed as we do). 
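 * (A reset in SYN_SENT means our connection attempt was refused, so
 *  ECONNREFUSED; in CLOSE_WAIT the peer is tearing down a connection we
 *  may still be writing to, so EPIPE; every other state reports
 *  ECONNRESET.)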
*/ switch (sk->state) { case TCP_SYN_SENT: sk->err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: sk->err = EPIPE; break; default: sk->err = ECONNRESET; }; tcp_set_state(sk,TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); } /* This tags the retransmission queue when SACKs arrive. */ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int i = nsacks; while(i--) { struct sk_buff *skb = skb_peek(&sk->write_queue); __u32 start_seq = ntohl(sp->start_seq); __u32 end_seq = ntohl(sp->end_seq); while((skb != NULL) && (skb != tp->send_head) && (skb != (struct sk_buff *)&sk->write_queue)) { /* We play conservative, we don't allow SACKS to partially * tag a sequence space. */ if(!after(start_seq, skb->seq) && !before(end_seq, skb->end_seq)) TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; skb = skb->next; } sp++; /* Move on to the next SACK block. */ } } /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. */ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); tp->saw_tstamp = 0; while(length>0) { int opcode=*ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) break; /* don't parse partial options */ switch(opcode) { case TCPOPT_MSS: if(opsize==TCPOLEN_MSS && th->syn) { tp->in_mss = ntohs(*(__u16 *)ptr); if (tp->in_mss == 0) tp->in_mss = 536; } break; case TCPOPT_WINDOW: if(opsize==TCPOLEN_WINDOW && th->syn) if (!no_fancy && sysctl_tcp_window_scaling) { tp->wscale_ok = 1; tp->snd_wscale = *(__u8 *)ptr; } break; case TCPOPT_TIMESTAMP: if(opsize==TCPOLEN_TIMESTAMP) { if (sysctl_tcp_timestamps && !no_fancy) { tp->tstamp_ok = 1; tp->saw_tstamp = 1; tp->rcv_tsval = ntohl(*(__u32 *)ptr); tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); } } break; case TCPOPT_SACK_PERM: if(opsize==TCPOLEN_SACK_PERM && th->syn) { if (sysctl_tcp_sack && !no_fancy) { tp->sack_ok = 1; tp->num_sacks = 0; } } break; case TCPOPT_SACK: if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && sysctl_tcp_sack && (sk != NULL) && !th->syn) { int sack_bytes = opsize - TCPOLEN_SACK_BASE; if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) { int num_sacks = sack_bytes >> 3; struct tcp_sack_block *sackp; sackp = (struct tcp_sack_block *)ptr; tcp_sacktag_write_queue(sk, sackp, num_sacks); } } }; ptr+=opsize-2; length-=opsize; }; } } /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp) { /* If we didn't send out any options ignore them all. 
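 * Otherwise the fast path below only recognizes two cases: a bare header
 * with no options at all, or the aligned RFC 1323 layout (NOP, NOP,
 * TIMESTAMP, length 10, then TSval and TSecr); any other option mix is
 * handed to the full tcp_parse_options().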
*/ if (tp->tcp_header_len == sizeof(struct tcphdr)) return 0; if (th->doff == sizeof(struct tcphdr)>>2) { tp->saw_tstamp = 0; return 0; } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { __u32 *ptr = (__u32 *)(th + 1); if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { tp->saw_tstamp = 1; tp->rcv_tsval = ntohl(*++ptr); tp->rcv_tsecr = ntohl(*++ptr); return 1; } } tcp_parse_options(sk, th, tp, 0); return 1; } #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ static __inline__ void clear_fast_retransmit(struct sock *sk) { struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); if (tp->dup_acks > 3) { tp->retrans_head = NULL; tp->snd_cwnd = max(tp->snd_ssthresh, 1); } tp->dup_acks = 0; } /* NOTE: This code assumes that tp->dup_acks gets cleared when a * retransmit timer fires. */ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) { struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); /* * Note: If not_dup is set this implies we got a * data carrying packet or a window update. * This carries no new information about possible * lost packets, so we have to ignore it for the purposes * of counting duplicate acks. Ideally this does not imply we * should stop our fast retransmit phase, more acks may come * later without data to help us. Unfortunately this would make * the code below much more complex. For now if I see such * a packet I clear the fast retransmit phase. */ if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { /* This is the standard reno style fast retransmit branch. */ /* 1. When the third duplicate ack is received, set ssthresh * to one half the current congestion window, but no less * than two segments. Retransmit the missing segment. */ if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; if (tp->dup_acks == 3) { tp->dup_acks++; tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); tp->snd_cwnd = tp->snd_ssthresh + 3; tp->high_seq = tp->snd_nxt; tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } /* 2. Each time another duplicate ACK arrives, increment * cwnd by the segment size. [...] Transmit a packet... * * Packet transmission will be done on normal flow processing * since we're not in "retransmit mode" */ if (tp->dup_acks > 3) tp->snd_cwnd++; } else if (tp->high_seq != 0) { /* In this branch we deal with clearing the Floyd style * block on duplicate fast retransmits, and if requested * we do Hoe style secondary fast retransmits. */ if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) { /* Once we have acked all the packets up to high_seq * we are done this fast retransmit phase. * Alternatively data arrived. In this case we * Have to abort the fast retransmit attempt. * Note that we do want to accept a window * update since this is expected with Hoe's algorithm. */ clear_fast_retransmit(sk); /* After we have cleared up to high_seq we can * clear the Floyd style block. */ if (after(ack, tp->high_seq)) tp->high_seq = 0; } else if (tp->dup_acks >= 3) { if (sysctl_tcp_hoe_retransmits) { /* Hoe Style. We didn't ack the whole * window. Take this as a cue that * another packet was lost and retransmit it. * Don't muck with the congestion window here. 
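 * (This is Hoe's partial-ACK heuristic: during recovery an ACK that
 *  advances snd_una but stops short of high_seq is taken as evidence of
 *  another hole, so the segment at the head of the retransmit queue is
 *  resent at once rather than waiting for three more duplicate ACKs.)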
* Note that we have to be careful not to * act if this was a window update and it * didn't ack new data, since this does * not indicate a packet left the system. * We can test this by just checking * if ack changed from snd_una, since * the only way to get here without advancing * from snd_una is if this was a window update. */ if (ack != tp->snd_una && before(ack, tp->high_seq)) { tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } else { /* Reno style. We didn't ack the whole * window, now we have to drop out of * fast retransmit and wait for a timeout. */ clear_fast_retransmit(sk); } } } } /* * TCP slow start and congestion avoidance in two flavors: * RFC 1122 and TCP Vegas. * * This is a /proc/sys configurable option. */ #define SHIFT_FACTOR 16 static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int actual, expected; unsigned int inv_rtt, inv_basertt, inv_basebd; u32 snt_bytes; /* From: * TCP Vegas: New Techniques for Congestion * Detection and Avoidance. * * Warning: This code is a scratch implementation taken * from the paper only. The code they distribute seams * to have improved several things over the initial spec. */ if (!seq_rtt) seq_rtt = 1; if (tp->basertt) tp->basertt = min(seq_rtt, tp->basertt); else tp->basertt = seq_rtt; /* actual = throughput for this segment. * expected = number_of_bytes in transit / BaseRTT */ snt_bytes = ack - seq; inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt; inv_basertt = (1 << SHIFT_FACTOR) / tp->basertt; actual = snt_bytes * inv_rtt; expected = (tp->snd_nxt - tp->snd_una) * inv_basertt; inv_basebd = sk->mss * inv_basertt; /* Slow Start */ if (tp->snd_cwnd < tp->snd_ssthresh && (seq == tp->snd_nxt || (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd))) { /* "Vegas allows exponential growth only every other RTT" */ if (tp->snd_cwnd_cnt++) { tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } else { /* Congestion Avoidance */ if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd) { /* Increase Linearly */ if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } if (expected - actual >= TCP_VEGAS_BETA * inv_basebd) { /* Decrease Linearly */ if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { tp->snd_cwnd--; tp->snd_cwnd_cnt = 0; } /* Never less than 2 segments. */ if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; } } } static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. Because we keep cong_window in * integral mss's, we can't do cwnd += 1 / cwnd. * Instead, maintain a counter and increment it once every * cwnd times. * FIXME: Check to be sure the mathematics works out right * on this trick when we have to reduce the congestion window. * The snd_cwnd_cnt has to be reset properly when reduction events * happen. * FIXME: What happens when the congestion window gets larger * than the maximum receiver window by some large factor * Suppose the pipeline never looses packets for a long * period of time, then traffic increases causing packet loss. * The congestion window should be reduced, but what it should * be reduced to is not clear, since 1/2 the old window may * still be larger than the maximum sending rate we ever achieved. */ if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ tp->snd_cwnd++; } else { /* In dangerous area, increase slowly. 
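 * (Roughly one extra segment per window's worth of ACKs, i.e. about one
 *  per round trip.)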
In theory this is * tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } else tp->snd_cwnd_cnt++; } } /* Remove acknowledged frames from the retransmission queue. */ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, __u32 *seq_rtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; unsigned long now = jiffies; int acked = 0; while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { /* If our packet is before the ack sequence we can * discard it as it's confirmed to have arrived at * the other end. */ if (after(skb->end_seq, ack)) break; /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not * true data, and if we misinform our callers that * this ACK acks real data, we will erroneously exit * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) acked |= FLAG_RETRANS_DATA_ACKED; } else { tp->retrans_head = NULL; } tp->packets_out--; *seq = skb->seq; *seq_rtt = now - skb->when; skb_unlink(skb); kfree_skb(skb); } if (acked) tp->retrans_head = NULL; return acked; } static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Our probe was answered. */ tp->probes_out = 0; /* Was it a usable window open? */ /* should always be non-null */ if (tp->send_head != NULL && !before (ack + tp->snd_wnd, tp->send_head->end_seq)) { tp->backoff = 0; tp->pending = 0; tcp_clear_xmit_timer(sk, TIME_PROBE0); } else { tcp_reset_xmit_timer(sk, TIME_PROBE0, min(tp->rto << tp->backoff, 120*HZ)); } } /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, u32 seq, u32 ack, int flag) { __u32 seq_rtt = (jiffies-tp->rcv_tsecr); tcp_rtt_estimator(tp, seq_rtt); if (tp->retransmits) { if (tp->packets_out == 0) { tp->retransmits = 0; tp->backoff = 0; tcp_set_rto(tp); } else { /* Still retransmitting, use backoff */ tcp_set_rto(tp); tp->rto = tp->rto << tp->backoff; } } else { tcp_set_rto(tp); if (flag & FLAG_DATA_ACKED) (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); } /* NOTE: safe here so long as cong_ctl doesn't use rto */ tcp_bound_rto(tp); } static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); long when = tp->rto - (jiffies - skb->when); /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. The * congestion window is handled properly by that code. */ if (tp->retransmits) { tp->retrans_head = NULL; tcp_xmit_retransmit_queue(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } else { tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } } /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack_seq, u32 ack, int len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int flag = 0; u32 seq = 0; u32 seq_rtt = 0; if(sk->zapped) return(1); /* Dead, can't ack any more so why bother */ if (tp->pending == TIME_KEEPOPEN) tp->probes_out = 0; tp->rcv_tstamp = jiffies; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. 
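 * ("Newer than sent" means after snd_nxt, "older than previous acks"
 *  means before snd_una; either way it tells us nothing about the data
 *  currently in flight.)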
*/ if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) goto uninteresting_ack; dst_confirm(sk->dst_cache); /* If there is data set flag 1 */ if (len != th->doff*4) { flag |= FLAG_DATA; tcp_delack_estimator(tp); } /* Update our send window. */ /* This is the window update code as per RFC 793 * snd_wl{1,2} are used to prevent unordered * segments from shrinking the window */ if (before(tp->snd_wl1, ack_seq) || (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { u32 nwin = ntohs(th->window) << tp->snd_wscale; if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { flag |= FLAG_WIN_UPDATE; tp->snd_wnd = nwin; tp->snd_wl1 = ack_seq; tp->snd_wl2 = ack; if (nwin > tp->max_window) tp->max_window = nwin; } } /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->err_soft = 0; /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ if (tp->pending == TIME_PROBE0) tcp_ack_probe(sk, ack); /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If we have a timestamp, we always do rtt estimates. */ if (tp->saw_tstamp) { tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); } else { /* If we were retransmiting don't count rtt estimate. */ if (tp->retransmits) { if (tp->packets_out == 0) tp->retransmits = 0; } else { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine * rtt estimates. Also, we must not reset the * backoff for rto until we get a non-retransmitted * packet. This allows us to deal with a situation * where the network delay has increased suddenly. * I.e. Karn's algorithm. (SIGCOMM '87, p5.) */ if ((flag & FLAG_DATA_ACKED) && !(flag & FLAG_RETRANS_DATA_ACKED)) { tp->backoff = 0; tcp_rtt_estimator(tp, seq_rtt); tcp_set_rto(tp); tcp_bound_rto(tp); (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); } } } if (tp->packets_out) { if (flag & FLAG_DATA_ACKED) tcp_ack_packets_out(sk, tp); } else { tcp_clear_xmit_timer(sk, TIME_RETRANS); } flag &= (FLAG_DATA | FLAG_WIN_UPDATE); if ((ack == tp->snd_una && tp->packets_out && flag == 0) || (tp->high_seq != 0)) { tcp_fast_retrans(sk, ack, flag); } else { /* Clear any aborted fast retransmit starts. */ tp->dup_acks = 0; } /* Remember the highest ack received. */ tp->snd_una = ack; return 1; uninteresting_ack: SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); return 0; } /* New-style handling of TIME_WAIT sockets. */ static void tcp_timewait_kill(unsigned long __arg) { struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg; /* Zap the timer. */ del_timer(&tw->timer); /* Unlink from various places. */ if(tw->bind_next) tw->bind_next->bind_pprev = tw->bind_pprev; *(tw->bind_pprev) = tw->bind_next; if(tw->tb->owners == NULL) tcp_inc_slow_timer(TCP_SLT_BUCKETGC); if(tw->next) tw->next->pprev = tw->pprev; *tw->pprev = tw->next; /* We decremented the prot->inuse count when we entered TIME_WAIT * and the sock from which this came was destroyed. */ tw->sklist_next->sklist_prev = tw->sklist_prev; tw->sklist_prev->sklist_next = tw->sklist_next; /* Ok, now free it up. */ kmem_cache_free(tcp_timewait_cachep, tw); } /* We come here as a special case from the AF specific TCP input processing, * and the SKB has no owner. Essentially handling this is very simple, * we just keep silently eating rx'd packets until none show up for the * entire timeout period. 
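 * (The timeout is TCP_TIMEWAIT_LEN; a plain ACK arriving in this state
 *  simply restarts the timer below.)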
The only special cases are for BSD TIME_WAIT * reconnects and SYN/RST bits being set in the TCP header. */ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct tcphdr *th, void *opt, __u16 len) { /* RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) { struct sock *sk; struct tcp_func *af_specific = tw->af_specific; __u32 isn; isn = tw->rcv_nxt + 128000; if(isn == 0) isn++; tcp_timewait_kill((unsigned long)tw); sk = af_specific->get_sock(skb, th); if(sk == NULL || !ipsec_sk_policy(sk,skb)) return 0; skb_set_owner_r(skb, sk); af_specific = sk->tp_pinfo.af_tcp.af_specific; if(af_specific->conn_request(sk, skb, opt, isn) < 0) return 1; /* Toss a reset back. */ return 0; /* Discard the frame. */ } /* Check RST or SYN */ if(th->rst || th->syn) { /* This is TIME_WAIT assasination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if(sysctl_tcp_rfc1337 == 0) tcp_timewait_kill((unsigned long)tw); if(!th->rst) return 1; /* toss a reset back */ } else { if(th->ack) { /* In this case we must reset the TIMEWAIT timer. */ mod_timer(&tw->timer, jiffies + TCP_TIMEWAIT_LEN); } } return 0; /* Discard the frame. */ } /* Enter the time wait state. This is always called from BH * context. Essentially we whip up a timewait bucket, copy the * relevant info into it from the SK, and mess with hash chains * and list linkage. */ static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { struct sock **head, *sktw; /* Step 1: Remove SK from established hash. */ if(sk->next) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; tcp_reg_zap(sk); /* Step 2: Put TW into bind hash where SK was. */ tw->tb = (struct tcp_bind_bucket *)sk->prev; if((tw->bind_next = sk->bind_next) != NULL) sk->bind_next->bind_pprev = &tw->bind_next; tw->bind_pprev = sk->bind_pprev; *sk->bind_pprev = (struct sock *)tw; /* Step 3: Same for the protocol sklist. */ (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw; (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw; sk->sklist_next = NULL; sk->prot->inuse--; /* Step 4: Hash TW into TIMEWAIT half of established hash table. */ head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)]; sktw = (struct sock *)tw; if((sktw->next = *head) != NULL) (*head)->pprev = &sktw->next; *head = sktw; sktw->pprev = head; } void tcp_time_wait(struct sock *sk) { struct tcp_tw_bucket *tw; tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); if(tw != NULL) { /* Give us an identity. 
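 * Copy the addressing and sequence state the TIME_WAIT bucket needs to
 * answer or reject segments on its own: addresses, ports, family,
 * rcv_nxt and the af_specific ops.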
*/ tw->daddr = sk->daddr; tw->rcv_saddr = sk->rcv_saddr; tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; tw->sport = sk->sport; tw->dport = sk->dport; tw->family = sk->family; tw->reuse = sk->reuse; tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if(tw->family == AF_INET6) { memcpy(&tw->v6_daddr, &sk->net_pinfo.af_inet6.daddr, sizeof(struct in6_addr)); memcpy(&tw->v6_rcv_saddr, &sk->net_pinfo.af_inet6.rcv_saddr, sizeof(struct in6_addr)); } #endif /* Linkage updates. */ tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ init_timer(&tw->timer); tw->timer.function = tcp_timewait_kill; tw->timer.data = (unsigned long) tw; tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN; add_timer(&tw->timer); /* CLOSE the SK. */ if(sk->state == TCP_ESTABLISHED) tcp_statistics.TcpCurrEstab--; sk->state = TCP_CLOSE; net_reset_timer(sk, TIME_DONE, min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME)); } else { /* Sorry, we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ tcp_set_state(sk, TCP_CLOSE); } /* Prevent rcvmsg/sndmsg calls, and wake people up. */ sk->shutdown = SHUTDOWN_MASK; if(!sk->dead) sk->state_change(sk); } /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence * space. Not before when we get holes. * * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT * (and thence onto LAST-ACK and finally, CLOSE, we never enter * TIME-WAIT) * * If we are in FINWAIT-1, a received FIN indicates simultaneous * close and we go into CLOSING (and later onto TIME-WAIT) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq; tcp_send_ack(sk); if (!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 1); } switch(sk->state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); if (th->rst) sk->shutdown = SHUTDOWN_MASK; break; case TCP_CLOSE_WAIT: case TCP_CLOSING: /* Received a retransmission of the FIN, do * nothing. */ break; case TCP_LAST_ACK: /* RFC793: Remain in the LAST-ACK state. */ break; case TCP_FIN_WAIT1: /* This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. * * This causes a WRITE timeout, which will either * move on to TIME_WAIT when we timeout, or resend * the FIN properly (maybe we get rid of that annoying * FIN lost hang). The TIME_WRITE code is already * correct for handling this timeout. */ tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_time_wait(sk); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these * cases we should never reach this piece of code. */ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); break; }; } /* These routines update the SACK block as out-of-order packets arrive or * in-order packets close up the sequence space. */ static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp) { int this_sack, num_sacks = tp->num_sacks; struct tcp_sack_block *swalk = &tp->selective_acks[0]; /* If more than one SACK block, see if the recent change to SP eats into * or hits the sequence space of other SACK blocks, if so coalesce. 
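 * For illustration: if SP covers 200-300 and an existing SWALK block
 * covers 100-250, SP's start falls inside SWALK, so SP is widened to
 * 100-300 and SWALK is zapped below.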
*/ if(num_sacks != 1) { for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) { if(swalk == sp) continue; /* First case, bottom of SP moves into top of the * sequence space of SWALK. */ if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) { sp->start_seq = swalk->start_seq; goto coalesce; } /* Second case, top of SP moves into bottom of the * sequence space of SWALK. */ if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) { sp->end_seq = swalk->end_seq; goto coalesce; } } } /* SP is the only SACK, or no coalescing cases found. */ return; coalesce: /* Zap SWALK, by moving every further SACK up by one slot. * Decrease num_sacks. */ for(this_sack += 1; this_sack < num_sacks; this_sack++, swalk++) { struct tcp_sack_block *next = (swalk + 1); swalk->start_seq = next->start_seq; swalk->end_seq = next->end_seq; } tp->num_sacks--; } static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) { __u32 tmp; tmp = sack1->start_seq; sack1->start_seq = sack2->start_seq; sack2->start_seq = tmp; tmp = sack1->end_seq; sack1->end_seq = sack2->end_seq; sack2->end_seq = tmp; } static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct tcp_sack_block *sp = &tp->selective_acks[0]; /* Optimize for the common case, new ofo frames arrive * "in order". ;-) This also satisfies the requirements * of RFC2018 about ordering of SACKs. */ if(sp->end_seq == skb->seq) { sp->end_seq = skb->end_seq; tcp_sack_maybe_coalesce(tp, sp); } else if(sp->start_seq == skb->end_seq) { /* Re-ordered arrival, in this case, can be optimized * as well. */ sp->start_seq = skb->seq; tcp_sack_maybe_coalesce(tp, sp); } else { int cur_sacks = tp->num_sacks; int max_sacks = (tp->tstamp_ok ? 3 : 4); /* Oh well, we have to move things around. * Try to find a SACK we can tack this onto. */ if(cur_sacks > 1) { struct tcp_sack_block *swap = sp + 1; int this_sack; for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { if((swap->end_seq == skb->seq) || (swap->start_seq == skb->end_seq)) { if(swap->end_seq == skb->seq) swap->end_seq = skb->end_seq; else swap->start_seq = skb->seq; tcp_sack_swap(sp, swap); tcp_sack_maybe_coalesce(tp, sp); return; } } } /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. */ while(cur_sacks >= 1) { struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; struct tcp_sack_block *prev = (this - 1); this->start_seq = prev->start_seq; this->end_seq = prev->end_seq; cur_sacks--; } /* Build head SACK, and we're done. */ sp->start_seq = skb->seq; sp->end_seq = skb->end_seq; if(tp->num_sacks < max_sacks) tp->num_sacks++; } } static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int num_sacks = tp->num_sacks; int this_sack; /* We know this removed SKB will eat from the front of a SACK. */ for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { if(sp->start_seq == skb->seq) break; } /* This should only happen if so many SACKs get built that some get * pushed out before we get here, or we eat some in sequence packets * which are before the first SACK block. */ if(this_sack >= num_sacks) return; sp->start_seq = skb->end_seq; if(!before(sp->start_seq, sp->end_seq)) { /* Zap this SACK, by moving forward any other SACKS. 
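 * (Its start has caught up with or passed its end, so the block no
 *  longer covers any sequence space.)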
*/ for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { struct tcp_sack_block *next = (sp + 1); sp->start_seq = next->start_seq; sp->end_seq = next->end_seq; } tp->num_sacks--; } } static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int num_sacks = tp->num_sacks; int this_sack; for(this_sack = 0; this_sack < num_sacks; this_sack++, tp++) { if(sp->end_seq == old_skb->end_seq) break; } if(this_sack >= num_sacks) return; sp->end_seq = new_skb->end_seq; } /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ static void tcp_ofo_queue(struct sock *sk) { struct sk_buff *skb; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); while ((skb = skb_peek(&tp->out_of_order_queue))) { if (after(skb->seq, tp->rcv_nxt)) break; if (!after(skb->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); skb_unlink(skb); kfree_skb(skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); if(tp->sack_ok) tcp_sack_remove_skb(tp, skb); skb_unlink(skb); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); } } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct sk_buff *skb1; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. * Out of sequence packets to out_of_order_queue. */ if (skb->seq == tp->rcv_nxt) { /* Ok. In sequence. */ queue_and_out: dst_confirm(sk->dst_cache); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; if(skb->h.th->fin) { tcp_fin(skb, sk, skb->h.th); } else { tp->delayed_acks++; /* Tiny-grams with PSH set make us ACK quickly. */ if(skb->h.th->psh && (skb->len < (sk->mss >> 1))) tp->ato = HZ/50; } /* This may have eaten into a SACK block. */ if(tp->sack_ok && tp->num_sacks) tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); if (skb_queue_len(&tp->out_of_order_queue) == 0) tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | (0x10 << 16) | tp->snd_wnd); return; } /* An old packet, either a retransmit or some packet got lost. */ if (!after(skb->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an imediate ack. */ SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq); tcp_enter_quickack_mode(tp); kfree_skb(skb); return; } if (before(skb->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); goto queue_and_out; } /* Ok. This is an out_of_order segment, force an ack. */ tp->delayed_acks++; tcp_enter_quickack_mode(tp); /* Disable header predition. */ tp->pred_flags = 0; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. */ if(tp->sack_ok) { tp->num_sacks = 1; tp->selective_acks[0].start_seq = skb->seq; tp->selective_acks[0].end_seq = skb->end_seq; } skb_queue_head(&tp->out_of_order_queue,skb); } else { for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { /* Already there. 
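 * The out-of-order queue is kept sorted, so we walk it backwards from
 * the tail (highest sequence numbers first).  An exact sequence match
 * means the new segment duplicates a queued one; we keep whichever of
 * the two is longer.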
*/ if (skb->seq == skb1->seq) { if (skb->len >= skb1->len) { if(tp->sack_ok) tcp_sack_extend(tp, skb1, skb); skb_append(skb1, skb); skb_unlink(skb1); kfree_skb(skb1); } else { /* A duplicate, smaller than what is in the * out-of-order queue right now, toss it. */ kfree_skb(skb); } break; } if (after(skb->seq, skb1->seq)) { skb_append(skb1,skb); if(tp->sack_ok) tcp_sack_new_ofo_skb(sk, skb); break; } /* See if we've hit the start. If so insert. */ if (skb1 == skb_peek(&tp->out_of_order_queue)) { skb_queue_head(&tp->out_of_order_queue,skb); if(tp->sack_ok) tcp_sack_new_ofo_skb(sk, skb); break; } } } } /* * This routine handles the data. If there is room in the buffer, * it will be have already been moved into it. If there is no * room, then we will just have to discard the packet. */ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); th = skb->h.th; skb_pull(skb, th->doff*4); skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) return(0); /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); if (before(tp->rcv_nxt, tp->copied_seq)) { printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = tp->copied_seq; } /* Above, tcp_data_queue() increments delayed_acks appropriately. * Now tell the user we may have some data. */ if (!sk->dead) { SOCK_DEBUG(sk, "Data wakeup.\n"); sk->data_ready(sk,0); } return(1); } static void tcp_data_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; if ((skb = tp->send_head)) { if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && tp->packets_out < tp->snd_cwnd ) { /* Put more data onto the wire. */ tcp_write_xmit(sk); } else if (tp->packets_out == 0 && !tp->pending) { /* Start probing the receivers window. */ tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); } } } /* * Check if sending an ack is needed. */ static __inline__ void __tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* This also takes care of updating the window. * This if statement needs to be simplified. * * Rules for delaying an ack: * - delay time <= 0.5 HZ * - we don't have a window update to send * - must send at least every 2 full sized packets * - must send an ACK if we have any SACKs * * With an extra heuristic to handle loss of packet * situations and also helping the sender leave slow * start in an expediant manner. */ /* Two full frames received or... */ if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) || /* We will update the window "significantly" or... */ tcp_raise_window(sk) || /* We entered "quick ACK" mode or... */ tcp_in_quickack_mode(tp) || /* We have pending SACKs */ (tp->sack_ok && tp->num_sacks)) { /* Then ack it now */ tcp_send_ack(sk); } else { /* Else, send delayed ack. */ tcp_send_delayed_ack(tp, HZ/2); } } static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); if (tp->delayed_acks == 0) { /* We sent a data segment already. */ return; } __tcp_ack_snd_check(sk); } /* * This routine is only called when we have urgent data * signalled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. * For 1003.1g we should support a new option TCP_STDURG to permit * either form (or just set the sysctl tcp_stdurg). 
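 * (With tcp_stdurg clear we follow BSD and treat the urgent pointer as
 *  pointing one byte past the last urgent byte, hence the ptr-- below;
 *  with it set, the pointer itself marks the last urgent byte, which is
 *  the RFC 1122 reading.)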
*/ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 ptr = ntohs(th->urg_ptr); if (ptr && !sysctl_tcp_stdurg) ptr--; ptr += ntohl(th->seq); /* Ignore urgent data that we've already seen and read. */ if (after(tp->copied_seq, ptr)) return; /* Do we already have a newer (or duplicate) urgent pointer? */ if (tp->urg_data && !after(ptr, tp->urg_seq)) return; /* Tell the world about our new urgent pointer. */ if (sk->proc != 0) { if (sk->proc > 0) kill_proc(sk->proc, SIGURG, 1); else kill_pg(-sk->proc, SIGURG, 1); } /* We may be adding urgent data when the last byte read was * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives * or we break the sematics of SIOCATMARK (and thus sockatmark()) */ if (tp->urg_seq == tp->copied_seq) tp->copied_seq++; /* Move the copied sequence on correctly */ tp->urg_data = URG_NOTYET; tp->urg_seq = ptr; /* Disable header prediction. */ tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ if (tp->urg_data == URG_NOTYET) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } } } /* * Clean first the out_of_order queue, then the receive queue until * the socket is in its memory limits again. */ static void prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff * skb; SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); /* First Clean the out_of_order queue. */ /* Start with the end because there are probably the least * useful packets (crossing fingers). */ while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) { kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) return; } /* Now continue with the receive queue if it wasn't enough */ while ((skb = skb_peek_tail(&sk->receive_queue))) { /* Never toss anything when we've seen the FIN. * It's just too complex to recover from it. */ if(skb->h.th->fin) break; /* Never remove packets that have been already acked */ if (before(skb->end_seq, tp->last_ack_sent+1)) { printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n", tp->copied_seq, skb->end_seq, tp->last_ack_sent); break; } skb_unlink(skb); tp->rcv_nxt = skb->seq; SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n", skb->seq, skb->end_seq, tp->copied_seq); kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) break; } } int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, __u16 len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; u32 flg; /* * Header prediction. * The code follows the one in the famous * "30 instruction TCP receive" Van Jacobson mail. * * Van's trick is to deposit buffers into socket queue * on a device interrupt, to call tcp_recv function * on the receive process context and checksum and copy * the buffer to user space. smart... * * Our current scheme is not silly either but we take the * extra cost of the net_bh soft interrupt processing... 
* We do checksum and copy also but from device to kernel. */ /* * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(sk, th, tp)) { if (tp->saw_tstamp) { if (tcp_paws_discard(tp)) { if (!th->rst) { tcp_send_ack(sk); goto discard; } } tcp_replace_ts_recent(tp,skb->end_seq); } } flg = *(((u32 *)th) + 3); /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_predition is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 else it will be !0 * (when there are holes in the receive * space for instance) */ if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) { if (len <= th->doff*4) { /* Bulk data transfer: sender */ if (len == th->doff*4) { tcp_ack(sk, th, skb->seq, skb->ack_seq, len); kfree_skb(skb); tcp_data_snd_check(sk); return 0; } else { /* Header too small */ tcp_statistics.TcpInErrs++; goto discard; } } else if (skb->ack_seq == tp->snd_una) { /* Bulk data transfer: receiver */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) goto discard; skb_pull(skb,th->doff*4); /* DO NOT notify forward progress here. * It saves dozen of CPU instructions in fast path. --ANK */ skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; /* FIN bit check is not done since if FIN is set in * this frame, the pred_flags won't match up. -DaveM */ sk->data_ready(sk, 0); tcp_delack_estimator(tp); /* Tiny-grams with PSH set make us ACK quickly. */ if(th->psh && (skb->len < (sk->mss >> 1))) tp->ato = HZ/50; tp->delayed_acks++; __tcp_ack_snd_check(sk); return 0; } } if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { if (!th->rst) { if (after(skb->seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", skb->seq, skb->end_seq, tp->rcv_wup, tp->rcv_wnd); } tcp_send_ack(sk); goto discard; } } if(th->syn && skb->seq != tp->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); tcp_statistics.TcpInErrs++; tcp_reset(sk, skb); return 1; } if(th->rst) { tcp_reset(sk,skb); goto discard; } if(th->ack) tcp_ack(sk, th, skb->seq, skb->ack_seq, len); /* Process urgent data. */ tcp_urg(sk, th, len); /* step 7: process the segment text */ queued = tcp_data(skb, sk, len); tcp_data_snd_check(sk); /* If our receive queue has grown past its limits shrink it */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) prune_queue(sk); tcp_ack_snd_check(sk); if (!queued) { discard: kfree_skb(skb); } return 0; } /* Shared between IPv4 and IPv6 now. */ struct sock * tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* assumption: the socket is not in use. * as we checked the user count on tcp_rcv and we're * running from a soft interrupt. */ if (req->sk) { /* socket already created but not * yet accepted()... */ sk = req->sk; } else { u32 flg; /* Check for syn retransmission */ flg = *(((u32 *)skb->h.th) + 3); flg &= __constant_htonl(0x00170000); /* Only SYN set? */ if (flg == __constant_htonl(0x00020000)) { if (!after(skb->seq, req->rcv_isn)) { /* retransmited syn. */ req->class->rtx_syn_ack(sk, req); return NULL; } else { return sk; /* New SYN */ } } /* We know it's an ACK here */ /* In theory the packet could be for a cookie, but * TIME_WAIT should guard us against this. * XXX: Nevertheless check for cookies? * This sequence number check is done again later, * but we do it here to prevent syn flood attackers * from creating big SYN_RECV sockets. 
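 * (Essentially the ACK must acknowledge our SYN-ACK, i.e. be snt_isn+1,
 *  and the segment's sequence number must fall within the window we
 *  offered; anything else gets a reset straight away.)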
*/ if (!between(skb->ack_seq, req->snt_isn, req->snt_isn+1) || !between(skb->seq, req->rcv_isn, req->rcv_isn+1+req->rcv_wnd)) { req->class->send_reset(skb); return NULL; } sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); tcp_dec_slow_timer(TCP_SLT_SYNACK); if (sk == NULL) return NULL; req->expires = 0UL; req->sk = sk; } skb_orphan(skb); skb_set_owner_r(skb, sk); return sk; } /* * This function implements the receiving procedure of RFC 793. * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be * address independent. */ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, void *opt, __u16 len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */ switch (sk->state) { case TCP_LISTEN: /* These use the socket TOS.. * might want to be the received TOS */ if(th->ack) return 1; if(th->syn) { if(tp->af_specific->conn_request(sk, skb, opt, 0) < 0) return 1; /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the * syn up to the [to be] advertised window and * Solaris 2.1 gives you a protocol error. For now * we just ignore it, that fits the spec precisely * and avoids incompatibilities. It would be nice in * future to drop through and process the data. * * Now that TTCP is starting to be used we ought to * queue this data. * But, this leaves one open to an easy denial of * service attack, and SYN cookies can't defend * against this problem. So, we drop the data * in the interest of security over speed. */ goto discard; } goto discard; break; case TCP_SYN_SENT: /* SYN sent means we have to look for a suitable ack and * either reset for bad matches or go to connected. * The SYN_SENT case is unusual and should * not be in line code. [AC] */ if(th->ack) { tp->snd_wl1 = skb->seq; /* We got an ack, but it's not a good ack. */ if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) { tcp_statistics.TcpAttemptFails++; return 1; } if(th->rst) { tcp_reset(sk,skb); goto discard; } if(!th->syn) { /* A valid ack from a different connection * start. Shouldn't happen but cover it. */ tcp_statistics.TcpAttemptFails++; return 1; } /* Ok.. it's good. Set up sequence numbers and * move to established. */ tp->rcv_nxt = skb->seq+1; tp->rcv_wup = skb->seq+1; tp->snd_wnd = htons(th->window) << tp->snd_wscale; tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; tp->fin_seq = skb->seq; tcp_set_state(sk, TCP_ESTABLISHED); tcp_parse_options(sk, th, tp, 0); if (tp->wscale_ok == 0) { tp->snd_wscale = tp->rcv_wscale = 0; tp->window_clamp = min(tp->window_clamp,65535); } if (tp->tstamp_ok) { tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else tp->tcp_header_len = sizeof(struct tcphdr); if (tp->saw_tstamp) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = jiffies; } /* Can't be earlier, doff would be wrong. */ tcp_send_ack(sk); /* Check for the case where we tried to advertise * a window including timestamp options, but did not * end up using them for this connection. */ if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps) sk->mss += TCPOLEN_TSTAMP_ALIGNED; /* Now limit it if the other end negotiated a smaller * value. */ if (tp->in_mss) { int real_mss = tp->in_mss; /* We store MSS locally with the timestamp bytes * subtracted, TCP's advertise it with them * included. Account for this fact. 
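 * (TCPOLEN_TSTAMP_ALIGNED is 12 bytes, so with timestamps in use a peer
 *  that advertises an MSS of 1460 is worth 1460 - 12 = 1448 bytes of
 *  data per segment by our accounting.)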
*/ if(tp->tstamp_ok) real_mss -= TCPOLEN_TSTAMP_ALIGNED; sk->mss = min(sk->mss, real_mss); } sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; if(!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 0); } /* Drop through step 6 */ goto step6; } else { if(th->syn && !th->rst) { /* The previous version of the code * checked for "connecting to self" * here. that check is done now in * tcp_connect. */ tcp_set_state(sk, TCP_SYN_RECV); tcp_parse_options(sk, th, tp, 0); if (tp->saw_tstamp) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = jiffies; } tp->rcv_nxt = skb->seq + 1; tp->rcv_wup = skb->seq + 1; tp->snd_wnd = htons(th->window); tp->snd_wl1 = skb->seq; tcp_send_synack(sk); goto discard; } } break; } /* Parse the tcp_options present on this header. * By this point we really only expect timestamps. * Note that this really has to be here and not later for PAWS * (RFC1323) to work. */ if (tcp_fast_parse_options(sk, th, tp)) { /* NOTE: assumes saw_tstamp is never set if we didn't * negotiate the option. tcp_fast_parse_options() must * guarantee this. */ if (tp->saw_tstamp) { if (tcp_paws_discard(tp)) { if (!th->rst) { tcp_send_ack(sk); goto discard; } } tcp_replace_ts_recent(tp,skb->end_seq); } } /* step 1: check sequence number */ if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { if (!th->rst) { tcp_send_ack(sk); goto discard; } } /* step 2: check RST bit */ if(th->rst) { tcp_reset(sk,skb); goto discard; } /* step 3: check security and precedence [ignored] */ /* step 4: * * Check for a SYN, and ensure it matches the SYN we were * first sent. We have to handle the rather unusual (but valid) * sequence that KA9Q derived products may generate of * * SYN * SYN|ACK Data * ACK (lost) * SYN|ACK Data + More Data * .. we must ACK not RST... * * We keep syn_seq as the sequence space occupied by the * original syn. */ if (th->syn && skb->seq!=tp->syn_seq) { tcp_reset(sk, skb); return 1; } /* step 5: check the ACK field */ if (th->ack) { int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len); switch(sk->state) { case TCP_SYN_RECV: if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; if(!sk->dead) sk->state_change(sk); tp->snd_una = skb->ack_seq; tp->snd_wnd = htons(th->window) << tp->snd_wscale; tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; } else { SOCK_DEBUG(sk, "bad ack\n"); return 1; } break; case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); if (!sk->dead) sk->state_change(sk); else tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); } break; case TCP_CLOSING: if (tp->snd_una == tp->write_seq) tcp_time_wait(sk); break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { sk->shutdown = SHUTDOWN_MASK; tcp_set_state(sk,TCP_CLOSE); if (!sk->dead) sk->state_change(sk); goto discard; } break; } } else goto discard; step6: /* step 6: check the URG bit */ tcp_urg(sk, th, len); /* step 7: process the segment text */ switch (sk->state) { case TCP_CLOSE_WAIT: case TCP_CLOSING: if (!before(skb->seq, tp->fin_seq)) break; case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. 
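 * We only do so when the receive side has been shut down and the socket
 * is already dead, and only for segments that carry new data beyond
 * rcv_nxt.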
*/ if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { if (after(skb->end_seq - th->fin, tp->rcv_nxt)) { tcp_reset(sk, skb); return 1; } } case TCP_ESTABLISHED: queued = tcp_data(skb, sk, len); /* This can only happen when MTU+skbheader > rcvbuf */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) prune_queue(sk); break; } tcp_data_snd_check(sk); tcp_ack_snd_check(sk); if (!queued) { discard: kfree_skb(skb); } return 0; } int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, void *buffer, size_t *lenp) { int val = sysctl_tcp_cong_avoidance; int retv; static tcp_sys_cong_ctl_t tab[] = { tcp_cong_avoid_vanj, tcp_cong_avoid_vegas }; retv = proc_dointvec(ctl, write, filp, buffer, lenp); if (write) { if ((unsigned)sysctl_tcp_cong_avoidance > 1) { retv = -EINVAL; sysctl_tcp_cong_avoidance = val; } else { tcp_sys_cong_ctl_f = tab[sysctl_tcp_cong_avoidance]; } } return retv; }