author     Ralf Baechle <ralf@linux-mips.org>   1998-05-07 02:55:41 +0000
committer  Ralf Baechle <ralf@linux-mips.org>   1998-05-07 02:55:41 +0000
commit     dcec8a13bf565e47942a1751a9cec21bec5648fe (patch)
tree       548b69625b18cc2e88c3e68d0923be546c9ebb03 /net/ipv4/tcp_output.c
parent     2e0f55e79c49509b7ff70ff1a10e1e9e90a3dfd4 (diff)
o Merge with Linux 2.1.99.
o Fix ancient bug in the ELF loader making ldd crash.
o Fix ancient bug in the keyboard code for SGI, SNI and Jazz.
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  230
1 file changed, 143 insertions, 87 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 465ee3fdc..482ca262c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
  *
  * Implementation of the Transmission Control Protocol(TCP).
  *
- * Version: $Id: tcp_output.c,v 1.76 1998/03/22 22:10:24 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.87 1998/04/26 01:11:35 davem Exp $
  *
  * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
  *          Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -37,6 +37,10 @@
 extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
+extern int sysctl_tcp_sack;
+
+/* People can turn this off for buggy TCP's found in printers etc. */
+int sysctl_tcp_retrans_collapse = 1;
 
 /* Get rid of any delayed acks, we sent one already.. */
 static __inline__ void clear_delayed_acks(struct sock * sk)
@@ -99,25 +103,29 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		/* Build TCP header and checksum it. */
 		th->source = sk->sport;
 		th->dest = sk->dport;
-		th->seq = htonl(skb->seq);
+		th->seq = htonl(TCP_SKB_CB(skb)->seq);
 		th->ack_seq = htonl(tp->rcv_nxt);
 		th->doff = (tcp_header_size >> 2);
 		th->res1 = 0;
 		*(((__u8 *)th) + 13) = tcb->flags;
-		th->window = htons(tcp_select_window(sk));
+		if(!(tcb->flags & TCPCB_FLAG_SYN))
+			th->window = htons(tcp_select_window(sk));
 		th->check = 0;
 		th->urg_ptr = ntohs(tcb->urg_ptr);
 		if(tcb->flags & TCPCB_FLAG_SYN) {
+			/* RFC1323: The window in SYN & SYN/ACK segments
+			 * is never scaled.
+			 */
 			th->window = htons(tp->rcv_wnd);
 			tcp_syn_build_options((__u32 *)(th + 1), sk->mss,
 					      sysctl_tcp_timestamps,
 					      sysctl_tcp_sack,
 					      sysctl_tcp_window_scaling,
 					      tp->rcv_wscale,
-					      skb->when);
+					      TCP_SKB_CB(skb)->when);
 		} else {
 			tcp_build_and_update_options((__u32 *)(th + 1),
-						     tp, skb->when);
+						     tp, TCP_SKB_CB(skb)->when);
 		}
 		tp->af_specific->send_check(sk, th, skb->len, skb);
@@ -136,13 +144,13 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
 	/* Advance write_seq and place onto the write_queue. */
-	tp->write_seq += (skb->end_seq - skb->seq);
-	skb_queue_tail(&sk->write_queue, skb);
+	tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
+	__skb_queue_tail(&sk->write_queue, skb);
 
 	if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
 		/* Send it out now. */
-		skb->when = jiffies;
-		tp->snd_nxt = skb->end_seq;
+		TCP_SKB_CB(skb)->when = jiffies;
+		tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 		tp->packets_out++;
 		tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
 		if(!tcp_timer_is_set(sk, TIME_RETRANS))
@@ -171,9 +179,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 
 	/* Get a new skb... force flag on. */
 	buff = sock_wmalloc(sk,
-			    (nsize +
-			     MAX_HEADER +
-			     sk->prot->max_header + 15),
+			    (nsize + MAX_HEADER + sk->prot->max_header),
 			    1, GFP_ATOMIC);
 	if (buff == NULL)
 		return -1; /* We'll just try again later. */
@@ -182,8 +188,8 @@
 	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
 
 	/* Correct the sequence numbers. */
-	buff->seq = skb->seq + len;
-	buff->end_seq = skb->end_seq;
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
 
 	/* PSH and FIN should only be set in the second packet. */
 	flags = TCP_SKB_CB(skb)->flags;
@@ -209,14 +215,14 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	buff->csum = csum_partial_copy(skb->data + len,
 				       skb_put(buff, nsize),
 				       nsize, 0);
-	skb->end_seq -= nsize;
+	TCP_SKB_CB(skb)->end_seq -= nsize;
 	skb_trim(skb, skb->len - nsize);
 
 	/* Rechecksum original buffer. */
 	skb->csum = csum_partial(skb->data, skb->len, 0);
 
 	/* Link BUFF into the send queue. */
-	skb_append(skb, buff);
+	__skb_append(skb, buff);
 
 	return 0;
 }
@@ -228,18 +234,14 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 void tcp_write_xmit(struct sock *sk)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	int mss_now = sk->mss;
+	unsigned int mss_now;
 
 	/* Account for SACKS, we may need to fragment due to this.
 	 * It is just like the real MSS changing on us midstream.
 	 * We also handle things correctly when the user adds some
 	 * IP options mid-stream. Silly to do, but cover it.
 	 */
-	if(tp->sack_ok && tp->num_sacks)
-		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
-			    (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
-	if(sk->opt && sk->opt->optlen)
-		mss_now -= sk->opt->optlen;
+	mss_now = tcp_current_mss(sk);
 
 	/* If we are zapped, the bytes will have to remain here.
 	 * In time closedown will empty the write queue and all
@@ -264,8 +266,8 @@ void tcp_write_xmit(struct sock *sk)
 			/* Advance the send_head. This one is going out. */
 			update_send_head(sk);
-			skb->when = jiffies;
-			tp->snd_nxt = skb->end_seq;
+			TCP_SKB_CB(skb)->when = jiffies;
+			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 			tp->packets_out++;
 			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 			sent_pkts = 1;
@@ -397,7 +399,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 		return;
 
 	/* Ok. We will be able to collapse the packet. */
-	skb_unlink(next_skb);
+	__skb_unlink(next_skb, next_skb->list);
 
 	if(skb->len % 4) {
 		/* Must copy and rechecksum all data. */
@@ -413,7 +415,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 	}
 
 	/* Update sequence range on original skb. */
-	skb->end_seq += next_skb->end_seq - next_skb->seq;
+	TCP_SKB_CB(skb)->end_seq +=
+		TCP_SKB_CB(next_skb)->end_seq - TCP_SKB_CB(next_skb)->seq;
 
 	/* Merge over control information. */
 	flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
@@ -432,22 +435,28 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 }
 
 /* Do a simple retransmit without using the backoff mechanisms in
- * tcp_timer. This is used to speed up path mtu recovery. Note that
- * these simple retransmits aren't counted in the usual tcp retransmit
- * backoff counters.
+ * tcp_timer. This is used for path mtu discovery.
  * The socket is already locked here.
  */
 void tcp_simple_retransmit(struct sock *sk)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	struct sk_buff *skb;
+	unsigned int mss = tcp_current_mss(sk);
 
 	/* Don't muck with the congestion window here. */
 	tp->dup_acks = 0;
 	tp->high_seq = tp->snd_nxt;
-
-	/* FIXME: make the current rtt sample invalid */
 	tp->retrans_head = NULL;
-	tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+
+	/* Input control flow will see that this was retransmitted
+	 * and not use it for RTT calculation in the absence of
+	 * the timestamp option.
+	 */
+	for (skb = skb_peek(&sk->write_queue); skb != tp->send_head;
+	     skb = skb->next)
+		if (skb->len > mss)
+			tcp_retransmit_skb(sk, skb);
 }
 
 static __inline__ void update_retrans_head(struct sock *sk)
@@ -467,17 +476,10 @@
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	int current_mss = sk->mss;
-
-	/* Account for outgoing SACKS and IP options, if any. */
-	if(tp->sack_ok && tp->num_sacks)
-		current_mss -= (TCPOLEN_SACK_BASE_ALIGNED +
-				(tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
-	if(sk->opt && sk->opt->optlen)
-		current_mss -= sk->opt->optlen;
+	unsigned int cur_mss = tcp_current_mss(sk);
 
-	if(skb->len > current_mss) {
-		if(tcp_fragment(sk, skb, current_mss))
+	if(skb->len > cur_mss) {
+		if(tcp_fragment(sk, skb, cur_mss))
 			return 1; /* We'll try again later. */
 
 		/* New SKB created, account for it. */
@@ -486,21 +488,23 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	/* Collapse two adjacent packets if worthwhile and we can. */
 	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
-	   (skb->len < (current_mss >> 1)) &&
+	   (skb->len < (cur_mss >> 1)) &&
 	   (skb->next != tp->send_head) &&
-	   (skb->next != (struct sk_buff *)&sk->write_queue))
-		tcp_retrans_try_collapse(sk, skb, current_mss);
+	   (skb->next != (struct sk_buff *)&sk->write_queue) &&
+	   (sysctl_tcp_retrans_collapse != 0))
+		tcp_retrans_try_collapse(sk, skb, cur_mss);
 
 	if(tp->af_specific->rebuild_header(sk))
 		return 1; /* Routing failure or similar. */
 
 	/* Ok, we're gonna send it out, update state. */
 	TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
+	tp->retrans_out++;
 
 	/* Make a copy, if the first transmission SKB clone we made
 	 * is still in somebodies hands, else make a clone.
 	 */
-	skb->when = jiffies;
+	TCP_SKB_CB(skb)->when = jiffies;
 	if(skb_cloned(skb))
 		skb = skb_copy(skb, GFP_ATOMIC);
 	else
@@ -518,12 +522,14 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  * retransmitted data is acknowledged. It tries to continue
  * resending the rest of the retransmit queue, until either
  * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessarily retransmissions.
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	struct sk_buff *skb;
-	int ct = 0;
 
 	if (tp->retrans_head == NULL)
 		tp->retrans_head = skb_peek(&sk->write_queue);
@@ -539,19 +545,48 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 			if(tcp_retransmit_skb(sk, skb))
 				break;
 
-			/* Count retransmissions locally. */
-			ct++;
-
 			/* Stop retransmitting if we've hit the congestion
 			 * window limit.
 			 */
-			if (ct >= tp->snd_cwnd)
+			if (tp->retrans_out >= (tp->snd_cwnd >> TCP_CWND_SHIFT))
 				break;
 		}
 		update_retrans_head(sk);
 	}
 }
 
+/* Using FACK information, retransmit all missing frames at the receiver
+ * up to the forward most SACK'd packet (tp->fackets_out) if the packet
+ * has not been retransmitted already.
+ */
+void tcp_fack_retransmit(struct sock *sk)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	struct sk_buff *skb = skb_peek(&sk->write_queue);
+	int packet_cnt = 0;
+
+	while((skb != NULL) &&
+	      (skb != tp->send_head) &&
+	      (skb != (struct sk_buff *)&sk->write_queue)) {
+		__u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+		if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
+			goto next_packet;
+
+		/* Ok, retransmit it. */
+		if(tcp_retransmit_skb(sk, skb))
+			break;
+
+		if(tcp_packets_in_flight(tp) >= (tp->snd_cwnd >> TCP_CWND_SHIFT))
+			break;
+next_packet:
+		packet_cnt++;
+		if(packet_cnt >= tp->fackets_out)
+			break;
+		skb = skb->next;
+	}
+}
+
 /* Send a fin. The caller locks the socket for us. This cannot be
  * allowed to fail queueing a FIN frame under any circumstances.
  */
@@ -559,22 +594,37 @@
 void tcp_send_fin(struct sock *sk)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
-	int mss_now = sk->mss;
+	unsigned int mss_now;
 
 	/* Optimization, tack on the FIN if we have a queue of
 	 * unsent frames. But be careful about outgoing SACKS
 	 * and IP options.
 	 */
-	if(tp->sack_ok && tp->num_sacks)
-		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
-			    (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
-	if(sk->opt && sk->opt->optlen)
-		mss_now -= sk->opt->optlen;
+	mss_now = tcp_current_mss(sk);
+
 	if((tp->send_head != NULL) && (skb->len < mss_now)) {
 		/* tcp_write_xmit() takes care of the rest. */
 		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
-		skb->end_seq++;
+		TCP_SKB_CB(skb)->end_seq++;
 		tp->write_seq++;
+
+		/* Special case to avoid Nagle bogosity. If this
+		 * segment is the last segment, and it was queued
+		 * due to Nagle/SWS-avoidance, send it out now.
+		 */
+		if(tp->send_head == skb &&
+		   !sk->nonagle &&
+		   skb->len < (sk->mss >> 1) &&
+		   tp->packets_out &&
+		   !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
+			update_send_head(sk);
+			TCP_SKB_CB(skb)->when = jiffies;
+			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+			tp->packets_out++;
+			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+			if(!tcp_timer_is_set(sk, TIME_RETRANS))
+				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+		}
 	} else {
 		/* Socket is locked, keep trying until memory is available. */
 		do {
@@ -592,8 +642,8 @@ void tcp_send_fin(struct sock *sk)
 		TCP_SKB_CB(skb)->urg_ptr = 0;
 
 		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
-		skb->seq = tp->write_seq;
-		skb->end_seq = skb->seq + 1;
+		TCP_SKB_CB(skb)->seq = tp->write_seq;
+		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
 		tcp_send_skb(sk, skb, 0);
 	}
 }
@@ -621,9 +671,9 @@ void tcp_send_active_reset(struct sock *sk)
 	TCP_SKB_CB(skb)->urg_ptr = 0;
 
 	/* Send it off. */
-	skb->seq = tp->write_seq;
-	skb->end_seq = skb->seq;
-	skb->when = jiffies;
+	TCP_SKB_CB(skb)->seq = tp->write_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(skb)->when = jiffies;
 	tcp_transmit_skb(sk, skb);
 }
@@ -650,15 +700,18 @@ int tcp_send_synack(struct sock *sk)
 	TCP_SKB_CB(skb)->urg_ptr = 0;
 
 	/* SYN eats a sequence byte. */
-	skb->seq = tp->snd_una;
-	skb->end_seq = skb->seq + 1;
-	skb_queue_tail(&sk->write_queue, skb);
-	skb->when = jiffies;
+	TCP_SKB_CB(skb)->seq = tp->snd_una;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+	__skb_queue_tail(&sk->write_queue, skb);
+	TCP_SKB_CB(skb)->when = jiffies;
 	tp->packets_out++;
 	tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 	return 0;
 }
 
+/*
+ * Prepare a SYN-ACK.
+ */
 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				 struct open_request *req, int mss)
 {
@@ -705,9 +758,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	th->ack = 1;
 	th->source = sk->sport;
 	th->dest = req->rmt_port;
-	skb->seq = req->snt_isn;
-	skb->end_seq = skb->seq + 1;
-	th->seq = htonl(skb->seq);
+	TCP_SKB_CB(skb)->seq = req->snt_isn;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+	th->seq = htonl(TCP_SKB_CB(skb)->seq);
 	th->ack_seq = htonl(req->rcv_isn + 1);
 	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
 		__u8 rcv_wscale;
@@ -720,16 +773,18 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 					  &rcv_wscale);
 		req->rcv_wscale = rcv_wscale;
 	}
+
+	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(req->rcv_wnd);
 
-	skb->when = jiffies;
+	TCP_SKB_CB(skb)->when = jiffies;
 	tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
 			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
-			      skb->when);
+			      TCP_SKB_CB(skb)->when);
 
 	skb->csum = 0;
 	th->doff = (tcp_header_size >> 2);
-	tcp_statistics.TcpOutSegs++;
+	tcp_statistics.TcpOutSegs++;
 	return skb;
 }
@@ -774,9 +829,9 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
 	TCP_SKB_CB(buff)->sacked = 0;
 	TCP_SKB_CB(buff)->urg_ptr = 0;
 	buff->csum = 0;
-	buff->seq = tp->write_seq++;
-	buff->end_seq = tp->write_seq;
-	tp->snd_nxt = buff->end_seq;
+	TCP_SKB_CB(buff)->seq = tp->write_seq++;
+	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+	tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;
 	tp->window_clamp = dst->window;
 	tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
@@ -784,7 +839,6 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
 				  &tp->window_clamp,
 				  sysctl_tcp_window_scaling,
 				  &tp->rcv_wscale);
-
 	/* Ok, now lock the socket before we make it visible to
 	 * the incoming packet engine.
 	 */
@@ -800,10 +854,12 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
 	tp->rto = dst->rtt;
 	tcp_init_xmit_timers(sk);
 	tp->retransmits = 0;
+	tp->fackets_out = 0;
+	tp->retrans_out = 0;
 
 	/* Send it off. */
-	skb_queue_tail(&sk->write_queue, buff);
-	buff->when = jiffies;
+	__skb_queue_tail(&sk->write_queue, buff);
+	TCP_SKB_CB(buff)->when = jiffies;
 	tp->packets_out++;
 	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
 	tcp_statistics.TcpActiveOpens++;
@@ -870,8 +926,8 @@ void tcp_send_ack(struct sock *sk)
 		TCP_SKB_CB(buff)->urg_ptr = 0;
 
 		/* Send it off, this clears delayed acks for us. */
-		buff->seq = buff->end_seq = tp->snd_nxt;
-		buff->when = jiffies;
+		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
+		TCP_SKB_CB(buff)->when = jiffies;
 		tcp_transmit_skb(sk, buff);
 	}
 }
@@ -904,13 +960,13 @@ void tcp_write_wakeup(struct sock *sk)
 			 * must have been a result SWS avoidance ( sender )
 			 */
 			win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
-			if (win_size < skb->end_seq - skb->seq) {
+			if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
 				if (tcp_fragment(sk, skb, win_size))
 					return; /* Let a retransmit get it. */
 			}
 			update_send_head(sk);
-			skb->when = jiffies;
-			tp->snd_nxt = skb->end_seq;
+			TCP_SKB_CB(skb)->when = jiffies;
+			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 			tp->packets_out++;
 			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 			if (!tcp_timer_is_set(sk, TIME_RETRANS))
@@ -933,9 +989,9 @@ void tcp_write_wakeup(struct sock *sk)
 			 * end to send an ack. Don't queue or clone SKB, just
 			 * send it.
 			 */
-			skb->seq = tp->snd_nxt - 1;
-			skb->end_seq = skb->seq;
-			skb->when = jiffies;
+			TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
+			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+			TCP_SKB_CB(skb)->when = jiffies;
 			tcp_transmit_skb(sk, skb);
 		}
 	}
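
A note on the recurring cleanup in this diff: three copies of the same inline MSS bookkeeping (subtracting SACK option space and IP option length from sk->mss, in tcp_write_xmit(), tcp_retransmit_skb() and tcp_send_fin()) are folded into a single tcp_current_mss() helper. The helper's definition lands outside this file and is not part of this diff; the sketch below is only a plausible reconstruction, assuming it mirrors the inline code it replaces.

/* Sketch only -- tcp_current_mss() is not defined in this diff.
 * Assumed to mirror the inline computation deleted above; the real
 * helper may differ.
 */
static __inline__ unsigned int tcp_current_mss(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int mss_now = sk->mss;

	/* Queued SACK blocks eat TCP option space, shrinking the
	 * payload each segment can carry.
	 */
	if (tp->sack_ok && tp->num_sacks)
		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
			    (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));

	/* IP options consume payload space as well. */
	if (sk->opt && sk->opt->optlen)
		mss_now -= sk->opt->optlen;

	return mss_now;
}

Relatedly, the new sysctl_tcp_retrans_collapse flag gates tcp_retrans_try_collapse() and defaults to 1; assuming it is wired into the usual /proc/sys/net/ipv4 tables in a companion change (not shown here), it can be cleared at runtime for peers with buggy TCP stacks, per the comment in the diff.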