author    Ralf Baechle <ralf@linux-mips.org>  2000-08-28 22:00:09 +0000
committer Ralf Baechle <ralf@linux-mips.org>  2000-08-28 22:00:09 +0000
commit    1a1d77dd589de5a567fa95e36aa6999c704ceca4 (patch)
tree      141e31f89f18b9fe0831f31852e0435ceaccafc5 /net/ipv4/tcp_output.c
parent    fb9c690a18b3d66925a65b17441c37fa14d4370b (diff)
Merge with 2.4.0-test7.
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c | 657
1 file changed, 379 insertions(+), 278 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0fdb6b3f8..8dca4474b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.124 2000/04/08 07:21:24 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.127 2000/08/15 20:15:23 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -32,6 +32,7 @@
* David S. Miller : Output engine completely rewritten.
* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
* Cacophonix Gaul : draft-minshall-nagle-01
+ * J Hadi Salim : ECN support
*
*/
@@ -42,13 +43,29 @@
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
-static __inline__ void update_send_head(struct sock *sk)
+static __inline__
+void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
- tp->send_head = tp->send_head->next;
+ tp->send_head = skb->next;
if (tp->send_head == (struct sk_buff *) &sk->write_queue)
tp->send_head = NULL;
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (tp->packets_out++ == 0)
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+}
+
+/* SND.NXT, if window was not shrunk.
+ * If window has been shrunk, what should we send? It is not clear at all.
+ * Using SND.UNA we will fail to open the window, SND.NXT is out of window. :-(
+ * Anything in between SND.UNA...SND.UNA+SND.WND can already be
+ * invalid. OK, let's do this for now:
+ */
+static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
+{
+ if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
+ return tp->snd_nxt;
+ else
+ return tp->snd_una+tp->snd_wnd;
}
/* Calculate mss to advertise in SYN segment.
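
A note on the helper above: tcp_acceptable_seq() relies on the kernel's wrapping 32-bit sequence compare. Here is a minimal userspace sketch of the same decision, not kernel code; before() matches the definition in the kernel's tcp.h, and the values in main() are made up:

    /* Illustrative sketch, not kernel code. */
    #include <stdint.h>
    #include <stdio.h>

    static int before(uint32_t seq1, uint32_t seq2)
    {
        return (int32_t)(seq1 - seq2) < 0;
    }

    static uint32_t acceptable_seq(uint32_t snd_una, uint32_t snd_wnd,
                                   uint32_t snd_nxt)
    {
        /* Use SND.NXT unless the peer shrank the window past it. */
        if (!before(snd_una + snd_wnd, snd_nxt))
            return snd_nxt;
        return snd_una + snd_wnd;
    }

    int main(void)
    {
        /* Window intact: the right edge is at or after SND.NXT. */
        printf("%u\n", acceptable_seq(1000, 500, 1200)); /* 1200 */
        /* Window shrunk below SND.NXT: fall back to the right edge. */
        printf("%u\n", acceptable_seq(1000, 100, 1200)); /* 1100 */
        return 0;
    }
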
@@ -79,15 +96,38 @@ static __u16 tcp_advertise_mss(struct sock *sk)
return (__u16)mss;
}
+/* RFC2861. Reset CWND after an idle period longer than RTO to "restart window".
+ * This is the first part of the cwnd validation mechanism. */
+static void tcp_cwnd_restart(struct tcp_opt *tp)
+{
+ s32 delta = tcp_time_stamp - tp->lsndtime;
+ u32 restart_cwnd = tcp_init_cwnd(tp);
+ u32 cwnd = tp->snd_cwnd;
+
+ tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ restart_cwnd = min(restart_cwnd, cwnd);
+
+ while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+ cwnd >>= 1;
+ tp->snd_cwnd = max(cwnd, restart_cwnd);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_used = 0;
+}
+
static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
{
- /* If we had a reply for ato after last received
+ u32 now = tcp_time_stamp;
+
+ if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
+ tcp_cwnd_restart(tp);
+
+ tp->lsndtime = now;
+
+	/* If this is a reply sent within ato of the last received
* packet, enter pingpong mode.
*/
- if ((u32)(tp->lsndtime - tp->ack.lrcvtime) < tp->ack.ato)
+ if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
tp->ack.pingpong = 1;
-
- tp->lsndtime = tcp_time_stamp;
}
static __inline__ void tcp_event_ack_sent(struct sock *sk)
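
tcp_cwnd_restart() above implements the RFC 2861 decay: one halving of cwnd per RTO of idle time, with the restart (initial) window as the floor. The arithmetic in isolation, as a hedged sketch with made-up numbers:

    /* Illustrative sketch of the RFC 2861 restart decay. */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t cwnd_after_idle(uint32_t cwnd, uint32_t restart_cwnd,
                                    int32_t idle, int32_t rto)
    {
        if (restart_cwnd > cwnd)
            restart_cwnd = cwnd;
        while ((idle -= rto) > 0 && cwnd > restart_cwnd)
            cwnd >>= 1;                 /* halve once per idle RTO */
        return cwnd > restart_cwnd ? cwnd : restart_cwnd;
    }

    int main(void)
    {
        /* cwnd 32, restart window 2, idle ~3.5 RTOs of 200 ticks. */
        printf("%u\n", cwnd_after_idle(32, 2, 700, 200)); /* 4 */
        return 0;
    }
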
@@ -95,11 +135,56 @@ static __inline__ void tcp_event_ack_sent(struct sock *sk)
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
tcp_dec_quickack_mode(tp);
- tp->ack.pending = 0;
- tp->ack.rcv_segs = 0;
tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
+/* Choose a new window to advertise, update state in tcp_opt for the
+ * socket, and return result with RFC1323 scaling applied. The return
+ * value can be stuffed directly into th->window for an outgoing
+ * frame.
+ */
+static __inline__ u16 tcp_select_window(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ u32 cur_win = tcp_receive_window(tp);
+ u32 new_win = __tcp_select_window(sk);
+
+ /* Never shrink the offered window */
+ if(new_win < cur_win) {
+ /* Danger Will Robinson!
+ * Don't update rcv_wup/rcv_wnd here or else
+ * we will not be able to advertise a zero
+ * window in time. --DaveM
+ *
+ * Relax Will Robinson.
+ */
+ new_win = cur_win;
+ }
+ tp->rcv_wnd = new_win;
+ tp->rcv_wup = tp->rcv_nxt;
+
+ /* RFC1323 scaling applied */
+ new_win >>= tp->rcv_wscale;
+
+#ifdef TCP_FORMAL_WINDOW
+ if (new_win == 0) {
+ /* If we advertise zero window, disable fast path. */
+ tp->pred_flags = 0;
+ } else if (cur_win == 0 && tp->pred_flags == 0 &&
+ skb_queue_len(&tp->out_of_order_queue) == 0 &&
+ !tp->urg_data) {
+		/* If we open a zero window, enable the fast path.
+		   Without this it will be opened by the first data packet;
+		   by then it is too late to merge checksumming into the copy.
+		 */
+ tcp_fast_path_on(tp);
+ }
+#endif
+
+ return new_win;
+}
+
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
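
Since tcp_select_window() returns the value with RFC 1323 scaling already applied, it can be stored straight into the 16-bit th->window field. A tiny sketch of the scaling round trip (the wscale and window values here are arbitrary):

    /* Illustrative sketch of RFC 1323 window scaling. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t rcv_wnd = 262144;  /* 256 KB offered window       */
        int wscale = 3;             /* shift negotiated in the SYN */

        uint16_t th_window = rcv_wnd >> wscale;         /* on wire */
        uint32_t seen_by_peer = (uint32_t)th_window << wscale;

        printf("wire: %u, reconstructed: %u\n", th_window, seen_by_peer);
        return 0;
    }
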
@@ -141,12 +226,12 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
}
- } else if (tp->num_sacks) {
+ } else if (tp->eff_sacks) {
/* A SACK is 2 pad bytes, a 2 byte header, plus
* 2 32-bit sequence numbers for each SACK block.
*/
tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
+ (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
}
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
@@ -155,7 +240,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
/* Build TCP header and checksum it. */
th->source = sk->sport;
th->dest = sk->dport;
- th->seq = htonl(TCP_SKB_CB(skb)->seq);
+ th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
if (tcb->flags & TCPCB_FLAG_SYN) {
@@ -176,11 +261,13 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
(sysctl_flags & SYSCTL_FLAG_SACK),
(sysctl_flags & SYSCTL_FLAG_WSCALE),
tp->rcv_wscale,
- TCP_SKB_CB(skb)->when,
+ tcb->when,
tp->ts_recent);
} else {
tcp_build_and_update_options((__u32 *)(th + 1),
- tp, TCP_SKB_CB(skb)->when);
+ tp, tcb->when);
+
+ TCP_ECN_send(sk, tp, skb, tcp_header_size);
}
tp->af_specific->send_check(sk, th, skb->len, skb);
@@ -196,7 +283,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
if (err <= 0)
return err;
- tcp_enter_cong_avoid(tp);
+ tcp_enter_cwr(tp);
/* NET_XMIT_CN is special. It does not guarantee
* that this packet is lost. It tells us that the device
@@ -212,6 +299,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
#undef SYSCTL_FLAG_SACK
}
+
/* This is the main buffer sending routine. We queue the buffer
* and decide whether to queue or transmit now.
*
@@ -225,15 +313,15 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigne
/* Advance write_seq and place onto the write_queue. */
tp->write_seq = TCP_SKB_CB(skb)->end_seq;
__skb_queue_tail(&sk->write_queue, skb);
+ tcp_charge_skb(sk, skb);
if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) {
/* Send it out now. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) {
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tcp_minshall_update(tp, cur_mss, skb->len);
- tp->packets_out++;
- if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_minshall_update(tp, cur_mss, skb);
+ if (tp->packets_out++ == 0)
tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
return;
}
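
The "if (tp->packets_out++ == 0)" idiom above (also used in update_send_head) replaces the old tcp_timer_is_set() test: the retransmit timer is armed exactly on the empty-to-non-empty transition of the pipe. A toy model, with invented struct and field names:

    /* Illustrative sketch of the "arm on first packet" idiom. */
    #include <stdio.h>

    struct conn { unsigned int packets_out; int timer_armed; };

    static void on_packet_sent(struct conn *c)
    {
        /* Post-increment: true only on the 0 -> 1 transition. */
        if (c->packets_out++ == 0)
            c->timer_armed = 1;
    }

    int main(void)
    {
        struct conn c = { 0, 0 };
        on_packet_sent(&c);
        on_packet_sent(&c);
        printf("packets_out=%u armed=%d\n", c.packets_out, c.timer_armed);
        return 0;
    }
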
@@ -250,16 +338,16 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigne
*/
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct sk_buff *buff;
int nsize = skb->len - len;
u16 flags;
/* Get a new skb... force flag on. */
- buff = sock_wmalloc(sk,
- (nsize + MAX_TCP_HEADER + 15),
- 1, GFP_ATOMIC);
+ buff = tcp_alloc_skb(sk, nsize + MAX_TCP_HEADER + 15, GFP_ATOMIC);
if (buff == NULL)
return -ENOMEM; /* We'll just try again later. */
+ tcp_charge_skb(sk, buff);
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
@@ -286,7 +374,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
if(!(flags & TCPCB_FLAG_URG))
TCP_SKB_CB(buff)->urg_ptr = 0;
TCP_SKB_CB(buff)->flags = flags;
- TCP_SKB_CB(buff)->sacked = 0;
+ TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS);
+ if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
+ tp->lost_out++;
+ tp->left_out++;
+ }
/* Copy and checksum data tail into the new buffer. */
buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
@@ -301,11 +393,6 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
/* Looks stupid, but our code really uses the when field of
* skbs which it has never sent before. --ANK
- *
- * NOTE: several days after I added this, Dave repaired
- * tcp_simple_retransmit() and it should not use ->when
- * of never sent skbs more. I am not sure, so that
- * this line remains until more careful investigation. --ANK
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
@@ -401,13 +488,6 @@ int tcp_write_xmit(struct sock *sk)
*/
mss_now = tcp_current_mss(sk);
- /* Anything on the transmit queue that fits the window can
- * be added providing we are:
- *
- * a) following SWS avoidance [and Nagle algorithm]
- * b) not exceeding our congestion window.
- * c) not retransmitting [Nagle]
- */
while((skb = tp->send_head) &&
tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) {
if (skb->len > mss_now) {
@@ -419,19 +499,13 @@ int tcp_write_xmit(struct sock *sk)
if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
break;
/* Advance the send_head. This one is sent out. */
- update_send_head(sk);
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tcp_minshall_update(tp, mss_now, skb->len);
- tp->packets_out++;
+ update_send_head(sk, tp, skb);
+ tcp_minshall_update(tp, mss_now, skb);
sent_pkts = 1;
}
- /* If we sent anything, make sure the retransmit
- * timer is active.
- */
if (sent_pkts) {
- if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ tcp_cwnd_validate(sk, tp);
return 0;
}
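
The loop above is gated by tcp_snd_test(). A much-simplified model of the two hard limits it enforces, congestion window and receiver window (the real test also applies the Nagle/minshall rules via the last-segment flag):

    /* Simplified sketch; not the kernel's tcp_snd_test(). */
    #include <stdint.h>
    #include <stdio.h>

    static int after(uint32_t a, uint32_t b) { return (int32_t)(b - a) < 0; }

    static int may_send(uint32_t in_flight, uint32_t cwnd, uint32_t end_seq,
                        uint32_t snd_una, uint32_t snd_wnd)
    {
        if (in_flight >= cwnd)                  /* congestion window full */
            return 0;
        if (after(end_seq, snd_una + snd_wnd))  /* beyond receiver window */
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d\n", may_send(3, 10, 1500, 1000, 600)); /* 1: fits   */
        printf("%d\n", may_send(3, 10, 1700, 1000, 600)); /* 0: no wnd */
        return 0;
    }
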
@@ -506,28 +580,22 @@ u32 __tcp_select_window(struct sock *sk)
/* Sometimes free_space can be < 0. */
free_space = tcp_space(sk);
- if (free_space > ((int) tp->window_clamp))
- free_space = tp->window_clamp;
if (tp->window_clamp < mss)
mss = tp->window_clamp;
- if (free_space < min((int)tp->window_clamp, tcp_full_space(sk)) / 2) {
- /* THIS IS _VERY_ GOOD PLACE to play window clamp.
- * if free_space becomes suspiciously low
- * verify ratio rmem_alloc/(rcv_nxt - copied_seq),
- * and if we predict that when free_space will be lower mss,
- * rmem_alloc will run out of rcvbuf*2, shrink window_clamp.
- * It will eliminate most of prune events! Very simple,
- * it is the next thing to do. --ANK
- *
- * Provided we found a way to raise it back... --ANK
- */
+ if (free_space < (int)min(tp->window_clamp, tcp_full_space(sk)) / 2) {
tp->ack.quick = 0;
+ if (tcp_memory_pressure)
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);
+
if (free_space < ((int) (mss/2)))
return 0;
}
+ if (free_space > tp->rcv_ssthresh)
+ free_space = tp->rcv_ssthresh;
+
/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
* If our current window offering is within 1 mss of the
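
__tcp_select_window() now also clamps the offer by the new rcv_ssthresh before rounding down to a multiple of the MSS. A compressed sketch of that shape (it omits window_clamp, memory pressure, and the "within 1 mss" hysteresis the real function applies):

    /* Illustrative sketch of the receive-window choice. */
    #include <stdio.h>

    static unsigned int select_window(int free_space,
                                      unsigned int rcv_ssthresh,
                                      unsigned int mss)
    {
        if (free_space < (int)(mss / 2))   /* too little: offer zero */
            return 0;
        if (free_space > (int)rcv_ssthresh)
            free_space = rcv_ssthresh;
        return (free_space / mss) * mss;   /* nice multiple of mss */
    }

    int main(void)
    {
        printf("%u\n", select_window(100000, 65535, 1460)); /* 64240 */
        return 0;
    }
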
@@ -547,6 +615,7 @@ u32 __tcp_select_window(struct sock *sk)
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct sk_buff *next_skb = skb->next;
/* The first test we must make is that neither of these two
@@ -564,6 +633,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
return;
+ /* Next skb is out of window. */
+ if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
+ return;
+
/* Punt if not enough space exists in the first SKB for
* the data in the second, or the total combined payload
* would exceed the MSS.
@@ -602,8 +675,20 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
*/
- kfree_skb(next_skb);
- sk->tp_pinfo.af_tcp.packets_out--;
+ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&TCPCB_EVER_RETRANS;
+ if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
+ tp->retrans_out--;
+ if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
+ tp->lost_out--;
+ tp->left_out--;
+ }
+ if (!tp->sack_ok && tp->sacked_out) {
+ /* Reno case is special. Sigh... */
+ tp->sacked_out--;
+ tp->left_out--;
+ }
+ tcp_free_skb(sk, next_skb);
+ tp->packets_out--;
}
}
@@ -614,53 +699,43 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
void tcp_simple_retransmit(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct sk_buff *skb, *old_next_skb;
+ struct sk_buff *skb;
unsigned int mss = tcp_current_mss(sk);
-
- /* Don't muck with the congestion window here. */
- tp->dup_acks = 0;
- tp->high_seq = tp->snd_nxt;
- tp->retrans_head = NULL;
-
- /* Input control flow will see that this was retransmitted
- * and not use it for RTT calculation in the absence of
- * the timestamp option.
- */
- for (old_next_skb = skb = skb_peek(&sk->write_queue);
- ((skb != tp->send_head) &&
- (skb != (struct sk_buff *)&sk->write_queue));
- skb = skb->next) {
- int resend_skb = 0;
-
- /* Our goal is to push out the packets which we
- * sent already, but are being chopped up now to
- * account for the PMTU information we have.
- *
- * As we resend the queue, packets are fragmented
- * into two pieces, and when we try to send the
- * second piece it may be collapsed together with
- * a subsequent packet, and so on. -DaveM
- */
- if (old_next_skb != skb || skb->len > mss)
- resend_skb = 1;
- old_next_skb = skb->next;
- if (resend_skb != 0) {
- if (tcp_retransmit_skb(sk, skb))
- break;
+ int lost = 0;
+
+ for_retrans_queue(skb, sk, tp) {
+ if (skb->len > mss &&
+ !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
+ if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out--;
+ }
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out++;
+ lost = 1;
+ }
}
}
-}
-static __inline__ void update_retrans_head(struct sock *sk)
-{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
- tp->retrans_head = tp->retrans_head->next;
- if((tp->retrans_head == tp->send_head) ||
- (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
- tp->retrans_head = NULL;
- tp->rexmt_done = 1;
+ if (!lost)
+ return;
+
+ tp->left_out = tp->sacked_out + tp->lost_out;
+
+ /* Don't muck with the congestion window here.
+	 * The reason is that we do not increase the amount of _data_
+	 * in the network, but the units have changed and the effective
+	 * cwnd/ssthresh are really reduced now.
+ */
+ if (tp->ca_state != TCP_CA_Loss) {
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = 0;
+ tp->ca_state = TCP_CA_Loss;
}
+ tcp_xmit_retransmit_queue(sk);
}
/* This retransmits one SKB. Policy decisions and retransmit queue
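
Note how the rewritten tcp_simple_retransmit() no longer resends inline: it tags every queued segment larger than the post-PMTU-shrink MSS as lost and lets tcp_xmit_retransmit_queue() do the sending. The tagging pass, modeled over a plain array (flag and type names are invented):

    /* Illustrative model of the PMTU-shrink tagging pass. */
    #include <stdio.h>

    #define F_SACKED  0x01
    #define F_RETRANS 0x02
    #define F_LOST    0x04

    struct seg { unsigned int len; unsigned char sacked; };

    static int tag_oversized(struct seg *q, int n, unsigned int mss)
    {
        int lost = 0, i;

        for (i = 0; i < n; i++) {
            if (q[i].len > mss && !(q[i].sacked & F_SACKED)) {
                q[i].sacked &= ~F_RETRANS;  /* will be resent anyway */
                if (!(q[i].sacked & F_LOST)) {
                    q[i].sacked |= F_LOST;
                    lost++;
                }
            }
        }
        return lost;
    }

    int main(void)
    {
        struct seg q[] = { { 1460, 0 }, { 1460, F_SACKED }, { 536, 0 } };
        printf("%d newly marked lost\n", tag_oversized(q, 3, 1000));
        return 0;
    }
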
@@ -671,18 +746,13 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
unsigned int cur_mss = tcp_current_mss(sk);
+ int err;
-#ifdef TCP_DEBUG
- /* It was possible this summer, that retransmit timer
- * raced with its deletion and hit socket with packets_out==0.
- * I fixed it, but preserved the check in the place,
- * where the fault occured. --ANK
+	/* Do not send more than we queued. 1/4 is reserved for possible
+	 * copying overhead: fragmentation, tunneling, mangling etc.
*/
- if (skb == NULL) {
- printk("tcp_retransmit_skb: bug, skb==NULL, caller=%p\n", NET_CALLER(sk));
- return -EFAULT;
- }
-#endif
+ if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
+ return -EAGAIN;
if(skb->len > cur_mss) {
if(tcp_fragment(sk, skb, cur_mss))
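
The -EAGAIN guard above, as plain arithmetic (the buffer sizes are arbitrary):

    /* Illustrative sketch of the 1/4 copying-overhead reserve. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int queued = 64000, sndbuf = 131072, alloc = 81000;
        unsigned int limit = queued + (queued >> 2);   /* queued + 25% */

        if (limit > sndbuf)
            limit = sndbuf;
        printf("%s\n", alloc > limit ? "-EAGAIN" : "retransmit");
        return 0;
    }
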
@@ -715,23 +785,40 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
skb->csum = 0;
}
- /* Ok, we're gonna send it out, update state. */
- TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
- tp->retrans_out++;
-
/* Make a copy, if the first transmission SKB clone we made
* is still in somebody's hands, else make a clone.
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- if(skb_cloned(skb))
- skb = skb_copy(skb, GFP_ATOMIC);
- else
- skb = skb_clone(skb, GFP_ATOMIC);
- /* Update global TCP statistics and return success. */
- TCP_INC_STATS(TcpRetransSegs);
+ err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
+ skb_copy(skb, GFP_ATOMIC):
+ skb_clone(skb, GFP_ATOMIC)));
- return tcp_transmit_skb(sk, skb);
+ if (err == 0) {
+ /* Update global TCP statistics. */
+ TCP_INC_STATS(TcpRetransSegs);
+
+#if FASTRETRANS_DEBUG > 0
+ if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "retrans_out leaked.\n");
+ }
+#endif
+ TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
+ tp->retrans_out++;
+
+ /* Save stamp of the first retransmit. */
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = TCP_SKB_CB(skb)->when;
+
+ tp->undo_retrans++;
+
+ /* snd_nxt is stored to detect loss of retransmitted segment,
+ * see tcp_input.c tcp_sacktag_write_queue().
+ */
+ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
+ }
+ return err;
}
/* This gets called after a retransmit timeout, and the initially
@@ -746,71 +833,79 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
+ int packet_cnt = tp->lost_out;
+
+ /* First pass: retransmit lost packets. */
+ if (packet_cnt) {
+ for_retrans_queue(skb, sk, tp) {
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ return;
+
+ if (sacked&TCPCB_LOST) {
+ if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
+ if (tcp_retransmit_skb(sk, skb))
+ return;
+ if (tp->ca_state != TCP_CA_Loss)
+ NET_INC_STATS_BH(TCPFastRetrans);
+ else
+ NET_INC_STATS_BH(TCPSlowStartRetrans);
+
+ if (skb == skb_peek(&sk->write_queue))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ }
+
+ if (--packet_cnt <= 0)
+ break;
+ }
+ }
+ }
- if (tp->retrans_head == NULL &&
- tp->rexmt_done == 0)
- tp->retrans_head = skb_peek(&sk->write_queue);
- if (tp->retrans_head == tp->send_head)
- tp->retrans_head = NULL;
+ /* OK, demanded retransmission is finished. */
- /* Each time, advance the retrans_head if we got
- * a packet out or we skipped one because it was
- * SACK'd. -DaveM
+ /* Forward retransmissions are possible only during Recovery. */
+ if (tp->ca_state != TCP_CA_Recovery)
+ return;
+
+ /* No forward retransmissions in Reno are possible. */
+ if (!tp->sack_ok)
+ return;
+
+	/* Yeah, we have to make a difficult choice between forward transmission
+	 * and retransmission... Both ways have their merits...
+	 *
+	 * For now we do not retransmit anything while we have some new
+	 * segments to send.
*/
- while ((skb = tp->retrans_head) != NULL) {
- /* If it has been ack'd by a SACK block, we don't
- * retransmit it.
- */
- if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
- /* Send it out, punt if error occurred. */
- if(tcp_retransmit_skb(sk, skb))
- break;
- update_retrans_head(sk);
-
- /* Stop retransmitting if we've hit the congestion
- * window limit.
- */
- if (tp->retrans_out >= tp->snd_cwnd)
- break;
- } else {
- update_retrans_head(sk);
- }
- }
-}
+ if (tcp_may_send_now(sk, tp))
+ return;
-/* Using FACK information, retransmit all missing frames at the receiver
- * up to the forward most SACK'd packet (tp->fackets_out) if the packet
- * has not been retransmitted already.
- */
-void tcp_fack_retransmit(struct sock *sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct sk_buff *skb = skb_peek(&sk->write_queue);
- int packet_cnt = 0;
+ packet_cnt = 0;
+
+ for_retrans_queue(skb, sk, tp) {
+ if(++packet_cnt > tp->fackets_out)
+ break;
- while((skb != NULL) &&
- (skb != tp->send_head) &&
- (skb != (struct sk_buff *)&sk->write_queue)) {
- __u8 sacked = TCP_SKB_CB(skb)->sacked;
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ break;
- if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
- goto next_packet;
+ if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
+ continue;
/* Ok, retransmit it. */
if(tcp_retransmit_skb(sk, skb))
break;
- if(tcp_packets_in_flight(tp) >= tp->snd_cwnd)
- break;
-next_packet:
- packet_cnt++;
- if(packet_cnt >= tp->fackets_out)
- break;
- skb = skb->next;
+ if (skb == skb_peek(&sk->write_queue))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+
+ NET_INC_STATS_BH(TCPForwardRetrans);
}
}
+
/* Send a fin. The caller locks the socket for us. This cannot be
* allowed to fail queueing a FIN frame under any circumstances.
*/
@@ -839,30 +934,19 @@ void tcp_send_fin(struct sock *sk)
/* Special case to avoid Nagle bogosity. If this
* segment is the last segment, and it was queued
* due to Nagle/SWS-avoidance, send it out now.
- *
- * Hmm... actually it overrides also congestion
- * avoidance (OK for FIN) and retransmit phase
- * (not OK? Added.).
*/
if(tp->send_head == skb &&
- !after(tp->write_seq, tp->snd_una + tp->snd_wnd) &&
- !tp->retransmits) {
+ !after(tp->write_seq, tp->snd_una + tp->snd_wnd)) {
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) {
- update_send_head(sk);
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->packets_out++;
- if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
- } else
+ if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)))
+ update_send_head(sk, tp, skb);
+ else
tcp_check_probe_timer(sk, tp);
}
} else {
/* Socket is locked, keep trying until memory is available. */
for (;;) {
- skb = sock_wmalloc(sk,
- MAX_TCP_HEADER + 15,
- 1, GFP_KERNEL);
+ skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);
if (skb)
break;
current->policy |= SCHED_YIELD;
@@ -896,8 +980,10 @@ void tcp_send_active_reset(struct sock *sk, int priority)
/* NOTE: No TCP options attached and we never retransmit this. */
skb = alloc_skb(MAX_TCP_HEADER + 15, priority);
- if (!skb)
+ if (!skb) {
+ NET_INC_STATS(TCPAbortFailed);
return;
+ }
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
@@ -907,10 +993,11 @@ void tcp_send_active_reset(struct sock *sk, int priority)
TCP_SKB_CB(skb)->urg_ptr = 0;
/* Send it off. */
- TCP_SKB_CB(skb)->seq = tp->snd_nxt;
+ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, skb);
+ if (tcp_transmit_skb(sk, skb))
+ NET_INC_STATS(TCPAbortFailed);
}
/* WARNING: This routine must only be called when we have already sent
@@ -920,27 +1007,29 @@ void tcp_send_active_reset(struct sock *sk, int priority)
*/
int tcp_send_synack(struct sock *sk)
{
- struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
- struct sk_buff* skb;
+ struct sk_buff* skb;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15,
- 1, GFP_ATOMIC);
- if (skb == NULL)
- return -ENOMEM;
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, MAX_TCP_HEADER);
- skb->csum = 0;
- TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
- TCP_SKB_CB(skb)->sacked = 0;
- TCP_SKB_CB(skb)->urg_ptr = 0;
+ skb = skb_peek(&sk->write_queue);
+ if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
+ printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
+ return -EFAULT;
+ }
+ if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
+ if (skb_cloned(skb)) {
+ struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+ if (nskb == NULL)
+ return -ENOMEM;
+ __skb_unlink(skb, &sk->write_queue);
+ __skb_queue_head(&sk->write_queue, nskb);
+ tcp_free_skb(sk, skb);
+ tcp_charge_skb(sk, nskb);
+ skb = nskb;
+ }
- /* SYN eats a sequence byte. */
- TCP_SKB_CB(skb)->seq = tp->snd_una;
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
- __skb_queue_tail(&sk->write_queue, skb);
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
+ TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
+ }
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tp->packets_out++;
return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
@@ -974,6 +1063,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
+ TCP_ECN_make_synack(req, th);
th->source = sk->sport;
th->dest = req->rmt_port;
TCP_SKB_CB(skb)->seq = req->snt_isn;
@@ -983,7 +1073,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
__u8 rcv_wscale;
/* Set this up on the first call only */
- req->window_clamp = tp->window_clamp ? : skb->dst->window;
+ req->window_clamp = tp->window_clamp ? : dst->window;
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(tcp_full_space(sk),
dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -1028,18 +1118,20 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
tp->mss_clamp = tp->user_mss;
tp->max_window = 0;
tcp_sync_mss(sk, dst->pmtu);
- tcp_initialize_rcv_mss(sk);
if (!tp->window_clamp)
tp->window_clamp = dst->window;
tp->advmss = dst->advmss;
+ tcp_initialize_rcv_mss(sk);
tcp_select_initial_window(tcp_full_space(sk),
- tp->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
- &tp->rcv_wnd,
- &tp->window_clamp,
- sysctl_tcp_window_scaling,
- &tp->rcv_wscale);
+ tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+ &tp->rcv_wnd,
+ &tp->window_clamp,
+ sysctl_tcp_window_scaling,
+ &tp->rcv_wscale);
+
+ tp->rcv_ssthresh = tp->rcv_wnd;
/* Socket identity change complete, no longer
* in TCP_CLOSE, so enter ourselves into the
@@ -1052,8 +1144,7 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
sk->err = 0;
sk->done = 0;
tp->snd_wnd = 0;
- tp->snd_wl1 = 0;
- tp->snd_wl2 = tp->write_seq;
+ tcp_init_wl(tp, tp->write_seq, 0);
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->rcv_nxt = 0;
@@ -1061,23 +1152,24 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
tp->copied_seq = 0;
tp->rto = TCP_TIMEOUT_INIT;
- tcp_init_xmit_timers(sk);
tp->retransmits = 0;
- tp->fackets_out = 0;
- tp->retrans_out = 0;
+ tcp_clear_retrans(tp);
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
+ TCP_ECN_send_syn(tp, buff);
TCP_SKB_CB(buff)->sacked = 0;
TCP_SKB_CB(buff)->urg_ptr = 0;
buff->csum = 0;
TCP_SKB_CB(buff)->seq = tp->write_seq++;
TCP_SKB_CB(buff)->end_seq = tp->write_seq;
tp->snd_nxt = tp->write_seq;
+ tp->pushed_seq = tp->write_seq;
/* Send it off. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tp->syn_stamp = TCP_SKB_CB(buff)->when;
+ tp->retrans_stamp = TCP_SKB_CB(buff)->when;
__skb_queue_tail(&sk->write_queue, buff);
+ tcp_charge_skb(sk, buff);
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
TCP_INC_STATS(TcpActiveOpens);
@@ -1099,20 +1191,27 @@ err_out:
void tcp_send_delayed_ack(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- long ato = tp->ack.ato;
+ int ato = tp->ack.ato;
unsigned long timeout;
if (ato > TCP_DELACK_MIN) {
- int max_ato;
+ int max_ato = HZ/2;
+
+ if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+ max_ato = TCP_DELACK_MAX;
+
+ /* Slow path, intersegment interval is "high". */
/* If some rtt estimate is known, use it to bound delayed ack.
* Do not use tp->rto here, use results of rtt measurements
* directly.
*/
- if (tp->srtt)
- max_ato = (tp->srtt >> 3) + tp->mdev;
- else
- max_ato = TCP_DELACK_MAX;
+ if (tp->srtt) {
+ int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
+
+ if (rtt < max_ato)
+ max_ato = rtt;
+ }
ato = min(ato, max_ato);
}
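
The clamp above bounds ato by HZ/2 (or by TCP_DELACK_MAX in pingpong mode) and, when an RTT estimate exists, by the measured RTT but never below TCP_DELACK_MIN. In isolation (the jiffies constants are illustrative):

    /* Illustrative sketch of the delayed-ACK ato clamp. */
    #include <stdio.h>

    #define HZ         100
    #define DELACK_MIN (HZ / 25)
    #define DELACK_MAX (HZ / 5)

    static int clamp_ato(int ato, int srtt_jiffies, int pingpong)
    {
        int max_ato = pingpong ? DELACK_MAX : HZ / 2;

        if (srtt_jiffies) {
            int rtt = srtt_jiffies > DELACK_MIN ? srtt_jiffies : DELACK_MIN;

            if (rtt < max_ato)
                max_ato = rtt;
        }
        return ato < max_ato ? ato : max_ato;
    }

    int main(void)
    {
        printf("%d\n", clamp_ato(40, 10, 1)); /* RTT of 10 bounds it */
        return 0;
    }
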
@@ -1121,20 +1220,20 @@ void tcp_send_delayed_ack(struct sock *sk)
timeout = jiffies + ato;
/* Use the new timeout only if there wasn't an older one earlier. */
- if (timer_pending(&tp->delack_timer)) {
- unsigned long old_timeout = tp->delack_timer.expires;
-
+ if (tp->ack.pending&TCP_ACK_TIMER) {
/* If delack timer was blocked or is about to expire,
* send ACK now.
*/
- if (tp->ack.blocked || time_before_eq(old_timeout, jiffies+(ato>>2))) {
+ if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
tcp_send_ack(sk);
return;
}
- if (!time_before(timeout, old_timeout))
- timeout = old_timeout;
+ if (!time_before(timeout, tp->ack.timeout))
+ timeout = tp->ack.timeout;
}
+ tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
+ tp->ack.timeout = timeout;
if (!mod_timer(&tp->delack_timer, timeout))
sock_hold(sk);
@@ -1170,8 +1269,8 @@ void tcp_send_ack(struct sock *sk)
*/
buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
if (buff == NULL) {
- tp->ack.pending = 1;
- tp->ack.ato = TCP_ATO_MAX;
+ tcp_schedule_ack(tp);
+ tp->ack.ato = TCP_ATO_MIN;
tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
return;
}
@@ -1184,7 +1283,7 @@ void tcp_send_ack(struct sock *sk)
TCP_SKB_CB(buff)->urg_ptr = 0;
/* Send it off, this clears delayed acks for us. */
- TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
TCP_SKB_CB(buff)->when = tcp_time_stamp;
tcp_transmit_skb(sk, buff);
}
@@ -1193,66 +1292,68 @@ void tcp_send_ack(struct sock *sk)
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
*/
+static int tcp_xmit_probe_skb(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+
+ /* We don't queue it, tcp_transmit_skb() sets ownership. */
+ skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
+ if (skb == NULL)
+ return -1;
+
+ /* Reserve space for headers and set control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->urg_ptr = 0;
+
+ /* Use a previous sequence. This should cause the other
+ * end to send an ack. Don't queue or clone SKB, just
+ * send it.
+ */
+ TCP_SKB_CB(skb)->seq = tp->snd_una - 1;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ return tcp_transmit_skb(sk, skb);
+}
+
int tcp_write_wakeup(struct sock *sk)
{
if (sk->state != TCP_CLOSE) {
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
- /* Now this function is never called, while
- * we have something not ACKed in queue.
- */
- BUG_TRAP(tp->snd_una == tp->snd_nxt);
-
- if (tp->snd_wnd > (tp->snd_nxt-tp->snd_una)
- && ((skb = tp->send_head) != NULL)) {
+ if ((skb = tp->send_head) != NULL &&
+ before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
int err;
- unsigned long win_size;
+ int mss = tcp_current_mss(sk);
+ int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
+
+ if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+ tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
/* We are probing the opening of a window
* but the window size is != 0
* must have been a result of SWS avoidance (sender)
*/
- win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
- if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
- if (tcp_fragment(sk, skb, win_size))
+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+ skb->len > mss) {
+ seg_size = min(seg_size, mss);
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ if (tcp_fragment(sk, skb, seg_size))
return -1;
}
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
if (!err) {
- update_send_head(sk);
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->packets_out++;
- if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ update_send_head(sk, tp, skb);
}
return err;
} else {
- /* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
- if (skb == NULL)
- return -1;
-
- /* Reserve space for headers and set control bits. */
- skb_reserve(skb, MAX_TCP_HEADER);
- skb->csum = 0;
- TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
- TCP_SKB_CB(skb)->sacked = 0;
- TCP_SKB_CB(skb)->urg_ptr = 0;
-
- /* Use a previous sequence. This should cause the other
- * end to send an ack. Don't queue or clone SKB, just
- * send it.
- *
- * RED-PEN: logically it should be snd_una-1.
- * snd_nxt-1 will not be acked. snd_una==snd_nxt
- * in this place however. Right?
- */
- TCP_SKB_CB(skb)->seq = tp->snd_una - 1;
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- return tcp_transmit_skb(sk, skb);
+ return tcp_xmit_probe_skb(sk);
}
}
return -1;
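
For reference, the probe factored out into tcp_xmit_probe_skb() works because seq = snd_una - 1 lies below the peer's rcv_nxt: the zero-length segment is discarded as a duplicate but still elicits a pure ACK carrying the current window. A toy illustration (the real code compares sequence numbers with wrapping arithmetic):

    /* Illustrative sketch of why the probe uses snd_una - 1. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t snd_una = 5000;          /* everything below is acked */
        uint32_t probe_seq = snd_una - 1; /* deliberately stale        */
        uint32_t peer_rcv_nxt = 5000;

        if (probe_seq < peer_rcv_nxt)     /* peer sees old data        */
            printf("peer answers with ACK %u and a window update\n",
                   peer_rcv_nxt);
        return 0;
    }
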