path: root/net/ipv4/tcp_input.c
author    Ralf Baechle <ralf@linux-mips.org>  1998-05-07 02:55:41 +0000
committer Ralf Baechle <ralf@linux-mips.org>  1998-05-07 02:55:41 +0000
commit    dcec8a13bf565e47942a1751a9cec21bec5648fe (patch)
tree      548b69625b18cc2e88c3e68d0923be546c9ebb03 /net/ipv4/tcp_input.c
parent    2e0f55e79c49509b7ff70ff1a10e1e9e90a3dfd4 (diff)
o Merge with Linux 2.1.99.
o Fix ancient bug in the ELF loader making ldd crash.
o Fix ancient bug in the keyboard code for SGI, SNI and Jazz.
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c  701
1 file changed, 363 insertions(+), 338 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1c34e6693..d5b0b15c6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.98 1998/03/23 22:54:48 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.114 1998/04/28 06:42:22 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -42,6 +42,14 @@
* Andi Kleen : Moved open_request checking here
* and process RSTs for open_requests.
* Andi Kleen : Better prune_queue, and other fixes.
+ * Andrey Savochkin: Fix RTT measurements in the presence of
+ * timestamps.
+ * Andrey Savochkin: Check sequence numbers correctly when
+ * removing SACKs due to in sequence incoming
+ * data segments.
+ * Andi Kleen: Make sure we never ack data there is not
+ * enough room for. Also make this condition
+ * a fatal error if it might still happen.
*/
#include <linux/config.h>
@@ -50,15 +58,6 @@
#include <net/tcp.h>
#include <linux/ipsec.h>
-typedef void (*tcp_sys_cong_ctl_t)(struct sock *sk,
- u32 seq, u32 ack,
- u32 seq_rtt);
-
-static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack,
- u32 seq_rtt);
-static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
- u32 seq_rtt);
-
#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
@@ -80,7 +79,7 @@ int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
int sysctl_tcp_rfc1337;
-static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
+static int prune_queue(struct sock *sk);
/* There is something which you must keep in mind when you analyze the
* behavior of the tp->ato delayed ack timeout interval. When a
@@ -164,7 +163,7 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
static __inline__ void tcp_set_rto(struct tcp_opt *tp)
{
tp->rto = (tp->srtt >> 3) + tp->mdev;
- tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
+ tp->rto += (tp->rto >> 2) + (tp->rto >> ((tp->snd_cwnd>>TCP_CWND_SHIFT)-1));
}
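This hunk is the first of many places where tp->snd_cwnd is now read through a TCP_CWND_SHIFT shift: the congestion window is kept in a fixed-point form so it can grow by fractions of a segment, and every consumer must shift the stored value back down to whole segments. A minimal standalone sketch of that convention; the shift value used here is only an assumption for the demo, the real constant lives in the 2.1.x headers:

#include <stdio.h>

#define TCP_CWND_SHIFT 1                 /* assumed scale; the kernel defines its own value */

static unsigned int cwnd_in_segments(unsigned int snd_cwnd)
{
        /* Stored value is segments << TCP_CWND_SHIFT; shift back before use. */
        return snd_cwnd >> TCP_CWND_SHIFT;
}

int main(void)
{
        unsigned int snd_cwnd = 4 << TCP_CWND_SHIFT;   /* start at 4 segments */

        snd_cwnd += 1 << TCP_CWND_SHIFT;               /* grow by one full segment */
        printf("cwnd = %u segments\n", cwnd_in_segments(snd_cwnd));
        return 0;
}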
@@ -176,7 +175,7 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp)
* way to avoid the problem. Is it possible to drop the lower
* bound and still avoid trouble with BSD stacks? Perhaps
* some modification to the RTO calculation that takes delayed
- * ack bais into account? This needs serious thought. -- erics
+ * ack bias into account? This needs serious thought. -- erics
*/
static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
{
@@ -193,19 +192,27 @@ extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq)
* test is last_ack_sent <= end_seq.
* (RFC1323 stated last_ack_sent < end_seq.)
*/
- if (!before(end_seq,tp->last_ack_sent)) {
- tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = jiffies;
+ if (!before(end_seq, tp->last_ack_sent)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ */
+ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = jiffies;
+ }
}
}
#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
-extern __inline__ int tcp_paws_discard(struct tcp_opt *tp)
+extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, __u16 len)
{
/* ts_recent must be younger than 24 days */
return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
- ((s32)(tp->rcv_tsval-tp->ts_recent) < 0));
+ (((s32)(tp->rcv_tsval-tp->ts_recent) < 0) &&
+ /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
+ (len != (th->doff * 4))));
}
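The reworked PAWS test above drops a segment when ts_recent is more than 24 days old, or when the incoming timestamp lags ts_recent, but pure ACKs (segment length equal to the header length) are exempted per DaveM's comment, since PAWS as specified misbehaves on them. A hedged, self-contained restatement of that predicate using simplified stand-in fields rather than the kernel's structures:

#include <stdio.h>

#define HZ 100
#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)

struct paws_state {
        long jiffies_now;            /* current tick counter */
        long ts_recent_stamp;        /* when ts_recent was recorded */
        unsigned int rcv_tsval;      /* timestamp carried by this segment */
        unsigned int ts_recent;      /* timestamp we last accepted */
};

static int paws_discard(const struct paws_state *p, int seg_len, int header_len)
{
        int stale    = (p->jiffies_now - p->ts_recent_stamp) >= PAWS_24DAYS;
        int too_old  = (int)(p->rcv_tsval - p->ts_recent) < 0;
        int pure_ack = (seg_len == header_len);

        /* Mirrors the hunk: discard on staleness, or on an old timestamp
         * unless the segment is a pure ACK.
         */
        return stale || (too_old && !pure_ack);
}

int main(void)
{
        struct paws_state p = { 1000, 900, 5, 10 };

        printf("data segment dropped: %d\n", paws_discard(&p, 100, 20));
        printf("pure ACK dropped:     %d\n", paws_discard(&p, 20, 20));
        return 0;
}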
@@ -266,15 +273,34 @@ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp,
struct sk_buff *skb = skb_peek(&sk->write_queue);
__u32 start_seq = ntohl(sp->start_seq);
__u32 end_seq = ntohl(sp->end_seq);
+ int fack_count = 0;
while((skb != NULL) &&
(skb != tp->send_head) &&
(skb != (struct sk_buff *)&sk->write_queue)) {
+ /* The retransmission queue is always in order, so
+ * we can short-circuit the walk early.
+ */
+ if(!before(start_seq, TCP_SKB_CB(skb)->end_seq))
+ break;
+
/* We play conservative, we don't allow SACKS to partially
* tag a sequence space.
*/
- if(!after(start_seq, skb->seq) && !before(end_seq, skb->end_seq))
+ fack_count++;
+ if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
+ /* If this was a retransmitted frame, account for it. */
+ if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out--;
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
+
+ /* RULE: All new SACKs will either decrease retrans_out
+ * or advance fackets_out.
+ */
+ if(fack_count > tp->fackets_out)
+ tp->fackets_out = fack_count;
+ }
skb = skb->next;
}
sp++; /* Move on to the next SACK block. */
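The SACK tagging above stays deliberately conservative: a retransmit-queue segment is marked SACKED_ACKED only when the block [start_seq, end_seq] covers it completely, and fackets_out records how deep into the queue acknowledgements have been observed (retrans_out bookkeeping is omitted here). A small array-based sketch of that tagging rule, with invented names and no sequence-wraparound handling:

#include <stdio.h>

struct seg {
        unsigned int seq, end_seq;   /* bytes [seq, end_seq) carried by this segment */
        int sacked;
};

static int sack_tag(struct seg *q, int n, unsigned int start, unsigned int end)
{
        int i, fack_count = 0, fackets_out = 0;

        for (i = 0; i < n; i++) {
                fack_count++;
                /* Conservative rule: only tag segments wholly inside the block. */
                if (start <= q[i].seq && q[i].end_seq <= end) {
                        q[i].sacked = 1;
                        if (fack_count > fackets_out)
                                fackets_out = fack_count;
                }
        }
        return fackets_out;
}

int main(void)
{
        struct seg q[] = { {100, 200, 0}, {200, 300, 0}, {300, 400, 0} };
        int fackets = sack_tag(q, 3, 200, 400);   /* SACK block covers segments 2 and 3 */

        printf("fackets_out=%d tagged=%d,%d,%d\n",
               fackets, q[0].sacked, q[1].sacked, q[2].sacked);
        return 0;
}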
@@ -322,6 +348,13 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
if (!no_fancy && sysctl_tcp_window_scaling) {
tp->wscale_ok = 1;
tp->snd_wscale = *(__u8 *)ptr;
+ if(tp->snd_wscale > 14) {
+ if(net_ratelimit())
+ printk("tcp_parse_options: Illegal window "
+ "scaling value %d >14 received.",
+ tp->snd_wscale);
+ tp->snd_wscale = 14;
+ }
}
break;
case TCPOPT_TIMESTAMP:
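RFC 1323 limits the window scale shift to 14 so that the scaled window can never exceed roughly 2^30, which the sequence-number arithmetic tolerates; the hunk above clamps a bogus advertised value instead of trusting it. A toy illustration of the ceiling, with made-up example values:

#include <stdio.h>

int main(void)
{
        unsigned int advertised = 65535;         /* largest possible 16-bit window field */
        unsigned int wscale = 17;                /* illegal value advertised by the peer */

        if (wscale > 14)                         /* RFC 1323: shift must be <= 14 */
                wscale = 14;

        printf("effective window = %u bytes\n", advertised << wscale);
        return 0;
}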
@@ -388,19 +421,43 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th,
return 1;
}
+#if 0 /* Not working yet... -DaveM */
+static void tcp_compute_tsack(struct sock *sk, struct tcp_opt *tp)
+{
+ struct sk_buff *skb = skb_peek(&sk->write_queue);
+ __u32 tstamp = tp->rcv_tsecr;
+ int fack_count = 0;
+
+ while((skb != NULL) &&
+ (skb != tp->send_head) &&
+ (skb != (struct sk_buff *)&sk->write_queue)) {
+ if(TCP_SKB_CB(skb)->when == tstamp) {
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ sacked |= TCPCB_SACKED_ACKED;
+ if(sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out--;
+ TCP_SKB_CB(skb)->sacked = sacked;
+ }
+ if(!before(TCP_SKB_CB(skb)->when, tstamp))
+ fack_count++;
+ skb = skb->next;
+ }
+ if(fack_count > tp->fackets_out)
+ tp->fackets_out = fack_count;
+}
+#endif
+
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
-static __inline__ void clear_fast_retransmit(struct sock *sk)
+static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
{
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+ if (tp->dup_acks > 3)
+ tp->snd_cwnd = (tp->snd_ssthresh << TCP_CWND_SHIFT);
- if (tp->dup_acks > 3) {
- tp->retrans_head = NULL;
- tp->snd_cwnd = max(tp->snd_ssthresh, 1);
- }
tp->dup_acks = 0;
}
@@ -409,10 +466,9 @@ static __inline__ void clear_fast_retransmit(struct sock *sk)
*/
static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
{
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- /*
- * Note: If not_dup is set this implies we got a
+ /* Note: If not_dup is set this implies we got a
* data carrying packet or a window update.
* This carries no new information about possible
* lost packets, so we have to ignore it for the purposes
@@ -422,22 +478,31 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
* the code below much more complex. For now if I see such
* a packet I clear the fast retransmit phase.
*/
-
if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
/* This is the standard reno style fast retransmit branch. */
+#if 0 /* Not working yet... -DaveM */
+ /* If not doing SACK, but doing timestamps, compute timestamp
+ * based pseudo-SACKs when we see duplicate ACKs.
+ */
+ if(!tp->sack_ok && tp->saw_tstamp)
+ tcp_compute_tsack(sk, tp);
+#endif
/* 1. When the third duplicate ack is received, set ssthresh
* to one half the current congestion window, but no less
* than two segments. Retransmit the missing segment.
*/
if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
tp->dup_acks++;
- if (tp->dup_acks == 3) {
+ if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
tp->dup_acks++;
- tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
- tp->snd_cwnd = tp->snd_ssthresh + 3;
+ tp->snd_ssthresh = max(tp->snd_cwnd >> (TCP_CWND_SHIFT + 1), 2);
+ tp->snd_cwnd = (tp->snd_ssthresh + 3) << TCP_CWND_SHIFT;
tp->high_seq = tp->snd_nxt;
- tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+ if(!tp->fackets_out)
+ tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+ else
+ tcp_fack_retransmit(sk);
tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
}
}
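This branch implements the classic third-duplicate-ACK trigger: ssthresh drops to half the current window but never below two segments, cwnd becomes ssthresh plus three segments to account for the duplicates already received, and the presumed-lost segment is retransmitted; with FACK state present the trigger can also fire once fackets_out exceeds three. A plain-integer sketch of the window arithmetic, ignoring the kernel's TCP_CWND_SHIFT scaling:

#include <stdio.h>

int main(void)
{
        unsigned int snd_cwnd = 11, snd_ssthresh;    /* windows in whole segments */

        /* Third duplicate ACK: halve the window, keep at least 2 segments,
         * then inflate by the 3 segments the duplicate ACKs prove have left
         * the network.
         */
        snd_ssthresh = snd_cwnd / 2;
        if (snd_ssthresh < 2)
                snd_ssthresh = 2;
        snd_cwnd = snd_ssthresh + 3;

        printf("ssthresh=%u cwnd=%u\n", snd_ssthresh, snd_cwnd);
        return 0;
}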
@@ -446,10 +511,22 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
* cwnd by the segment size. [...] Transmit a packet...
*
* Packet transmission will be done on normal flow processing
- * since we're not in "retransmit mode"
+ * since we're not in "retransmit mode". We do not use duplicate
+ * ACKs to artificially inflate the congestion window when
+ * doing FACK.
*/
- if (tp->dup_acks > 3)
- tp->snd_cwnd++;
+ if (tp->dup_acks > 3) {
+ if(!tp->fackets_out) {
+ tp->snd_cwnd += (1 << TCP_CWND_SHIFT);
+ } else {
+ /* Fill any further holes which may have appeared.
+ * We may want to change this to run every further
+ * multiple-of-3 dup ack increments, to be more robust
+ * against out-of-order packet delivery. -DaveM
+ */
+ tcp_fack_retransmit(sk);
+ }
+ }
} else if (tp->high_seq != 0) {
/* In this branch we deal with clearing the Floyd style
* block on duplicate fast retransmits, and if requested
@@ -463,15 +540,17 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
* Note that we do want to accept a window
* update since this is expected with Hoe's algorithm.
*/
- clear_fast_retransmit(sk);
+ clear_fast_retransmit(tp);
/* After we have cleared up to high_seq we can
* clear the Floyd style block.
*/
- if (after(ack, tp->high_seq))
+ if (!before(ack, tp->high_seq)) {
tp->high_seq = 0;
+ tp->fackets_out = 0;
+ }
} else if (tp->dup_acks >= 3) {
- if (sysctl_tcp_hoe_retransmits) {
+ if (!tp->fackets_out) {
/* Hoe Style. We didn't ack the whole
* window. Take this as a cue that
* another packet was lost and retransmit it.
@@ -490,131 +569,34 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
}
} else {
- /* Reno style. We didn't ack the whole
- * window, now we have to drop out of
- * fast retransmit and wait for a timeout.
+ /* FACK style, fill any remaining holes in
+ * receiver's queue.
*/
- clear_fast_retransmit(sk);
+ tcp_fack_retransmit(sk);
}
}
}
}
-/*
- * TCP slow start and congestion avoidance in two flavors:
- * RFC 1122 and TCP Vegas.
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
*
- * This is a /proc/sys configurable option.
+ * FIXME: What happens when the congestion window gets larger
+ * than the maximum receiver window by some large factor
+ * Suppose the pipeline never loses packets for a long
+ * period of time, then traffic increases causing packet loss.
+ * The congestion window should be reduced, but what it should
+ * be reduced to is not clear, since 1/2 the old window may
+ * still be larger than the maximum sending rate we ever achieved.
*/
-
-#define SHIFT_FACTOR 16
-
-static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
- u32 seq_rtt)
+static void tcp_cong_avoid(struct tcp_opt *tp, u32 seq, u32 ack, u32 seq_rtt)
{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- unsigned int actual, expected;
- unsigned int inv_rtt, inv_basertt, inv_basebd;
- u32 snt_bytes;
-
- /* From:
- * TCP Vegas: New Techniques for Congestion
- * Detection and Avoidance.
- *
- * Warning: This code is a scratch implementation taken
- * from the paper only. The code they distribute seams
- * to have improved several things over the initial spec.
- */
-
- if (!seq_rtt)
- seq_rtt = 1;
-
- if (tp->basertt)
- tp->basertt = min(seq_rtt, tp->basertt);
- else
- tp->basertt = seq_rtt;
-
- /* actual = throughput for this segment.
- * expected = number_of_bytes in transit / BaseRTT
- */
-
- snt_bytes = ack - seq;
-
- inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt;
- inv_basertt = (1 << SHIFT_FACTOR) / tp->basertt;
-
- actual = snt_bytes * inv_rtt;
-
- expected = (tp->snd_nxt - tp->snd_una) * inv_basertt;
-
- inv_basebd = sk->mss * inv_basertt;
-
- /* Slow Start */
- if (tp->snd_cwnd < tp->snd_ssthresh &&
- (seq == tp->snd_nxt ||
- (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd))) {
- /* "Vegas allows exponential growth only every other RTT" */
- if (tp->snd_cwnd_cnt++) {
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
- } else {
- /* Congestion Avoidance */
- if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd) {
- /* Increase Linearly */
- if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) {
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
- }
-
- if (expected - actual >= TCP_VEGAS_BETA * inv_basebd) {
- /* Decrease Linearly */
- if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) {
- tp->snd_cwnd--;
- tp->snd_cwnd_cnt = 0;
- }
-
- /* Never less than 2 segments. */
- if (tp->snd_cwnd < 2)
- tp->snd_cwnd = 2;
- }
- }
-}
-
-static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
- /* This is Jacobson's slow start and congestion avoidance.
- * SIGCOMM '88, p. 328. Because we keep cong_window in
- * integral mss's, we can't do cwnd += 1 / cwnd.
- * Instead, maintain a counter and increment it once every
- * cwnd times.
- * FIXME: Check to be sure the mathematics works out right
- * on this trick when we have to reduce the congestion window.
- * The snd_cwnd_cnt has to be reset properly when reduction events
- * happen.
- * FIXME: What happens when the congestion window gets larger
- * than the maximum receiver window by some large factor
- * Suppose the pipeline never looses packets for a long
- * period of time, then traffic increases causing packet loss.
- * The congestion window should be reduced, but what it should
- * be reduced to is not clear, since 1/2 the old window may
- * still be larger than the maximum sending rate we ever achieved.
- */
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if ((tp->snd_cwnd>>TCP_CWND_SHIFT) <= tp->snd_ssthresh) {
/* In "safe" area, increase. */
- tp->snd_cwnd++;
+ tp->snd_cwnd += (1 << TCP_CWND_SHIFT);
} else {
- /* In dangerous area, increase slowly. In theory this is
- * tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ /* In dangerous area, increase slowly. */
+ tp->snd_cwnd += 1;
}
}
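The replacement tcp_cong_avoid keeps only Jacobson's scheme: below ssthresh the window grows by a full segment per ACK (slow start); above it, the fixed-point window grows by only a small increment per ACK. The removed Vanj variant expressed the textbook cwnd += 1/cwnd rule with an explicit counter; a standalone sketch of that counter form, in whole segments:

#include <stdio.h>

static void cong_avoid(unsigned int *cwnd, unsigned int *cwnd_cnt, unsigned int ssthresh)
{
        if (*cwnd <= ssthresh) {
                /* Slow start: one extra segment per ACK. */
                (*cwnd)++;
        } else if (++(*cwnd_cnt) >= *cwnd) {
                /* Congestion avoidance: one extra segment per window of ACKs,
                 * i.e. roughly cwnd += 1/cwnd.
                 */
                (*cwnd)++;
                *cwnd_cnt = 0;
        }
}

int main(void)
{
        unsigned int cwnd = 2, cnt = 0, ssthresh = 4, ack;

        for (ack = 0; ack < 20; ack++)
                cong_avoid(&cwnd, &cnt, ssthresh);
        printf("cwnd after 20 ACKs: %u segments\n", cwnd);
        return 0;
}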
@@ -628,11 +610,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
int acked = 0;
while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
/* If our packet is before the ack sequence we can
* discard it as it's confirmed to have arrived at
* the other end.
*/
- if (after(skb->end_seq, ack))
+ if (after(scb->end_seq, ack))
break;
/* Initial outgoing SYN's get put onto the write_queue
@@ -642,17 +626,31 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
- if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
+ if(!(scb->flags & TCPCB_FLAG_SYN)) {
+ __u8 sacked = scb->sacked;
+
acked |= FLAG_DATA_ACKED;
- if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+ if(sacked & TCPCB_SACKED_RETRANS) {
acked |= FLAG_RETRANS_DATA_ACKED;
+
+ /* XXX The race is, fast retrans frame -->
+ * XXX retrans timeout sends older frame -->
+ * XXX ACK arrives for fast retrans frame -->
+ * XXX retrans_out goes negative --> splat.
+ * XXX Please help me find a better way -DaveM
+ */
+ if(tp->retrans_out)
+ tp->retrans_out--;
+ }
+ if(tp->fackets_out)
+ tp->fackets_out--;
} else {
tp->retrans_head = NULL;
}
tp->packets_out--;
- *seq = skb->seq;
- *seq_rtt = now - skb->when;
- skb_unlink(skb);
+ *seq = scb->seq;
+ *seq_rtt = now - scb->when;
+ __skb_unlink(skb, skb->list);
kfree_skb(skb);
}
@@ -672,7 +670,7 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack)
/* should always be non-null */
if (tp->send_head != NULL &&
- !before (ack + tp->snd_wnd, tp->send_head->end_seq)) {
+ !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
tp->backoff = 0;
tp->pending = 0;
tcp_clear_xmit_timer(sk, TIME_PROBE0);
@@ -688,11 +686,26 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack)
static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
u32 seq, u32 ack, int flag)
{
- __u32 seq_rtt = (jiffies-tp->rcv_tsecr);
+ __u32 seq_rtt;
+
+ /* RTTM Rule: A TSecr value received in a segment is used to
+ * update the averaged RTT measurement only if the segment
+ * acknowledges some new data, i.e., only if it advances the
+ * left edge of the send window.
+ *
+ * See draft-ietf-tcplw-high-performance-00, section 3.3.
+ * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
+ */
+ if (!(flag & FLAG_DATA_ACKED))
+ return;
+
+ seq_rtt = jiffies-tp->rcv_tsecr;
tcp_rtt_estimator(tp, seq_rtt);
if (tp->retransmits) {
if (tp->packets_out == 0) {
tp->retransmits = 0;
+ tp->fackets_out = 0;
+ tp->retrans_out = 0;
tp->backoff = 0;
tcp_set_rto(tp);
} else {
@@ -702,8 +715,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
}
} else {
tcp_set_rto(tp);
- if (flag & FLAG_DATA_ACKED)
- (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+ tcp_cong_avoid(tp, seq, ack, seq_rtt);
}
/* NOTE: safe here so long as cong_ctl doesn't use rto */
tcp_bound_rto(tp);
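The RTTM rule added here only feeds a timestamp echo into the estimator when the ACK advances the left edge of the send window, which keeps stale or retransmission-tainted echoes from polluting srtt. For reference, a hedged sketch of the Van Jacobson smoothing that tcp_rtt_estimator() and tcp_set_rto() perform, with srtt kept scaled by 8 and mdev by 4 as implied by the rto = (srtt>>3) + mdev formula in the earlier hunk; this is a restatement, not the kernel's code:

#include <stdio.h>

struct rtt_state { long srtt; long mdev; };        /* srtt scaled by 8, mdev by 4 */

static void rtt_sample(struct rtt_state *r, long m) /* m = measured RTT in ticks */
{
        if (r->srtt) {
                long err = m - (r->srtt >> 3);
                r->srtt += err;                     /* srtt += (m - srtt) / 8 */
                if (err < 0)
                        err = -err;
                r->mdev += err - (r->mdev >> 2);    /* mdev += (|err| - mdev) / 4 */
        } else {
                r->srtt = m << 3;                   /* first sample seeds srtt */
                r->mdev = m << 1;                   /* and mdev at half the sample */
        }
}

static long rto(const struct rtt_state *r)
{
        return (r->srtt >> 3) + r->mdev;            /* as in tcp_set_rto() above */
}

int main(void)
{
        struct rtt_state r = { 0, 0 };
        long samples[] = { 10, 12, 9, 30 };
        int i;

        for (i = 0; i < 4; i++)
                rtt_sample(&r, samples[i]);
        printf("srtt=%ld ticks, rto=%ld ticks\n", r.srtt >> 3, rto(&r));
        return 0;
}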
@@ -712,7 +724,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
{
struct sk_buff *skb = skb_peek(&sk->write_queue);
- long when = tp->rto - (jiffies - skb->when);
+ long when = tp->rto - (jiffies - TCP_SKB_CB(skb)->when);
/* Some data was ACK'd, if still retransmitting (due to a
* timeout), resend more of the retransmit queue. The
@@ -801,8 +813,11 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
} else {
/* If we were retransmiting don't count rtt estimate. */
if (tp->retransmits) {
- if (tp->packets_out == 0)
+ if (tp->packets_out == 0) {
tp->retransmits = 0;
+ tp->fackets_out = 0;
+ tp->retrans_out = 0;
+ }
} else {
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
@@ -812,13 +827,14 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
* where the network delay has increased suddenly.
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
- if ((flag & FLAG_DATA_ACKED) &&
- !(flag & FLAG_RETRANS_DATA_ACKED)) {
- tp->backoff = 0;
- tcp_rtt_estimator(tp, seq_rtt);
- tcp_set_rto(tp);
- tcp_bound_rto(tp);
- (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+ if (flag & FLAG_DATA_ACKED) {
+ if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
+ tp->backoff = 0;
+ tcp_rtt_estimator(tp, seq_rtt);
+ tcp_set_rto(tp);
+ tcp_bound_rto(tp);
+ }
+ tcp_cong_avoid(tp, seq, ack, seq_rtt);
}
}
}
@@ -848,13 +864,12 @@ uninteresting_ack:
}
/* New-style handling of TIME_WAIT sockets. */
-static void tcp_timewait_kill(unsigned long __arg)
-{
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg;
-
- /* Zap the timer. */
- del_timer(&tw->timer);
+extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
+extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
+extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
+void tcp_timewait_kill(struct tcp_tw_bucket *tw)
+{
/* Unlink from various places. */
if(tw->bind_next)
tw->bind_next->bind_pprev = tw->bind_pprev;
@@ -898,7 +913,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* (2) returns to TIME-WAIT state if the SYN turns out
* to be an old duplicate".
*/
- if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) {
+ if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
struct sock *sk;
struct tcp_func *af_specific = tw->af_specific;
__u32 isn;
@@ -906,7 +921,8 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
isn = tw->rcv_nxt + 128000;
if(isn == 0)
isn++;
- tcp_timewait_kill((unsigned long)tw);
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
sk = af_specific->get_sock(skb, th);
if(sk == NULL || !ipsec_sk_policy(sk,skb))
return 0;
@@ -923,16 +939,16 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
- if(sysctl_tcp_rfc1337 == 0)
- tcp_timewait_kill((unsigned long)tw);
-
+ if(sysctl_tcp_rfc1337 == 0) {
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
+ }
if(!th->rst)
return 1; /* toss a reset back */
} else {
- if(th->ack) {
- /* In this case we must reset the TIMEWAIT timer. */
- mod_timer(&tw->timer, jiffies + TCP_TIMEWAIT_LEN);
- }
+ /* In this case we must reset the TIMEWAIT timer. */
+ if(th->ack)
+ tcp_tw_reschedule(tw);
}
return 0; /* Discard the frame. */
}
@@ -1008,11 +1024,7 @@ void tcp_time_wait(struct sock *sk)
tcp_tw_hashdance(sk, tw);
/* Get the TIME_WAIT timeout firing. */
- init_timer(&tw->timer);
- tw->timer.function = tcp_timewait_kill;
- tw->timer.data = (unsigned long) tw;
- tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN;
- add_timer(&tw->timer);
+ tcp_tw_schedule(tw);
/* CLOSE the SK. */
if(sk->state == TCP_ESTABLISHED)
@@ -1051,7 +1063,7 @@ void tcp_time_wait(struct sock *sk)
static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
- sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq;
+ sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
tcp_send_ack(sk);
@@ -1174,14 +1186,14 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
* "in order". ;-) This also satisfies the requirements
* of RFC2018 about ordering of SACKs.
*/
- if(sp->end_seq == skb->seq) {
- sp->end_seq = skb->end_seq;
+ if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
+ sp->end_seq = TCP_SKB_CB(skb)->end_seq;
tcp_sack_maybe_coalesce(tp, sp);
- } else if(sp->start_seq == skb->end_seq) {
+ } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
/* Re-ordered arrival, in this case, can be optimized
* as well.
*/
- sp->start_seq = skb->seq;
+ sp->start_seq = TCP_SKB_CB(skb)->seq;
tcp_sack_maybe_coalesce(tp, sp);
} else {
int cur_sacks = tp->num_sacks;
@@ -1195,12 +1207,12 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
int this_sack;
for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
- if((swap->end_seq == skb->seq) ||
- (swap->start_seq == skb->end_seq)) {
- if(swap->end_seq == skb->seq)
- swap->end_seq = skb->end_seq;
+ if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
+ (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
+ if(swap->end_seq == TCP_SKB_CB(skb)->seq)
+ swap->end_seq = TCP_SKB_CB(skb)->end_seq;
else
- swap->start_seq = skb->seq;
+ swap->start_seq = TCP_SKB_CB(skb)->seq;
tcp_sack_swap(sp, swap);
tcp_sack_maybe_coalesce(tp, sp);
return;
@@ -1221,8 +1233,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
}
/* Build head SACK, and we're done. */
- sp->start_seq = skb->seq;
- sp->end_seq = skb->end_seq;
+ sp->start_seq = TCP_SKB_CB(skb)->seq;
+ sp->end_seq = TCP_SKB_CB(skb)->end_seq;
if(tp->num_sacks < max_sacks)
tp->num_sacks++;
}
@@ -1234,9 +1246,14 @@ static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
int num_sacks = tp->num_sacks;
int this_sack;
- /* We know this removed SKB will eat from the front of a SACK. */
+ /* This is an in order data segment _or_ an out-of-order SKB being
+ * moved to the receive queue, so we know this removed SKB will eat
+ * from the front of a SACK.
+ */
for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
- if(sp->start_seq == skb->seq)
+ /* Check if the start of the sack is covered by skb. */
+ if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
+ before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
break;
}
@@ -1247,7 +1264,7 @@ static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
if(this_sack >= num_sacks)
return;
- sp->start_seq = skb->end_seq;
+ sp->start_seq = TCP_SKB_CB(skb)->end_seq;
if(!before(sp->start_seq, sp->end_seq)) {
/* Zap this SACK, by moving forward any other SACKS. */
for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
@@ -1266,12 +1283,12 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct
int this_sack;
for(this_sack = 0; this_sack < num_sacks; this_sack++, tp++) {
- if(sp->end_seq == old_skb->end_seq)
+ if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
break;
}
if(this_sack >= num_sacks)
return;
- sp->end_seq = new_skb->end_seq;
+ sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
}
/* This one checks to see if we can put data from the
@@ -1283,23 +1300,24 @@ static void tcp_ofo_queue(struct sock *sk)
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
while ((skb = skb_peek(&tp->out_of_order_queue))) {
- if (after(skb->seq, tp->rcv_nxt))
+ if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
- if (!after(skb->end_seq, tp->rcv_nxt)) {
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received \n");
- skb_unlink(skb);
+ __skb_unlink(skb, skb->list);
kfree_skb(skb);
continue;
}
SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, skb->seq, skb->end_seq);
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
if(tp->sack_ok)
tcp_sack_remove_skb(tp, skb);
- skb_unlink(skb);
- skb_queue_tail(&sk->receive_queue, skb);
- tp->rcv_nxt = skb->end_seq;
+ __skb_unlink(skb, skb->list);
+ __skb_queue_tail(&sk->receive_queue, skb);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if(skb->h.th->fin)
tcp_fin(skb, sk, skb->h.th);
}
@@ -1314,12 +1332,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
* Packets in sequence go to the receive queue.
* Out of sequence packets to out_of_order_queue.
*/
- if (skb->seq == tp->rcv_nxt) {
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
/* Ok. In sequence. */
queue_and_out:
dst_confirm(sk->dst_cache);
- skb_queue_tail(&sk->receive_queue, skb);
- tp->rcv_nxt = skb->end_seq;
+ __skb_queue_tail(&sk->receive_queue, skb);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if(skb->h.th->fin) {
tcp_fin(skb, sk, skb->h.th);
} else {
@@ -1341,18 +1359,19 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
}
/* An old packet, either a retransmit or some packet got lost. */
- if (!after(skb->end_seq, tp->rcv_nxt)) {
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an imediate ack. */
- SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq);
+ SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
tcp_enter_quickack_mode(tp);
kfree_skb(skb);
return;
}
- if (before(skb->seq, tp->rcv_nxt)) {
+ if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, skb->seq, skb->end_seq);
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
goto queue_and_out;
}
@@ -1365,25 +1384,25 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, skb->seq, skb->end_seq);
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
if (skb_peek(&tp->out_of_order_queue) == NULL) {
/* Initial out of order segment, build 1 SACK. */
if(tp->sack_ok) {
tp->num_sacks = 1;
- tp->selective_acks[0].start_seq = skb->seq;
- tp->selective_acks[0].end_seq = skb->end_seq;
+ tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+ tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
}
- skb_queue_head(&tp->out_of_order_queue,skb);
+ __skb_queue_head(&tp->out_of_order_queue,skb);
} else {
for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
/* Already there. */
- if (skb->seq == skb1->seq) {
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
if (skb->len >= skb1->len) {
if(tp->sack_ok)
tcp_sack_extend(tp, skb1, skb);
- skb_append(skb1, skb);
- skb_unlink(skb1);
+ __skb_append(skb1, skb);
+ __skb_unlink(skb1, skb1->list);
kfree_skb(skb1);
} else {
/* A duplicate, smaller than what is in the
@@ -1394,8 +1413,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
break;
}
- if (after(skb->seq, skb1->seq)) {
- skb_append(skb1,skb);
+ if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
+ __skb_append(skb1, skb);
if(tp->sack_ok)
tcp_sack_new_ofo_skb(sk, skb);
break;
@@ -1403,7 +1422,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
/* See if we've hit the start. If so insert. */
if (skb1 == skb_peek(&tp->out_of_order_queue)) {
- skb_queue_head(&tp->out_of_order_queue,skb);
+ __skb_queue_head(&tp->out_of_order_queue,skb);
if(tp->sack_ok)
tcp_sack_new_ofo_skb(sk, skb);
break;
@@ -1431,6 +1450,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
if (skb->len == 0 && !th->fin)
return(0);
+ /*
+ * If our receive queue has grown past its limits shrink it.
+ * Make sure to do this before moving snd_nxt, otherwise
+ * data might be acked for that we don't have enough room.
+	 * data might be acked that we don't have enough room for.
+ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
+ if (prune_queue(sk) < 0) {
+ /* Still not enough room. That can happen when
+ * skb->true_size differs significantly from skb->len.
+ */
+ return 0;
+ }
+ }
+
/* We no longer have anyone receiving data on this connection. */
tcp_data_queue(sk, skb);
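The check inserted at the top of tcp_data() enforces the changelog rule that we never ACK data we cannot store: receive memory is reclaimed before the segment is queued and rcv_nxt advanced, and if pruning still cannot make room the segment is simply dropped. A trivial sketch of that ordering, with all names invented for illustration:

#include <stdio.h>

#define RCVBUF 65536

static int rmem_alloc = 70000;           /* pretend bytes currently queued */

static int prune_queue(void)
{
        rmem_alloc -= 16384;             /* pretend one old segment was freed */
        return (rmem_alloc <= RCVBUF) ? 0 : -1;
}

static int tcp_data_sketch(int seg_len)
{
        /* Reclaim memory BEFORE queueing and advancing rcv_nxt, so we never
         * acknowledge data we could not actually keep.
         */
        if (rmem_alloc > RCVBUF && prune_queue() < 0)
                return 0;                /* drop: still no room */

        rmem_alloc += seg_len;           /* queue the segment; only now may it be ACKed */
        return 1;
}

int main(void)
{
        printf("queued=%d rmem_alloc=%d\n", tcp_data_sketch(1460), rmem_alloc);
        return 0;
}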
@@ -1455,8 +1488,8 @@ static void tcp_data_snd_check(struct sock *sk)
struct sk_buff *skb;
if ((skb = tp->send_head)) {
- if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
- tp->packets_out < tp->snd_cwnd ) {
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
+ tcp_packets_in_flight(tp) < (tp->snd_cwnd >> TCP_CWND_SHIFT)) {
/* Put more data onto the wire. */
tcp_write_xmit(sk);
} else if (tp->packets_out == 0 && !tp->pending) {
@@ -1488,7 +1521,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk)
*/
/* Two full frames received or... */
- if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) ||
+ if (((tp->rcv_nxt - tp->rcv_wup) >= sk->mss * MAX_DELAY_ACK) ||
/* We will update the window "significantly" or... */
tcp_raise_window(sk) ||
/* We entered "quick ACK" mode or... */
@@ -1590,7 +1623,7 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len
* Clean first the out_of_order queue, then the receive queue until
* the socket is in its memory limits again.
*/
-static void prune_queue(struct sock *sk)
+static int prune_queue(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct sk_buff * skb;
@@ -1601,10 +1634,10 @@ static void prune_queue(struct sock *sk)
/* Start with the end because there are probably the least
* useful packets (crossing fingers).
*/
- while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) {
+ while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) {
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
- return;
+ return 0;
}
/* Now continue with the receive queue if it wasn't enough */
@@ -1616,19 +1649,22 @@ static void prune_queue(struct sock *sk)
break;
/* Never remove packets that have been already acked */
- if (before(skb->end_seq, tp->last_ack_sent+1)) {
- printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n",
- tp->copied_seq, skb->end_seq, tp->last_ack_sent);
- break;
+ if (before(TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent+1)) {
+ SOCK_DEBUG(sk, "prune_queue: hit acked data c=%x,%x,%x\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq,
+ tp->last_ack_sent);
+ return -1;
}
- skb_unlink(skb);
- tp->rcv_nxt = skb->seq;
+ __skb_unlink(skb, skb->list);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq;
SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
- skb->seq, skb->end_seq, tp->copied_seq);
+ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ tp->copied_seq);
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
break;
}
+ return 0;
}
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
@@ -1658,13 +1694,13 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
if (tcp_fast_parse_options(sk, th, tp)) {
if (tp->saw_tstamp) {
- if (tcp_paws_discard(tp)) {
+ if (tcp_paws_discard(tp, th, len)) {
if (!th->rst) {
tcp_send_ack(sk);
goto discard;
}
}
- tcp_replace_ts_recent(tp,skb->end_seq);
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->end_seq);
}
}
@@ -1678,11 +1714,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
* space for instance)
*/
- if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) {
+ if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (len <= th->doff*4) {
/* Bulk data transfer: sender */
if (len == th->doff*4) {
- tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
+ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len);
kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
@@ -1690,7 +1727,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_statistics.TcpInErrs++;
goto discard;
}
- } else if (skb->ack_seq == tp->snd_una) {
+ } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) {
/* Bulk data transfer: receiver */
if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
goto discard;
@@ -1700,8 +1737,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* DO NOT notify forward progress here.
* It saves dozen of CPU instructions in fast path. --ANK
*/
- skb_queue_tail(&sk->receive_queue, skb);
- tp->rcv_nxt = skb->end_seq;
+ __skb_queue_tail(&sk->receive_queue, skb);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
/* FIN bit check is not done since if FIN is set in
* this frame, the pred_flags won't match up. -DaveM
@@ -1719,11 +1756,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
}
- if (!tcp_sequence(tp, skb->seq, skb->end_seq)) {
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
if (!th->rst) {
- if (after(skb->seq, tp->rcv_nxt)) {
+ if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
- skb->seq, skb->end_seq,
+ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tp->rcv_wup, tp->rcv_wnd);
}
tcp_send_ack(sk);
@@ -1731,7 +1768,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
}
- if(th->syn && skb->seq != tp->syn_seq) {
+ if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
SOCK_DEBUG(sk, "syn in established state\n");
tcp_statistics.TcpInErrs++;
tcp_reset(sk, skb);
@@ -1744,7 +1781,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
if(th->ack)
- tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
+ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
/* Process urgent data. */
tcp_urg(sk, th, len);
@@ -1752,13 +1789,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* step 7: process the segment text */
queued = tcp_data(skb, sk, len);
- tcp_data_snd_check(sk);
-
- /* If our receive queue has grown past its limits shrink it */
- if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
- prune_queue(sk);
-
- tcp_ack_snd_check(sk);
+ /* Be careful, tcp_data() may have put this into TIME_WAIT. */
+ if(sk->state != TCP_CLOSE) {
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ }
if (!queued) {
discard:
@@ -1768,42 +1803,44 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
return 0;
}
-/* Shared between IPv4 and IPv6 now. */
-struct sock *
-tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req)
+/*
+ * Process an incoming SYN or SYN-ACK.
+ */
+
+struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct open_request *req)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ u32 flg;
/* assumption: the socket is not in use.
* as we checked the user count on tcp_rcv and we're
* running from a soft interrupt.
*/
+ /* Check for syn retransmission */
+ flg = *(((u32 *)skb->h.th) + 3);
+
+ flg &= __constant_htonl(0x00170000);
+ /* Only SYN set? */
+ if (flg == __constant_htonl(0x00020000)) {
+ if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
+		/* retransmitted syn.
+ */
+ req->class->rtx_syn_ack(sk, req);
+ return NULL;
+ } else {
+ return sk; /* Pass new SYN to the listen socket. */
+ }
+ }
+
+ /* We know it's an ACK here */
if (req->sk) {
/* socket already created but not
* yet accepted()...
*/
sk = req->sk;
} else {
- u32 flg;
-
- /* Check for syn retransmission */
- flg = *(((u32 *)skb->h.th) + 3);
-
- flg &= __constant_htonl(0x00170000);
- /* Only SYN set? */
- if (flg == __constant_htonl(0x00020000)) {
- if (!after(skb->seq, req->rcv_isn)) {
- /* retransmited syn.
- */
- req->class->rtx_syn_ack(sk, req);
- return NULL;
- } else {
- return sk; /* New SYN */
- }
- }
-
- /* We know it's an ACK here */
/* In theory the packet could be for a cookie, but
* TIME_WAIT should guard us against this.
* XXX: Nevertheless check for cookies?
@@ -1811,8 +1848,8 @@ tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req)
* but we do it here to prevent syn flood attackers
* from creating big SYN_RECV sockets.
*/
- if (!between(skb->ack_seq, req->snt_isn, req->snt_isn+1) ||
- !between(skb->seq, req->rcv_isn,
+ if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
+ !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
req->rcv_isn+1+req->rcv_wnd)) {
req->class->send_reset(skb);
return NULL;
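tcp_check_req() now peeks at the SYN bit before anything else: the fourth 32-bit word of the TCP header holds the data offset, flags and window, so masking it with htonl(0x00170000) isolates the ACK, RST, SYN and FIN bits in network byte order, and a result of htonl(0x00020000) means SYN alone. A small host-side demonstration of that trick, using fabricated header bytes and standard socket headers:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
        /* Bytes 12..15 of a TCP header: doff/reserved, flags, window (2 bytes). */
        unsigned char hdr[20] = {0};
        uint32_t word, flg;

        hdr[12] = 5 << 4;                        /* data offset = 5 words */
        hdr[13] = 0x02;                          /* flags: SYN only */
        hdr[14] = 0x7f; hdr[15] = 0xff;          /* advertised window */

        memcpy(&word, hdr + 12, sizeof(word));   /* same raw read as ((u32 *)th) + 3 */
        flg = word & htonl(0x00170000);          /* keep the ACK|RST|SYN|FIN bits */

        if (flg == htonl(0x00020000))
                printf("pure SYN: treat as a new or retransmitted SYN\n");
        else
                printf("not a pure SYN: fall through to the ACK path\n");
        return 0;
}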
@@ -1885,10 +1922,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* not be in line code. [AC]
*/
if(th->ack) {
- tp->snd_wl1 = skb->seq;
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
/* We got an ack, but it's not a good ack. */
- if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) {
+ if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len)) {
+ sk->err = ECONNRESET;
+ sk->state_change(sk);
tcp_statistics.TcpAttemptFails++;
return 1;
}
@@ -1902,6 +1942,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* A valid ack from a different connection
* start. Shouldn't happen but cover it.
*/
+ sk->err = ECONNRESET;
+ sk->state_change(sk);
tcp_statistics.TcpAttemptFails++;
return 1;
}
@@ -1909,13 +1951,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
- tp->rcv_nxt = skb->seq+1;
- tp->rcv_wup = skb->seq+1;
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
- tp->snd_wnd = htons(th->window) << tp->snd_wscale;
- tp->snd_wl1 = skb->seq;
- tp->snd_wl2 = skb->ack_seq;
- tp->fin_seq = skb->seq;
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
+ tp->fin_seq = TCP_SKB_CB(skb)->seq;
tcp_set_state(sk, TCP_ESTABLISHED);
tcp_parse_options(sk, th, tp, 0);
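Both SYN-handling paths are corrected to honour the RFC 1323 rule that the window field carried in SYN and SYN-ACK segments is never scaled; scaling applies only after the handshake, once both sides have agreed on shift values. A one-line illustration of the distinction, with assumed variable names:

#include <stdio.h>

int main(void)
{
        unsigned short raw_window = 8192;   /* value carried in the TCP header */
        unsigned int snd_wscale = 3;        /* shift negotiated via the wscale option */
        int is_syn = 1;                     /* is this segment a SYN or SYN-ACK? */

        /* RFC 1323: SYN segment windows are taken verbatim; later segments
         * are shifted by the negotiated scale.
         */
        unsigned int snd_wnd = is_syn ? raw_window
                                      : ((unsigned int)raw_window << snd_wscale);

        printf("snd_wnd = %u\n", snd_wnd);
        return 0;
}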
@@ -1924,6 +1969,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_wscale = tp->rcv_wscale = 0;
tp->window_clamp = min(tp->window_clamp,65535);
}
+
if (tp->tstamp_ok) {
tp->tcp_header_len =
sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
@@ -1983,11 +2029,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->ts_recent_stamp = jiffies;
}
- tp->rcv_nxt = skb->seq + 1;
- tp->rcv_wup = skb->seq + 1;
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
tp->snd_wnd = htons(th->window);
- tp->snd_wl1 = skb->seq;
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tcp_send_synack(sk);
goto discard;
@@ -2008,18 +2057,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* guarantee this.
*/
if (tp->saw_tstamp) {
- if (tcp_paws_discard(tp)) {
+ if (tcp_paws_discard(tp, th, len)) {
if (!th->rst) {
tcp_send_ack(sk);
goto discard;
}
}
- tcp_replace_ts_recent(tp,skb->end_seq);
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->end_seq);
}
}
/* step 1: check sequence number */
- if (!tcp_sequence(tp, skb->seq, skb->end_seq)) {
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
if (!th->rst) {
tcp_send_ack(sk);
goto discard;
@@ -2050,14 +2099,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* original syn.
*/
- if (th->syn && skb->seq!=tp->syn_seq) {
+ if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
tcp_reset(sk, skb);
return 1;
}
/* step 5: check the ACK field */
if (th->ack) {
- int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len);
+ int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len);
switch(sk->state) {
case TCP_SYN_RECV:
@@ -2069,10 +2119,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if(!sk->dead)
sk->state_change(sk);
- tp->snd_una = skb->ack_seq;
+ tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = htons(th->window) << tp->snd_wscale;
- tp->snd_wl1 = skb->seq;
- tp->snd_wl2 = skb->ack_seq;
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
} else {
SOCK_DEBUG(sk, "bad ack\n");
@@ -2092,8 +2142,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
break;
case TCP_CLOSING:
- if (tp->snd_una == tp->write_seq)
+ if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk);
+ goto discard;
+ }
break;
case TCP_LAST_ACK:
@@ -2117,7 +2169,7 @@ step6:
switch (sk->state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
- if (!before(skb->seq, tp->fin_seq))
+ if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
break;
case TCP_FIN_WAIT1:
@@ -2127,7 +2179,7 @@ step6:
* BSD 4.4 also does reset.
*/
if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
- if (after(skb->end_seq - th->fin, tp->rcv_nxt)) {
+ if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
tcp_reset(sk, skb);
return 1;
}
@@ -2135,10 +2187,6 @@ step6:
case TCP_ESTABLISHED:
queued = tcp_data(skb, sk, len);
-
- /* This can only happen when MTU+skbheader > rcvbuf */
- if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
- prune_queue(sk);
break;
}
@@ -2151,26 +2199,3 @@ discard:
}
return 0;
}
-
-int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp,
- void *buffer, size_t *lenp)
-{
- int val = sysctl_tcp_cong_avoidance;
- int retv;
- static tcp_sys_cong_ctl_t tab[] = {
- tcp_cong_avoid_vanj,
- tcp_cong_avoid_vegas
- };
-
- retv = proc_dointvec(ctl, write, filp, buffer, lenp);
-
- if (write) {
- if ((unsigned)sysctl_tcp_cong_avoidance > 1) {
- retv = -EINVAL;
- sysctl_tcp_cong_avoidance = val;
- } else {
- tcp_sys_cong_ctl_f = tab[sysctl_tcp_cong_avoidance];
- }
- }
- return retv;
-}