path: root/include/net/tcp.h
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r--  include/net/tcp.h  607
1 files changed, 376 insertions, 231 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7df845895..d3a63962c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -19,9 +19,13 @@
#define _TCP_H
#define TCP_DEBUG 1
+#define FASTRETRANS_DEBUG 1
+
+/* Be paranoid about data immediately beyond right edge of window. */
#undef TCP_FORMAL_WINDOW
-#define TCP_MORE_COARSE_ACKS
-#undef TCP_LESS_COARSE_ACKS
+
+/* Cancel timers when they are not required. */
+#undef TCP_CLEAR_TIMERS
#include <linux/config.h>
#include <linux/tcp.h>
@@ -173,7 +177,7 @@ extern __inline__ void tcp_tw_put(struct tcp_tw_bucket *tw)
}
extern atomic_t tcp_orphan_count;
-extern int tcp_tw_count;
+extern int tcp_tw_count;
extern void tcp_time_wait(struct sock *sk, int state, int timeo);
extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
extern void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
@@ -242,12 +246,14 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
/* Minimal RCV_MSS. */
#define TCP_MIN_RCVMSS 536
-/*
- * How much of the receive buffer do we advertize
- * (the rest is reserved for headers and driver packet overhead)
- * Use a power of 2.
- */
-#define TCP_WINDOW_ADVERTISE_DIVISOR 2
+/* After receiving this number of duplicate ACKs, fast retransmit starts. */
+#define TCP_FASTRETRANS_THRESH 3
+
+/* Maximal reordering. */
+#define TCP_MAX_REORDERING 127
+
+/* Maximal number of ACKs sent quickly to accelerate slow-start. */
+#define TCP_MAX_QUICKACKS 16
/* urg_data states */
#define TCP_URG_VALID 0x0100
@@ -292,7 +298,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
#define TCP_DELACK_MAX (HZ/5) /* maximal time to delay before sending an ACK */
#define TCP_DELACK_MIN (2) /* minimal time to delay before sending an ACK,
* 2 scheduler ticks, not depending on HZ. */
-#define TCP_ATO_MAX (HZ/2) /* Clamp ATO estimator at his value. */
#define TCP_ATO_MIN 2
#define TCP_RTO_MAX (120*HZ)
#define TCP_RTO_MIN (HZ/5)
@@ -414,6 +419,19 @@ extern int sysctl_tcp_tw_recycle;
extern int sysctl_tcp_abort_on_overflow;
extern int sysctl_tcp_max_orphans;
extern int sysctl_tcp_max_tw_buckets;
+extern int sysctl_tcp_fack;
+extern int sysctl_tcp_reordering;
+extern int sysctl_tcp_ecn;
+extern int sysctl_tcp_dsack;
+extern int sysctl_tcp_mem[3];
+extern int sysctl_tcp_wmem[3];
+extern int sysctl_tcp_rmem[3];
+extern int sysctl_tcp_app_win;
+extern int sysctl_tcp_adv_win_scale;
+
+extern atomic_t tcp_memory_allocated;
+extern atomic_t tcp_sockets_allocated;
+extern int tcp_memory_pressure;
struct open_request;
@@ -606,6 +624,23 @@ extern int tcp_rcv_established(struct sock *sk,
struct tcphdr *th,
unsigned len);
+enum tcp_ack_state_t
+{
+ TCP_ACK_SCHED = 1,
+ TCP_ACK_TIMER = 2,
+ TCP_ACK_PUSHED= 4
+};
+
+static inline void tcp_schedule_ack(struct tcp_opt *tp)
+{
+ tp->ack.pending |= TCP_ACK_SCHED;
+}
+
+static inline int tcp_ack_scheduled(struct tcp_opt *tp)
+{
+ return tp->ack.pending&TCP_ACK_SCHED;
+}
+
static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
{
if (tp->ack.quick && --tp->ack.quick == 0) {
@@ -614,11 +649,27 @@ static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
}
}
+extern void tcp_enter_quickack_mode(struct tcp_opt *tp);
+
static __inline__ void tcp_delack_init(struct tcp_opt *tp)
{
memset(&tp->ack, 0, sizeof(tp->ack));
}
+enum tcp_ca_state
+{
+ TCP_CA_Open = 0,
+#define TCPF_CA_Open (1<<TCP_CA_Open)
+ TCP_CA_Disorder = 1,
+#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
+ TCP_CA_CWR = 2,
+#define TCPF_CA_CWR (1<<TCP_CA_CWR)
+ TCP_CA_Recovery = 3,
+#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
+ TCP_CA_Loss = 4
+#define TCPF_CA_Loss (1<<TCP_CA_Loss)
+};
+
enum tcp_tw_status
{
@@ -640,6 +691,9 @@ extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
extern int tcp_child_process(struct sock *parent,
struct sock *child,
struct sk_buff *skb);
+extern void tcp_enter_loss(struct sock *sk, int how);
+extern void tcp_clear_retrans(struct tcp_opt *tp);
+extern void tcp_update_metrics(struct sock *sk);
extern void tcp_close(struct sock *sk,
long timeout);
@@ -661,8 +715,8 @@ extern int tcp_recvmsg(struct sock *sk,
extern int tcp_listen_start(struct sock *sk);
-extern void tcp_parse_options(struct sock *sk, struct tcphdr *th,
- struct tcp_opt *tp, int no_fancy);
+extern void tcp_parse_options(struct sk_buff *skb,
+ struct tcp_opt *tp);
/*
* TCP v4 functions exported for the inet6 API
@@ -720,7 +774,6 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
extern int tcp_write_xmit(struct sock *);
extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
-extern void tcp_fack_retransmit(struct sock *);
extern void tcp_xmit_retransmit_queue(struct sock *);
extern void tcp_simple_retransmit(struct sock *);
@@ -736,7 +789,6 @@ extern void tcp_send_ack(struct sock *sk);
extern void tcp_send_delayed_ack(struct sock *sk);
/* tcp_timer.c */
-extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long);
extern void tcp_init_xmit_timers(struct sock *);
extern void tcp_clear_xmit_timers(struct sock *);
@@ -744,6 +796,76 @@ extern void tcp_delete_keepalive_timer (struct sock *);
extern void tcp_reset_keepalive_timer (struct sock *, unsigned long);
extern int tcp_sync_mss(struct sock *sk, u32 pmtu);
+extern const char timer_bug_msg[];
+
+
+static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ switch (what) {
+ case TCP_TIME_RETRANS:
+ case TCP_TIME_PROBE0:
+ tp->pending = 0;
+
+#ifdef TCP_CLEAR_TIMERS
+ if (timer_pending(&tp->retransmit_timer) &&
+ del_timer(&tp->retransmit_timer))
+ __sock_put(sk);
+#endif
+ break;
+ case TCP_TIME_DACK:
+ tp->ack.blocked = 0;
+ tp->ack.pending = 0;
+
+#ifdef TCP_CLEAR_TIMERS
+ if (timer_pending(&tp->delack_timer) &&
+ del_timer(&tp->delack_timer))
+ __sock_put(sk);
+#endif
+ break;
+ default:
+ printk(timer_bug_msg);
+ return;
+ };
+
+}
+
+/*
+ * Reset the retransmission timer
+ */
+static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ if (when > TCP_RTO_MAX) {
+#ifdef TCP_DEBUG
+ printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr());
+#endif
+ when = TCP_RTO_MAX;
+ }
+
+ switch (what) {
+ case TCP_TIME_RETRANS:
+ case TCP_TIME_PROBE0:
+ tp->pending = what;
+ tp->timeout = jiffies+when;
+ if (!mod_timer(&tp->retransmit_timer, tp->timeout))
+ sock_hold(sk);
+ break;
+
+ case TCP_TIME_DACK:
+ tp->ack.pending |= TCP_ACK_TIMER;
+ tp->ack.timeout = jiffies+when;
+ if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
+ sock_hold(sk);
+ break;
+
+ default:
+ printk(KERN_DEBUG "bug: unknown timer value\n");
+ };
+}
+
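The pairing above relies on mod_timer()'s return value: it returns nonzero when the timer was already pending, so sock_hold() is taken only on the inactive-to-armed transition and tcp_clear_xmit_timer() drops that reference only when it actually cancels a pending timer. A minimal stand-alone sketch of that invariant, not part of the patch; fake_sock, fake_timer and arm_timer are hypothetical user-space stand-ins:

#include <assert.h>
#include <stdio.h>

struct fake_sock  { int refcnt; };
struct fake_timer { int pending; unsigned long expires; };

/* Like mod_timer(): returns nonzero if the timer was already pending. */
static int fake_mod_timer(struct fake_timer *t, unsigned long expires)
{
        int was_pending = t->pending;
        t->pending = 1;
        t->expires = expires;
        return was_pending;
}

static void arm_timer(struct fake_sock *sk, struct fake_timer *t, unsigned long when)
{
        if (!fake_mod_timer(t, when))
                sk->refcnt++;           /* plays the role of sock_hold(sk) */
}

int main(void)
{
        struct fake_sock  sk = { 1 };
        struct fake_timer retrans = { 0, 0 };

        arm_timer(&sk, &retrans, 100);  /* first arm: takes a reference    */
        arm_timer(&sk, &retrans, 200);  /* re-arm: no additional reference */
        assert(sk.refcnt == 2);
        printf("refcnt=%d expires=%lu\n", sk.refcnt, retrans.expires);
        return 0;
}
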
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*/
@@ -757,9 +879,9 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
if (dst && dst->pmtu != tp->pmtu_cookie)
mss_now = tcp_sync_mss(sk, dst->pmtu);
- if(tp->sack_ok && tp->num_sacks)
+ if (tp->eff_sacks)
mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
+ (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
return mss_now;
}
@@ -774,15 +896,8 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
extern __inline__ void tcp_initialize_rcv_mss(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct dst_entry *dst = __sk_dst_get(sk);
- int mss;
- if (dst)
- mss = dst->advmss;
- else
- mss = tp->mss_cache;
-
- tp->ack.rcv_mss = max(min(mss, TCP_MIN_RCVMSS), TCP_MIN_MSS);
+ tp->ack.rcv_mss = max(min(tp->advmss, TCP_MIN_RCVMSS), TCP_MIN_MSS);
}
static __inline__ void __tcp_fast_path_on(struct tcp_opt *tp, u32 snd_wnd)
@@ -797,9 +912,6 @@ static __inline__ void tcp_fast_path_on(struct tcp_opt *tp)
__tcp_fast_path_on(tp, tp->snd_wnd>>tp->snd_wscale);
}
-
-
-
/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
@@ -819,52 +931,6 @@ static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
*/
extern u32 __tcp_select_window(struct sock *sk);
-/* Chose a new window to advertise, update state in tcp_opt for the
- * socket, and return result with RFC1323 scaling applied. The return
- * value can be stuffed directly into th->window for an outgoing
- * frame.
- */
-extern __inline__ u16 tcp_select_window(struct sock *sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- u32 cur_win = tcp_receive_window(tp);
- u32 new_win = __tcp_select_window(sk);
-
- /* Never shrink the offered window */
- if(new_win < cur_win) {
- /* Danger Will Robinson!
- * Don't update rcv_wup/rcv_wnd here or else
- * we will not be able to advertise a zero
- * window in time. --DaveM
- *
- * Relax Will Robinson.
- */
- new_win = cur_win;
- }
- tp->rcv_wnd = new_win;
- tp->rcv_wup = tp->rcv_nxt;
-
- /* RFC1323 scaling applied */
- new_win >>= tp->rcv_wscale;
-
-#ifdef TCP_FORMAL_WINDOW
- if (new_win == 0) {
- /* If we advertise zero window, disable fast path. */
- tp->pred_flags = 0;
- } else if (cur_win == 0 && tp->pred_flags == 0 &&
- skb_queue_len(&tp->out_of_order_queue) == 0 &&
- !tp->urg_data) {
- /* If we open zero window, enable fast path.
- Without this it will be open by the first data packet,
- it is too late to merge checksumming to copy.
- */
- tcp_fast_path_on(tp);
- }
-#endif
-
- return new_win;
-}
-
/* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot
* of jiffies in the buffer control blocks below. We decidely
@@ -907,6 +973,12 @@ struct tcp_skb_cb {
__u8 sacked; /* State flags for SACK/FACK. */
#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
+#define TCPCB_LOST 0x04 /* SKB is lost */
+#define TCPCB_TAGBITS 0x07 /* All tag bits */
+
+#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
+#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
+
__u16 urg_ptr; /* Valid w/URG flags is set. */
__u32 ack_seq; /* Sequence number ACK'd */
@@ -914,11 +986,28 @@ struct tcp_skb_cb {
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
+#define for_retrans_queue(skb, sk, tp) \
+ for (skb = (sk)->write_queue.next; \
+ (skb != (tp)->send_head) && \
+ (skb != (struct sk_buff *)&(sk)->write_queue); \
+ skb=skb->next)
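The new for_retrans_queue() macro walks every skb that has been queued for transmission at least once, i.e. from the head of sk->write_queue up to (but not including) tp->send_head. An illustrative fragment of how a caller might combine it with the tag bits introduced below; this is a sketch against the kernel's own types, not a self-contained unit:

        struct sk_buff *skb;
        int lost = 0;

        for_retrans_queue(skb, sk, tp) {
                /* Count frames currently marked lost on the scoreboard. */
                if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
                        lost++;
        }
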
+
+
+#include <net/tcp_ecn.h>
+
+
/*
* Compute minimal free write space needed to queue new packets.
*/
-#define tcp_min_write_space(__sk) \
- (atomic_read(&(__sk)->wmem_alloc) / 2)
+static inline int tcp_min_write_space(struct sock *sk)
+{
+ return sk->wmem_queued/2;
+}
+
+static inline int tcp_wspace(struct sock *sk)
+{
+ return sk->sndbuf - sk->wmem_queued;
+}
/* This determines how many packets are "in the network" to the best
@@ -932,89 +1021,97 @@ struct tcp_skb_cb {
* Read this equation as:
*
* "Packets sent once on transmission queue" MINUS
- * "Packets acknowledged by FACK information" PLUS
+ * "Packets left network, but not honestly ACKed yet" PLUS
* "Packets fast retransmitted"
*/
static __inline__ int tcp_packets_in_flight(struct tcp_opt *tp)
{
- return tp->packets_out - tp->fackets_out + tp->retrans_out;
+ return tp->packets_out - tp->left_out + tp->retrans_out;
}
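A worked example of the new formula, with hypothetical numbers; the reading of left_out as "SACKed plus marked-lost segments" is an inference from this patch, not stated in it. With 10 packets sent, 3 SACKed, 2 marked lost and 1 retransmitted, 10 - (3 + 2) + 1 = 6 segments are still counted as in flight. As stand-alone C:

#include <stdio.h>

int main(void)
{
        /* Hypothetical numbers, not taken from the patch. */
        int packets_out = 10;        /* sent at least once                    */
        int left_out    = 3 + 2;     /* assumed: SACKed (3) + marked lost (2) */
        int retrans_out = 1;         /* retransmitted                         */

        printf("in_flight = %d\n", packets_out - left_out + retrans_out); /* 6 */
        return 0;
}
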
/* Recalculate snd_ssthresh, we want to set it to:
*
* one half the current congestion window, but no
* less than two segments
- *
- * We must take into account the current send window
- * as well, however we keep track of that using different
- * units so a conversion is necessary. -DaveM
- *
- * RED-PEN.
- * RFC 2581: "an easy mistake to make is to simply use cwnd,
- * rather than FlightSize"
- * I see no references to FlightSize here. snd_wnd is not FlightSize,
- * it is also apriory characteristics.
- *
- * FlightSize = min((snd_nxt-snd_una)/mss, packets_out) ?
*/
extern __inline__ __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
{
- u32 FlightSize = (tp->snd_nxt - tp->snd_una)/tp->mss_cache;
+ return max(tp->snd_cwnd>>1, 2);
+}
+
+/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
+ * The exception is rate halving phase, when cwnd is decreasing towards
+ * ssthresh.
+ */
+extern __inline__ __u32 tcp_current_ssthresh(struct tcp_opt *tp)
+{
+ if ((1<<tp->ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery))
+ return tp->snd_ssthresh;
+ else
+ return max(tp->snd_ssthresh, (tp->snd_cwnd>>1)+(tp->snd_cwnd>>2));
+}
+
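Outside CWR and Recovery, the helper above returns max(snd_ssthresh, 3/4 * snd_cwnd), i.e. it may raise ssthresh half-way from cwnd/2 toward cwnd. A small stand-alone sketch of that arithmetic; current_ssthresh and its inputs are hypothetical stand-ins, not kernel code:

#include <stdio.h>

/* Mirrors tcp_current_ssthresh()'s arithmetic with the state passed in. */
static unsigned int current_ssthresh(unsigned int ssthresh, unsigned int cwnd,
                                     int in_cwr_or_recovery)
{
        unsigned int three_quarters = (cwnd >> 1) + (cwnd >> 2);

        if (in_cwr_or_recovery)
                return ssthresh;
        return ssthresh > three_quarters ? ssthresh : three_quarters;
}

int main(void)
{
        printf("%u\n", current_ssthresh(8, 20, 0)); /* 15: raised toward cwnd      */
        printf("%u\n", current_ssthresh(8, 20, 1)); /* 8: held during CWR/Recovery */
        return 0;
}
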
+extern void tcp_cwnd_application_limited(struct sock *sk);
- FlightSize = min(FlightSize, tcp_packets_in_flight(tp));
+/* Congestion window validation. (RFC2861) */
- return max(min(FlightSize, tp->snd_cwnd) >> 1, 2);
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp)
+{
+ if (tp->packets_out >= tp->snd_cwnd) {
+		/* Network is fully fed. */
+ tp->snd_cwnd_used = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else {
+ /* Network starves. */
+ if (tp->packets_out > tp->snd_cwnd_used)
+ tp->snd_cwnd_used = tp->packets_out;
+
+ if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+ tcp_cwnd_application_limited(sk);
+ }
}
/* Set slow start threshold and cwnd without falling back into slow start */
-extern __inline__ void __tcp_enter_cong_avoid(struct tcp_opt *tp)
+extern __inline__ void __tcp_enter_cwr(struct tcp_opt *tp)
{
+ tp->undo_marker = 0;
tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
tp->snd_cwnd_cnt = 0;
tp->high_seq = tp->snd_nxt;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ TCP_ECN_queue_cwr(tp);
}
-extern __inline__ void tcp_enter_cong_avoid(struct tcp_opt *tp)
+extern __inline__ void tcp_enter_cwr(struct tcp_opt *tp)
{
- if (!tp->high_seq || after(tp->snd_nxt, tp->high_seq))
- __tcp_enter_cong_avoid(tp);
+ tp->prior_ssthresh = 0;
+ if (tp->ca_state < TCP_CA_CWR) {
+ __tcp_enter_cwr(tp);
+ tp->ca_state = TCP_CA_CWR;
+ }
}
+extern __u32 tcp_init_cwnd(struct tcp_opt *tp);
-/* Increase initial CWND conservatively, i.e. only if estimated
- RTT is low enough. It is not quite correct, we should use
- POWER i.e. RTT*BANDWIDTH, but we still cannot estimate this.
-
- Numbers are taken from RFC1414.
+/* Slow start with delayed ACKs produces a burst of 3 packets, so it
+ * is safe "de facto".
*/
-static __inline__ __u32 tcp_init_cwnd(struct tcp_opt *tp)
+static __inline__ __u32 tcp_max_burst(struct tcp_opt *tp)
{
- __u32 cwnd;
-
- if (!tp->srtt || tp->srtt > ((HZ/50)<<3) || tp->mss_cache > 1460)
- cwnd = 2;
- else if (tp->mss_cache > 1095)
- cwnd = 3;
- else
- cwnd = 4;
-
- return min(cwnd, tp->snd_cwnd_clamp);
+ return 3;
}
-
static __inline__ int tcp_minshall_check(struct tcp_opt *tp)
{
return after(tp->snd_sml,tp->snd_una) &&
!after(tp->snd_sml, tp->snd_nxt);
}
-static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, int len)
+static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, struct sk_buff *skb)
{
- if (len < mss)
- tp->snd_sml = tp->snd_nxt;
+ if (skb->len < mss)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}
/* Return 0 if the packet can be sent now without violating Nagle's rules:
@@ -1041,17 +1138,6 @@ static __inline__ int tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, u
static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
unsigned cur_mss, int tail)
{
- /*
- * Reset CWND after idle period longer RTO to "restart window".
- * It is "side" effect of the function, which is _not_ good
- * from viewpoint of clarity. But we have to make it before
- * checking congestion window below. Alternative is to prepend
- * all the calls with this test.
- */
- if (tp->packets_out==0 &&
- (s32)(tcp_time_stamp - tp->lsndtime) > tp->rto)
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_init_cwnd(tp));
-
/* RFC 1122 - section 4.2.3.4
*
* We must queue if
@@ -1062,8 +1148,7 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
* (part of SWS is done on packetization)
* Minshall version sounds: there are no _small_
* segments in flight. (tcp_nagle_check)
- * c) We are retransmiting [Nagle]
- * d) We have too many packets 'in flight'
+ * c) We have too many packets 'in flight'
*
* Don't use the nagle rule for urgent data (or
* for the final FIN -DaveM).
@@ -1081,13 +1166,12 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
skb_tailroom(skb) < 32) &&
((tcp_packets_in_flight(tp) < tp->snd_cwnd) ||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
- !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
- tp->retransmits == 0);
+ !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
}
static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp)
{
- if (!tp->packets_out && !timer_pending(&tp->probe_timer))
+ if (!tp->packets_out && !tp->pending)
tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
}
@@ -1111,6 +1195,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk,
tcp_write_xmit(sk))
tcp_check_probe_timer(sk, tp);
}
+ tcp_cwnd_validate(sk, tp);
}
static __inline__ void tcp_push_pending_frames(struct sock *sk,
@@ -1119,6 +1204,24 @@ static __inline__ void tcp_push_pending_frames(struct sock *sk,
__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk));
}
+static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
+{
+ struct sk_buff *skb = tp->send_head;
+
+ return (skb &&
+ tcp_snd_test(tp, skb, tcp_current_mss(sk), tcp_skb_is_last(sk, skb)));
+}
+
+static __inline__ void tcp_init_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+{
+ tp->snd_wl1 = seq;
+}
+
+static __inline__ void tcp_update_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+{
+ tp->snd_wl1 = seq;
+}
+
extern void tcp_destroy_sock(struct sock *sk);
@@ -1143,7 +1246,6 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
__tcp_checksum_complete(skb);
}
-
/* Prequeue for VJ style copy to user, combined with checksumming. */
static __inline__ void tcp_prequeue_init(struct tcp_opt *tp)
@@ -1167,12 +1269,15 @@ static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
if (tp->ucopy.task) {
if ((tp->ucopy.memory += skb->truesize) <= (sk->rcvbuf<<1)) {
__skb_queue_tail(&tp->ucopy.prequeue, skb);
- if (skb_queue_len(&tp->ucopy.prequeue) == 1)
+ if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
wake_up_interruptible(sk->sleep);
+ if (!tcp_ack_scheduled(tp))
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
+ }
} else {
NET_INC_STATS_BH(TCPPrequeueDropped);
tp->ucopy.memory -= skb->truesize;
- kfree_skb(skb);
+ __kfree_skb(skb);
}
return 1;
}
@@ -1231,6 +1336,13 @@ static __inline__ void tcp_done(struct sock *sk)
tcp_destroy_sock(sk);
}
+static __inline__ void tcp_sack_reset(struct tcp_opt *tp)
+{
+ tp->dsack = 0;
+ tp->eff_sacks = 0;
+ tp->num_sacks = 0;
+}
+
static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp)
{
if (tp->tstamp_ok) {
@@ -1241,17 +1353,22 @@ static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *
*ptr++ = htonl(tstamp);
*ptr++ = htonl(tp->ts_recent);
}
- if(tp->sack_ok && tp->num_sacks) {
+ if (tp->eff_sacks) {
+ struct tcp_sack_block *sp = tp->dsack ? tp->duplicate_sack : tp->selective_acks;
int this_sack;
*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)));
- for(this_sack = 0; this_sack < tp->num_sacks; this_sack++) {
- *ptr++ = htonl(tp->selective_acks[this_sack].start_seq);
- *ptr++ = htonl(tp->selective_acks[this_sack].end_seq);
+ (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)));
+ for(this_sack = 0; this_sack < tp->eff_sacks; this_sack++) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+ if (tp->dsack) {
+ tp->dsack = 0;
+ tp->eff_sacks--;
}
}
}
@@ -1330,42 +1447,44 @@ extern __inline__ void tcp_select_initial_window(int space, __u32 mss,
space >>= 1;
(*rcv_wscale)++;
}
+ if (*rcv_wscale && sysctl_tcp_app_win && space>=mss &&
+ space - max((space>>sysctl_tcp_app_win), mss>>*rcv_wscale) < 65536/2)
+ (*rcv_wscale)--;
+ }
+
+	/* Set the initial window to a value large enough for senders
+	 * following RFC 2414; senders not following this RFC will
+	 * be satisfied with 2.
+ */
+ if (mss > (1<<*rcv_wscale)) {
+ int init_cwnd = 4;
+ if (mss > 1460*3)
+ init_cwnd = 2;
+ else if (mss > 1460)
+ init_cwnd = 3;
+ if (*rcv_wnd > init_cwnd*mss)
+ *rcv_wnd = init_cwnd*mss;
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min(65535<<(*rcv_wscale),*window_clamp);
}
+static inline int tcp_win_from_space(int space)
+{
+ return sysctl_tcp_adv_win_scale<=0 ?
+ (space>>(-sysctl_tcp_adv_win_scale)) :
+ space - (space>>sysctl_tcp_adv_win_scale);
+}
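tcp_win_from_space() replaces the old fixed divisor: a positive sysctl_tcp_adv_win_scale advertises space - space/2^scale (reserving the rest for header and driver overhead), while a non-positive value advertises space/2^-scale. A stand-alone example of the arithmetic; win_from_space and the 64 KB figure are illustrative, not from the patch:

#include <stdio.h>

/* Same expression as tcp_win_from_space(), with the sysctl as a parameter. */
static int win_from_space(int space, int adv_win_scale)
{
        return adv_win_scale <= 0 ? space >> (-adv_win_scale)
                                  : space - (space >> adv_win_scale);
}

int main(void)
{
        printf("%d\n", win_from_space(65536, 2));  /* 49152: advertise 3/4 */
        printf("%d\n", win_from_space(65536, -2)); /* 16384: advertise 1/4 */
        return 0;
}
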
+
/* Note: caller must be prepared to deal with negative returns */
extern __inline__ int tcp_space(struct sock *sk)
{
- return (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) /
- TCP_WINDOW_ADVERTISE_DIVISOR;
+ return tcp_win_from_space(sk->rcvbuf - atomic_read(&sk->rmem_alloc));
}
extern __inline__ int tcp_full_space( struct sock *sk)
{
- return sk->rcvbuf / TCP_WINDOW_ADVERTISE_DIVISOR;
-}
-
-extern __inline__ void tcp_init_buffer_space(struct sock *sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int rcvbuf = tp->advmss+MAX_TCP_HEADER+16+sizeof(struct sk_buff);
- int sndbuf = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff);
-
- if (sk->rcvbuf < 3*rcvbuf)
- sk->rcvbuf = min (3*rcvbuf, sysctl_rmem_max);
-
- /* Reserve slack space to reduce jitter of advertised window. */
- if (tp->window_clamp >= tcp_full_space(sk)) {
- int nwin = tcp_full_space(sk) - tp->mss_clamp;
-
- if (nwin >= MAX_TCP_WINDOW && nwin >= 2*tp->advmss)
- tp->window_clamp = nwin;
- }
-
- if (sk->sndbuf < 3*sndbuf)
- sk->sndbuf = min (3*sndbuf, sysctl_wmem_max);
+ return tcp_win_from_space(sk->rcvbuf);
}
extern __inline__ void tcp_acceptq_removed(struct sock *sk)
@@ -1473,61 +1592,85 @@ static __inline__ void tcp_openreq_init(struct open_request *req,
req->snd_wscale = tp->snd_wscale;
req->wscale_ok = tp->wscale_ok;
req->acked = 0;
+ req->ecn_ok = 0;
req->rmt_port = skb->h.th->source;
}
-extern const char timer_bug_msg[];
+#define TCP_MEM_QUANTUM ((int)PAGE_SIZE)
-static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
+static inline void tcp_free_skb(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct timer_list *timer;
-
- switch (what) {
- case TCP_TIME_RETRANS:
- timer = &tp->retransmit_timer;
- break;
- case TCP_TIME_DACK:
- tp->ack.blocked = 0;
- timer = &tp->delack_timer;
- break;
- case TCP_TIME_PROBE0:
- timer = &tp->probe_timer;
- break;
- default:
- printk(timer_bug_msg);
- return;
- };
+ sk->tp_pinfo.af_tcp.queue_shrunk = 1;
+ sk->wmem_queued -= skb->truesize;
+ sk->forward_alloc += skb->truesize;
+ __kfree_skb(skb);
+}
- if (timer_pending(timer) && del_timer(timer))
- __sock_put(sk);
+static inline void tcp_charge_skb(struct sock *sk, struct sk_buff *skb)
+{
+ sk->wmem_queued += skb->truesize;
+ sk->forward_alloc -= skb->truesize;
}
-/* This function does not return reliable answer. Use it only as advice.
- */
+extern void __tcp_mem_reclaim(struct sock *sk);
+extern int tcp_mem_schedule(struct sock *sk, int size, int kind);
-static inline int tcp_timer_is_set(struct sock *sk, int what)
+static inline void tcp_mem_reclaim(struct sock *sk)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- int ret;
+ if (sk->forward_alloc >= TCP_MEM_QUANTUM)
+ __tcp_mem_reclaim(sk);
+}
- switch (what) {
- case TCP_TIME_RETRANS:
- ret = timer_pending(&tp->retransmit_timer);
- break;
- case TCP_TIME_DACK:
- ret = timer_pending(&tp->delack_timer);
- break;
- case TCP_TIME_PROBE0:
- ret = timer_pending(&tp->probe_timer);
- break;
- default:
- ret = 0;
- printk(timer_bug_msg);
- };
- return ret;
+static inline void tcp_enter_memory_pressure(void)
+{
+ if (!tcp_memory_pressure) {
+ NET_INC_STATS(TCPMemoryPressures);
+ tcp_memory_pressure = 1;
+ }
}
+static inline void tcp_moderate_sndbuf(struct sock *sk)
+{
+ if (!(sk->userlocks&SOCK_SNDBUF_LOCK)) {
+ sk->sndbuf = min(sk->sndbuf, sk->wmem_queued/2);
+ sk->sndbuf = max(sk->sndbuf, SOCK_MIN_SNDBUF);
+ }
+}
+
+static inline struct sk_buff *tcp_alloc_skb(struct sock *sk, int size, int gfp)
+{
+ struct sk_buff *skb = alloc_skb(size, gfp);
+
+ if (skb) {
+ if (sk->forward_alloc >= (int)skb->truesize ||
+ tcp_mem_schedule(sk, skb->truesize, 0))
+ return skb;
+ __kfree_skb(skb);
+ } else {
+ tcp_enter_memory_pressure();
+ tcp_moderate_sndbuf(sk);
+ }
+ return NULL;
+}
+
+static inline void tcp_writequeue_purge(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sk->write_queue)) != NULL)
+ tcp_free_skb(sk, skb);
+ tcp_mem_reclaim(sk);
+}
+
+extern void tcp_rfree(struct sk_buff *skb);
+
+static inline void tcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+ skb->sk = sk;
+ skb->destructor = tcp_rfree;
+ atomic_add(skb->truesize, &sk->rmem_alloc);
+ sk->forward_alloc -= skb->truesize;
+}
extern void tcp_listen_wlock(void);
@@ -1570,28 +1713,30 @@ static inline int tcp_fin_time(struct tcp_opt *tp)
return fin_timeout;
}
-#if 0 /* TCP_DEBUG */
-#define TCP_CHECK_TIMER(sk) \
-do { struct tcp_opt *__tp = &sk->tp_pinfo.af_tcp; \
- if (sk->state != TCP_CLOSE) { \
- if (__tp->packets_out) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS) && !timer_is_running(&__tp->retransmit_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p RETRANS" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } else if (__tp->send_head) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_PROBE0) && !timer_is_running(&__tp->probe_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p PROBE0" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } \
- if (__tp->ack.pending) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_DACK) && !timer_is_running(&__tp->delack_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p DACK" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } \
- if (__tp->packets_out > skb_queue_len(&sk->write_queue) || \
- (__tp->send_head && skb_queue_len(&sk->write_queue) == 0)) { \
- printk(KERN_DEBUG "sk=%p QUEUE" __FUNCTION__ "(%d) %d %d %d %p\n", sk, __LINE__, sk->state, __tp->packets_out, skb_queue_len(&sk->write_queue), __tp->send_head); \
- } \
- } } while (0)
-#else
+static inline int tcp_paws_check(struct tcp_opt *tp, int rst)
+{
+ if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0)
+ return 0;
+ if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS)
+ return 0;
+
+	/* RST segments are not recommended to carry a timestamp,
+	   and, if they do, it is recommended to ignore PAWS because
+	   "their cleanup function should take precedence over timestamps."
+	   Certainly, this is a mistake. It is necessary to understand the
+	   reasons for this constraint in order to relax it: if a peer
+	   reboots, its clock may go out of sync and half-open connections
+	   will not be reset. Actually, the problem would not exist if all
+	   implementations followed the draft about maintaining clocks
+	   across reboots. Linux-2.2 DOES NOT!
+
+	   However, we can relax the time bounds for RST segments to MSL.
+	 */
+ if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL)
+ return 0;
+ return 1;
+}
+
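tcp_paws_check() returns 1 when the segment's timestamp is older than ts_recent, the stored value is less than 24 days old, and (for RSTs) less than MSL old, i.e. when the segment should be treated as a PAWS reject. An illustrative fragment of how a receive path might consult it; a sketch against kernel types, assuming the usual saw_tstamp flag set by tcp_parse_options(), not a self-contained unit:

        if (tp->saw_tstamp && tcp_paws_check(tp, th->rst)) {
                /* Timestamp regressed and is too recent to be a wrap-around:
                 * treat this as a PAWS reject and drop the segment. */
                goto discard;
        }
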
#define TCP_CHECK_TIMER(sk) do { } while (0);
-#endif
#endif /* _TCP_H */