Diffstat (limited to 'include/net/tcp.h')
-rw-r--r-- | include/net/tcp.h | 607
1 file changed, 376 insertions, 231 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7df845895..d3a63962c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -19,9 +19,13 @@
 #define _TCP_H
 
 #define TCP_DEBUG 1
+#define FASTRETRANS_DEBUG 1
+
+/* Be paranoid about data immediately beyond right edge of window. */
 #undef TCP_FORMAL_WINDOW
-#define TCP_MORE_COARSE_ACKS
-#undef TCP_LESS_COARSE_ACKS
+
+/* Cancel timers, when they are not required. */
+#undef TCP_CLEAR_TIMERS
 
 #include <linux/config.h>
 #include <linux/tcp.h>
@@ -173,7 +177,7 @@ extern __inline__ void tcp_tw_put(struct tcp_tw_bucket *tw)
 }
 
 extern atomic_t tcp_orphan_count;
-extern int tcp_tw_count;
+extern int tcp_tw_count;
 extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
 extern void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
@@ -242,12 +246,14 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 /* Minimal RCV_MSS. */
 #define TCP_MIN_RCVMSS	536
 
-/*
- * How much of the receive buffer do we advertize
- * (the rest is reserved for headers and driver packet overhead)
- * Use a power of 2.
- */
-#define TCP_WINDOW_ADVERTISE_DIVISOR 2
+/* After receiving this amount of duplicate ACKs fast retransmit starts. */
+#define TCP_FASTRETRANS_THRESH 3
+
+/* Maximal reordering. */
+#define TCP_MAX_REORDERING	127
+
+/* Maximal number of ACKs sent quickly to accelerate slow-start. */
+#define TCP_MAX_QUICKACKS	16
 
 /* urg_data states */
 #define TCP_URG_VALID	0x0100
@@ -292,7 +298,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 #define TCP_DELACK_MAX	(HZ/5)	/* maximal time to delay before sending an ACK */
 #define TCP_DELACK_MIN	(2)	/* minimal time to delay before sending an ACK,
 				 * 2 scheduler ticks, not depending on HZ. */
-#define TCP_ATO_MAX	(HZ/2)	/* Clamp ATO estimator at his value. */
 #define TCP_ATO_MIN	2
 #define TCP_RTO_MAX	(120*HZ)
 #define TCP_RTO_MIN	(HZ/5)
@@ -414,6 +419,19 @@ extern int sysctl_tcp_tw_recycle;
 extern int sysctl_tcp_abort_on_overflow;
 extern int sysctl_tcp_max_orphans;
 extern int sysctl_tcp_max_tw_buckets;
+extern int sysctl_tcp_fack;
+extern int sysctl_tcp_reordering;
+extern int sysctl_tcp_ecn;
+extern int sysctl_tcp_dsack;
+extern int sysctl_tcp_mem[3];
+extern int sysctl_tcp_wmem[3];
+extern int sysctl_tcp_rmem[3];
+extern int sysctl_tcp_app_win;
+extern int sysctl_tcp_adv_win_scale;
+
+extern atomic_t tcp_memory_allocated;
+extern atomic_t tcp_sockets_allocated;
+extern int tcp_memory_pressure;
 
 struct open_request;
 
@@ -606,6 +624,23 @@ extern int tcp_rcv_established(struct sock *sk,
 					       struct tcphdr *th,
 					       unsigned len);
 
+enum tcp_ack_state_t
+{
+	TCP_ACK_SCHED = 1,
+	TCP_ACK_TIMER = 2,
+	TCP_ACK_PUSHED= 4
+};
+
+static inline void tcp_schedule_ack(struct tcp_opt *tp)
+{
+	tp->ack.pending |= TCP_ACK_SCHED;
+}
+
+static inline int tcp_ack_scheduled(struct tcp_opt *tp)
+{
+	return tp->ack.pending&TCP_ACK_SCHED;
+}
+
 static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
 {
 	if (tp->ack.quick && --tp->ack.quick == 0) {
@@ -614,11 +649,27 @@ static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
 	}
 }
 
+extern void tcp_enter_quickack_mode(struct tcp_opt *tp);
+
 static __inline__ void tcp_delack_init(struct tcp_opt *tp)
 {
 	memset(&tp->ack, 0, sizeof(tp->ack));
 }
 
+enum tcp_ca_state
+{
+	TCP_CA_Open = 0,
+#define TCPF_CA_Open	(1<<TCP_CA_Open)
+	TCP_CA_Disorder = 1,
+#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
+	TCP_CA_CWR = 2,
+#define TCPF_CA_CWR	(1<<TCP_CA_CWR)
+	TCP_CA_Recovery = 3,
+#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
+	TCP_CA_Loss = 4
+#define TCPF_CA_Loss	(1<<TCP_CA_Loss)
+};
+
 enum tcp_tw_status
 {
@@ -640,6 +691,9 @@ extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
 extern int tcp_child_process(struct sock *parent,
 			     struct sock *child,
 			     struct sk_buff *skb);
+extern void tcp_enter_loss(struct sock *sk, int how);
+extern void tcp_clear_retrans(struct tcp_opt *tp);
+extern void tcp_update_metrics(struct sock *sk);
 
 extern void tcp_close(struct sock *sk,
 		      long timeout);
@@ -661,8 +715,8 @@ extern int tcp_recvmsg(struct sock *sk,
 
 extern int tcp_listen_start(struct sock *sk);
 
-extern void tcp_parse_options(struct sock *sk, struct tcphdr *th,
-			      struct tcp_opt *tp, int no_fancy);
+extern void tcp_parse_options(struct sk_buff *skb,
+			      struct tcp_opt *tp);
 
 /*
  *	TCP v4 functions exported for the inet6 API
@@ -720,7 +774,6 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
 
 extern int  tcp_write_xmit(struct sock *);
 extern int  tcp_retransmit_skb(struct sock *, struct sk_buff *);
-extern void tcp_fack_retransmit(struct sock *);
 extern void tcp_xmit_retransmit_queue(struct sock *);
 extern void tcp_simple_retransmit(struct sock *);
 
@@ -736,7 +789,6 @@ extern void tcp_send_ack(struct sock *sk);
 extern void tcp_send_delayed_ack(struct sock *sk);
 
 /* tcp_timer.c */
-extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long);
 extern void tcp_init_xmit_timers(struct sock *);
 extern void tcp_clear_xmit_timers(struct sock *);
 
@@ -744,6 +796,76 @@ extern void tcp_delete_keepalive_timer (struct sock *);
 extern void tcp_reset_keepalive_timer (struct sock *, unsigned long);
 extern int tcp_sync_mss(struct sock *sk, u32 pmtu);
 
+extern const char timer_bug_msg[];
+
+
+static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
+{
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+	switch (what) {
+	case TCP_TIME_RETRANS:
+	case TCP_TIME_PROBE0:
+		tp->pending = 0;
+
+#ifdef TCP_CLEAR_TIMERS
+		if (timer_pending(&tp->retransmit_timer) &&
+		    del_timer(&tp->retransmit_timer))
+			__sock_put(sk);
+#endif
+		break;
+	case TCP_TIME_DACK:
+		tp->ack.blocked = 0;
+		tp->ack.pending = 0;
+
+#ifdef TCP_CLEAR_TIMERS
+		if (timer_pending(&tp->delack_timer) &&
+		    del_timer(&tp->delack_timer))
+			__sock_put(sk);
+#endif
+		break;
+	default:
+		printk(timer_bug_msg);
+		return;
+	};
+
+}
+
+/*
+ *	Reset the retransmission timer
+ */
+static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
+{
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+	if (when > TCP_RTO_MAX) {
+#ifdef TCP_DEBUG
+		printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr());
+#endif
+		when = TCP_RTO_MAX;
+	}
+
+	switch (what) {
+	case TCP_TIME_RETRANS:
+	case TCP_TIME_PROBE0:
+		tp->pending = what;
+		tp->timeout = jiffies+when;
+		if (!mod_timer(&tp->retransmit_timer, tp->timeout))
+			sock_hold(sk);
+		break;
+
+	case TCP_TIME_DACK:
+		tp->ack.pending |= TCP_ACK_TIMER;
+		tp->ack.timeout = jiffies+when;
+		if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
+			sock_hold(sk);
+		break;
+
+	default:
+		printk(KERN_DEBUG "bug: unknown timer value\n");
+	};
+}
+
 /* Compute the current effective MSS, taking SACKs and IP options,
  * and even PMTU discovery events into account.
  */
@@ -757,9 +879,9 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
 	if (dst && dst->pmtu != tp->pmtu_cookie)
 		mss_now = tcp_sync_mss(sk, dst->pmtu);
 
-	if(tp->sack_ok && tp->num_sacks)
+	if (tp->eff_sacks)
 		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
-			    (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
+			    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
 	return mss_now;
 }
 
@@ -774,15 +896,8 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
 extern __inline__ void tcp_initialize_rcv_mss(struct sock *sk)
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-	struct dst_entry *dst = __sk_dst_get(sk);
-	int mss;
 
-	if (dst)
-		mss = dst->advmss;
-	else
-		mss = tp->mss_cache;
-
-	tp->ack.rcv_mss = max(min(mss, TCP_MIN_RCVMSS), TCP_MIN_MSS);
+	tp->ack.rcv_mss = max(min(tp->advmss, TCP_MIN_RCVMSS), TCP_MIN_MSS);
 }
 
 static __inline__ void __tcp_fast_path_on(struct tcp_opt *tp, u32 snd_wnd)
@@ -797,9 +912,6 @@ static __inline__ void tcp_fast_path_on(struct tcp_opt *tp)
 	__tcp_fast_path_on(tp, tp->snd_wnd>>tp->snd_wscale);
 }
 
-
-
-
 /* Compute the actual receive window we are currently advertising.
  * Rcv_nxt can be after the window if our peer push more data
  * than the offered window.
@@ -819,52 +931,6 @@ static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
  */
 extern u32	__tcp_select_window(struct sock *sk);
 
-/* Chose a new window to advertise, update state in tcp_opt for the
- * socket, and return result with RFC1323 scaling applied.  The return
- * value can be stuffed directly into th->window for an outgoing
- * frame.
- */
-extern __inline__ u16 tcp_select_window(struct sock *sk)
-{
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	u32 cur_win = tcp_receive_window(tp);
-	u32 new_win = __tcp_select_window(sk);
-
-	/* Never shrink the offered window */
-	if(new_win < cur_win) {
-		/* Danger Will Robinson!
-		 * Don't update rcv_wup/rcv_wnd here or else
-		 * we will not be able to advertise a zero
-		 * window in time.  --DaveM
-		 *
-		 * Relax Will Robinson.
-		 */
-		new_win = cur_win;
-	}
-	tp->rcv_wnd = new_win;
-	tp->rcv_wup = tp->rcv_nxt;
-
-	/* RFC1323 scaling applied */
-	new_win >>= tp->rcv_wscale;
-
-#ifdef TCP_FORMAL_WINDOW
-	if (new_win == 0) {
-		/* If we advertise zero window, disable fast path. */
-		tp->pred_flags = 0;
-	} else if (cur_win == 0 && tp->pred_flags == 0 &&
-		   skb_queue_len(&tp->out_of_order_queue) == 0 &&
-		   !tp->urg_data) {
-		/* If we open zero window, enable fast path.
-		   Without this it will be open by the first data packet,
-		   it is too late to merge checksumming to copy.
-		 */
-		tcp_fast_path_on(tp);
-	}
-#endif
-
-	return new_win;
-}
-
 /* TCP timestamps are only 32-bits, this causes a slight
  * complication on 64-bit systems since we store a snapshot
  * of jiffies in the buffer control blocks below.  We decidely
@@ -907,6 +973,12 @@ struct tcp_skb_cb {
 	__u8		sacked;		/* State flags for SACK/FACK.	*/
 #define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
 #define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
+#define TCPCB_LOST		0x04	/* SKB is lost			*/
+#define TCPCB_TAGBITS		0x07	/* All tag bits			*/
+
+#define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
+#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
+
 	__u16		urg_ptr;	/* Valid w/URG flags is set.	*/
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
@@ -914,11 +986,28 @@ struct tcp_skb_cb {
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
+#define for_retrans_queue(skb, sk, tp) \
+		for (skb = (sk)->write_queue.next;			\
+		     (skb != (tp)->send_head) &&			\
+		     (skb != (struct sk_buff *)&(sk)->write_queue);	\
+		     skb=skb->next)
+
+
+#include <net/tcp_ecn.h>
+
+
 /*
  * Compute minimal free write space needed to queue new packets.
  */
-#define tcp_min_write_space(__sk) \
-	(atomic_read(&(__sk)->wmem_alloc) / 2)
+static inline int tcp_min_write_space(struct sock *sk)
+{
+	return sk->wmem_queued/2;
+}
+
+static inline int tcp_wspace(struct sock *sk)
+{
+	return sk->sndbuf - sk->wmem_queued;
+}
 
 
 /* This determines how many packets are "in the network" to the best
@@ -932,89 +1021,97 @@ struct tcp_skb_cb {
  *	Read this equation as:
  *
  *	"Packets sent once on transmission queue" MINUS
- *	"Packets acknowledged by FACK information" PLUS
+ *	"Packets left network, but not honestly ACKed yet" PLUS
  *	"Packets fast retransmitted"
  */
 static __inline__ int tcp_packets_in_flight(struct tcp_opt *tp)
 {
-	return tp->packets_out - tp->fackets_out + tp->retrans_out;
+	return tp->packets_out - tp->left_out + tp->retrans_out;
 }
 
 /* Recalculate snd_ssthresh, we want to set it to:
  *
  * one half the current congestion window, but no
  * less than two segments
- *
- * We must take into account the current send window
- * as well, however we keep track of that using different
- * units so a conversion is necessary.  -DaveM
- *
- * RED-PEN.
- *  RFC 2581: "an easy mistake to make is to simply use cwnd,
- *  rather than FlightSize"
- *  I see no references to FlightSize here. snd_wnd is not FlightSize,
- *  it is also apriory characteristics.
- *
- *  FlightSize = min((snd_nxt-snd_una)/mss, packets_out) ?
 */
 extern __inline__ __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
 {
-	u32 FlightSize = (tp->snd_nxt - tp->snd_una)/tp->mss_cache;
+	return max(tp->snd_cwnd>>1, 2);
+}
+
+/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
+ * The exception is rate halving phase, when cwnd is decreasing towards
+ * ssthresh.
+ */
+extern __inline__ __u32 tcp_current_ssthresh(struct tcp_opt *tp)
+{
+	if ((1<<tp->ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery))
+		return tp->snd_ssthresh;
+	else
+		return max(tp->snd_ssthresh, (tp->snd_cwnd>>1)+(tp->snd_cwnd>>2));
+}
+
+extern void tcp_cwnd_application_limited(struct sock *sk);
 
-	FlightSize = min(FlightSize, tcp_packets_in_flight(tp));
+/* Congestion window validation. (RFC2861) */
 
-	return max(min(FlightSize, tp->snd_cwnd) >> 1, 2);
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp)
+{
+	if (tp->packets_out >= tp->snd_cwnd) {
+		/* Network is feed fully. */
+		tp->snd_cwnd_used = 0;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+	} else {
+		/* Network starves. */
+		if (tp->packets_out > tp->snd_cwnd_used)
+			tp->snd_cwnd_used = tp->packets_out;
+
+		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+			tcp_cwnd_application_limited(sk);
+	}
 }
 
 /* Set slow start threshould and cwnd not falling to slow start */
-extern __inline__ void __tcp_enter_cong_avoid(struct tcp_opt *tp)
+extern __inline__ void __tcp_enter_cwr(struct tcp_opt *tp)
 {
+	tp->undo_marker = 0;
 	tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
-	if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
-		tp->snd_ssthresh = tp->snd_cwnd_clamp;
-	tp->snd_cwnd = tp->snd_ssthresh;
+	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
 	tp->snd_cwnd_cnt = 0;
 	tp->high_seq = tp->snd_nxt;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	TCP_ECN_queue_cwr(tp);
 }
 
-extern __inline__ void tcp_enter_cong_avoid(struct tcp_opt *tp)
+extern __inline__ void tcp_enter_cwr(struct tcp_opt *tp)
 {
-	if (!tp->high_seq || after(tp->snd_nxt, tp->high_seq))
-		__tcp_enter_cong_avoid(tp);
+	tp->prior_ssthresh = 0;
+	if (tp->ca_state < TCP_CA_CWR) {
+		__tcp_enter_cwr(tp);
+		tp->ca_state = TCP_CA_CWR;
+	}
 }
 
+extern __u32 tcp_init_cwnd(struct tcp_opt *tp);
 
-/* Increase initial CWND conservatively, i.e. only if estimated
-   RTT is low enough. It is not quite correct, we should use
-   POWER i.e. RTT*BANDWIDTH, but we still cannot estimate this.
-
-   Numbers are taken from RFC1414.
+/* Slow start with delack produces 3 packets of burst, so that
+ * it is safe "de facto".
  */
-static __inline__ __u32 tcp_init_cwnd(struct tcp_opt *tp)
+static __inline__ __u32 tcp_max_burst(struct tcp_opt *tp)
 {
-	__u32 cwnd;
-
-	if (!tp->srtt || tp->srtt > ((HZ/50)<<3) || tp->mss_cache > 1460)
-		cwnd = 2;
-	else if (tp->mss_cache > 1095)
-		cwnd = 3;
-	else
-		cwnd = 4;
-
-	return min(cwnd, tp->snd_cwnd_clamp);
+	return 3;
 }
 
-
 static __inline__ int tcp_minshall_check(struct tcp_opt *tp)
 {
 	return after(tp->snd_sml,tp->snd_una) &&
 		!after(tp->snd_sml, tp->snd_nxt);
 }
 
-static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, int len)
+static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, struct sk_buff *skb)
 {
-	if (len < mss)
-		tp->snd_sml = tp->snd_nxt;
+	if (skb->len < mss)
+		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
 }
 
 /* Return 0, if packet can be sent now without violation Nagle's rules:
@@ -1041,17 +1138,6 @@ static __inline__ int tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, u
 static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
 				   unsigned cur_mss, int tail)
 {
-	/*
-	 *	Reset CWND after idle period longer RTO to "restart window".
-	 *	It is "side" effect of the function, which is _not_ good
-	 *	from viewpoint of clarity. But we have to make it before
-	 *	checking congestion window below. Alternative is to prepend
-	 *	all the calls with this test.
-	 */
-	if (tp->packets_out==0 &&
-	    (s32)(tcp_time_stamp - tp->lsndtime) > tp->rto)
-		tp->snd_cwnd = min(tp->snd_cwnd, tcp_init_cwnd(tp));
-
 	/* RFC 1122 - section 4.2.3.4
 	 *
 	 *	We must queue if
@@ -1062,8 +1148,7 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
 	 *	   (part of SWS is done on packetization)
 	 *	   Minshall version sounds: there are no _small_
 	 *	   segments in flight. (tcp_nagle_check)
-	 * c)	We are retransmiting [Nagle]
-	 * d)	We have too many packets 'in flight'
+	 * c)	We have too many packets 'in flight'
 	 *
 	 *	Don't use the nagle rule for urgent data (or
 	 *	for the final FIN -DaveM).
@@ -1081,13 +1166,12 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
 		  skb_tailroom(skb) < 32) &&
 		 ((tcp_packets_in_flight(tp) < tp->snd_cwnd) ||
 		  (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
-		!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
-		tp->retransmits == 0);
+		!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
 }
 
 static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp)
 {
-	if (!tp->packets_out && !timer_pending(&tp->probe_timer))
+	if (!tp->packets_out && !tp->pending)
 		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
 }
 
@@ -1111,6 +1195,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk,
 		    tcp_write_xmit(sk))
 			tcp_check_probe_timer(sk, tp);
 	}
+	tcp_cwnd_validate(sk, tp);
 }
 
 static __inline__ void tcp_push_pending_frames(struct sock *sk,
@@ -1119,6 +1204,24 @@ static __inline__ void tcp_push_pending_frames(struct sock *sk,
 	__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk));
 }
 
+static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
+{
+	struct sk_buff *skb = tp->send_head;
+
+	return (skb &&
+		tcp_snd_test(tp, skb, tcp_current_mss(sk), tcp_skb_is_last(sk, skb)));
+}
+
+static __inline__ void tcp_init_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+{
+	tp->snd_wl1 = seq;
+}
+
+static __inline__ void tcp_update_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+{
+	tp->snd_wl1 = seq;
+}
+
 extern void tcp_destroy_sock(struct sock *sk);
 
@@ -1143,7 +1246,6 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
 		__tcp_checksum_complete(skb);
 }
 
-
 /* Prequeue for VJ style copy to user, combined with checksumming. */
 
 static __inline__ void tcp_prequeue_init(struct tcp_opt *tp)
@@ -1167,12 +1269,15 @@ static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 	if (tp->ucopy.task) {
 		if ((tp->ucopy.memory += skb->truesize) <= (sk->rcvbuf<<1)) {
 			__skb_queue_tail(&tp->ucopy.prequeue, skb);
-			if (skb_queue_len(&tp->ucopy.prequeue) == 1)
+			if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 				wake_up_interruptible(sk->sleep);
+				if (!tcp_ack_scheduled(tp))
+					tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
+			}
 		} else {
 			NET_INC_STATS_BH(TCPPrequeueDropped);
 			tp->ucopy.memory -= skb->truesize;
-			kfree_skb(skb);
+			__kfree_skb(skb);
 		}
 		return 1;
 	}
@@ -1231,6 +1336,13 @@ static __inline__ void tcp_done(struct sock *sk)
 		tcp_destroy_sock(sk);
 }
 
+static __inline__ void tcp_sack_reset(struct tcp_opt *tp)
+{
+	tp->dsack = 0;
+	tp->eff_sacks = 0;
+	tp->num_sacks = 0;
+}
+
 static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp)
 {
 	if (tp->tstamp_ok) {
@@ -1241,17 +1353,22 @@ static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *
 		*ptr++ = htonl(tstamp);
 		*ptr++ = htonl(tp->ts_recent);
 	}
-	if(tp->sack_ok && tp->num_sacks) {
+	if (tp->eff_sacks) {
+		struct tcp_sack_block *sp = tp->dsack ? tp->duplicate_sack : tp->selective_acks;
 		int this_sack;
 
 		*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
 					  (TCPOPT_NOP << 16) |
 					  (TCPOPT_SACK << 8) |
 					  (TCPOLEN_SACK_BASE +
-					   (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)));
-		for(this_sack = 0; this_sack < tp->num_sacks; this_sack++) {
-			*ptr++ = htonl(tp->selective_acks[this_sack].start_seq);
-			*ptr++ = htonl(tp->selective_acks[this_sack].end_seq);
+					   (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)));
+		for(this_sack = 0; this_sack < tp->eff_sacks; this_sack++) {
+			*ptr++ = htonl(sp[this_sack].start_seq);
+			*ptr++ = htonl(sp[this_sack].end_seq);
+		}
+		if (tp->dsack) {
+			tp->dsack = 0;
+			tp->eff_sacks--;
 		}
 	}
 }
@@ -1330,42 +1447,44 @@ extern __inline__ void tcp_select_initial_window(int space, __u32 mss,
 			space >>= 1;
 			(*rcv_wscale)++;
 		}
+		if (*rcv_wscale && sysctl_tcp_app_win && space>=mss &&
+		    space - max((space>>sysctl_tcp_app_win), mss>>*rcv_wscale) < 65536/2)
+			(*rcv_wscale)--;
+	}
+
+	/* Set initial window to value enough for senders,
+	 * following RFC1414. Senders, not following this RFC,
+	 * will be satisfied with 2.
+	 */
+	if (mss > (1<<*rcv_wscale)) {
+		int init_cwnd = 4;
+		if (mss > 1460*3)
+			init_cwnd = 2;
+		else if (mss > 1460)
+			init_cwnd = 3;
+		if (*rcv_wnd > init_cwnd*mss)
+			*rcv_wnd = init_cwnd*mss;
 	}
 
 	/* Set the clamp no higher than max representable value */
 	(*window_clamp) = min(65535<<(*rcv_wscale),*window_clamp);
 }
 
+static inline int tcp_win_from_space(int space)
+{
+	return sysctl_tcp_adv_win_scale<=0 ?
+		(space>>(-sysctl_tcp_adv_win_scale)) :
+		space - (space>>sysctl_tcp_adv_win_scale);
+}
+
 /* Note: caller must be prepared to deal with negative returns */
 extern __inline__ int tcp_space(struct sock *sk)
 {
-	return (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) /
-		TCP_WINDOW_ADVERTISE_DIVISOR;
+	return tcp_win_from_space(sk->rcvbuf - atomic_read(&sk->rmem_alloc));
 }
 
 extern __inline__ int tcp_full_space( struct sock *sk)
 {
-	return sk->rcvbuf / TCP_WINDOW_ADVERTISE_DIVISOR;
-}
-
-extern __inline__ void tcp_init_buffer_space(struct sock *sk)
-{
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	int rcvbuf = tp->advmss+MAX_TCP_HEADER+16+sizeof(struct sk_buff);
-	int sndbuf = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff);
-
-	if (sk->rcvbuf < 3*rcvbuf)
-		sk->rcvbuf = min (3*rcvbuf, sysctl_rmem_max);
-
-	/* Reserve slack space to reduce jitter of advertised window. */
-	if (tp->window_clamp >= tcp_full_space(sk)) {
-		int nwin = tcp_full_space(sk) - tp->mss_clamp;
-
-		if (nwin >= MAX_TCP_WINDOW && nwin >= 2*tp->advmss)
-			tp->window_clamp = nwin;
-	}
-
-	if (sk->sndbuf < 3*sndbuf)
-		sk->sndbuf = min (3*sndbuf, sysctl_wmem_max);
+	return tcp_win_from_space(sk->rcvbuf);
 }
 
 extern __inline__ void tcp_acceptq_removed(struct sock *sk)
@@ -1473,61 +1592,85 @@ static __inline__ void tcp_openreq_init(struct open_request *req,
 	req->snd_wscale = tp->snd_wscale;
 	req->wscale_ok = tp->wscale_ok;
 	req->acked = 0;
+	req->ecn_ok = 0;
 	req->rmt_port = skb->h.th->source;
 }
 
-extern const char timer_bug_msg[];
+#define TCP_MEM_QUANTUM	((int)PAGE_SIZE)
 
-static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
+static inline void tcp_free_skb(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-	struct timer_list *timer;
-
-	switch (what) {
-	case TCP_TIME_RETRANS:
-		timer = &tp->retransmit_timer;
-		break;
-	case TCP_TIME_DACK:
-		tp->ack.blocked = 0;
-		timer = &tp->delack_timer;
-		break;
-	case TCP_TIME_PROBE0:
-		timer = &tp->probe_timer;
-		break;
-	default:
-		printk(timer_bug_msg);
-		return;
-	};
+	sk->tp_pinfo.af_tcp.queue_shrunk = 1;
+	sk->wmem_queued -= skb->truesize;
+	sk->forward_alloc += skb->truesize;
+	__kfree_skb(skb);
+}
 
-	if (timer_pending(timer) && del_timer(timer))
-		__sock_put(sk);
+static inline void tcp_charge_skb(struct sock *sk, struct sk_buff *skb)
+{
+	sk->wmem_queued += skb->truesize;
+	sk->forward_alloc -= skb->truesize;
 }
 
-/* This function does not return reliable answer. Use it only as advice.
- */
+extern void __tcp_mem_reclaim(struct sock *sk);
+extern int tcp_mem_schedule(struct sock *sk, int size, int kind);
 
-static inline int tcp_timer_is_set(struct sock *sk, int what)
+static inline void tcp_mem_reclaim(struct sock *sk)
 {
-	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-	int ret;
+	if (sk->forward_alloc >= TCP_MEM_QUANTUM)
+		__tcp_mem_reclaim(sk);
+}
 
-	switch (what) {
-	case TCP_TIME_RETRANS:
-		ret = timer_pending(&tp->retransmit_timer);
-		break;
-	case TCP_TIME_DACK:
-		ret = timer_pending(&tp->delack_timer);
-		break;
-	case TCP_TIME_PROBE0:
-		ret = timer_pending(&tp->probe_timer);
-		break;
-	default:
-		ret = 0;
-		printk(timer_bug_msg);
-	};
-	return ret;
+static inline void tcp_enter_memory_pressure(void)
+{
+	if (!tcp_memory_pressure) {
+		NET_INC_STATS(TCPMemoryPressures);
+		tcp_memory_pressure = 1;
+	}
 }
 
+static inline void tcp_moderate_sndbuf(struct sock *sk)
+{
+	if (!(sk->userlocks&SOCK_SNDBUF_LOCK)) {
+		sk->sndbuf = min(sk->sndbuf, sk->wmem_queued/2);
+		sk->sndbuf = max(sk->sndbuf, SOCK_MIN_SNDBUF);
+	}
+}
+
+static inline struct sk_buff *tcp_alloc_skb(struct sock *sk, int size, int gfp)
+{
+	struct sk_buff *skb = alloc_skb(size, gfp);
+
+	if (skb) {
+		if (sk->forward_alloc >= (int)skb->truesize ||
+		    tcp_mem_schedule(sk, skb->truesize, 0))
+			return skb;
+		__kfree_skb(skb);
+	} else {
+		tcp_enter_memory_pressure();
+		tcp_moderate_sndbuf(sk);
+	}
+	return NULL;
+}
+
+static inline void tcp_writequeue_purge(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&sk->write_queue)) != NULL)
+		tcp_free_skb(sk, skb);
+	tcp_mem_reclaim(sk);
+}
+
+extern void tcp_rfree(struct sk_buff *skb);
+
+static inline void tcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+	skb->sk = sk;
+	skb->destructor = tcp_rfree;
+	atomic_add(skb->truesize, &sk->rmem_alloc);
+	sk->forward_alloc -= skb->truesize;
+}
 
 extern void tcp_listen_wlock(void);
 
@@ -1570,28 +1713,30 @@ static inline int tcp_fin_time(struct tcp_opt *tp)
 	return fin_timeout;
 }
 
-#if 0 /* TCP_DEBUG */
-#define TCP_CHECK_TIMER(sk) \
-do {	struct tcp_opt *__tp = &sk->tp_pinfo.af_tcp; \
-	if (sk->state != TCP_CLOSE) { \
-		if (__tp->packets_out) { \
-			if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS) && !timer_is_running(&__tp->retransmit_timer) && net_ratelimit()) \
-				printk(KERN_DEBUG "sk=%p RETRANS" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
-		} else if (__tp->send_head) { \
-			if (!tcp_timer_is_set(sk, TCP_TIME_PROBE0) && !timer_is_running(&__tp->probe_timer) && net_ratelimit()) \
-				printk(KERN_DEBUG "sk=%p PROBE0" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
-		} \
-		if (__tp->ack.pending) { \
-			if (!tcp_timer_is_set(sk, TCP_TIME_DACK) && !timer_is_running(&__tp->delack_timer) && net_ratelimit()) \
-				printk(KERN_DEBUG "sk=%p DACK" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
-		} \
-		if (__tp->packets_out > skb_queue_len(&sk->write_queue) || \
-		    (__tp->send_head && skb_queue_len(&sk->write_queue) == 0)) { \
-			printk(KERN_DEBUG "sk=%p QUEUE" __FUNCTION__ "(%d) %d %d %d %p\n", sk, __LINE__, sk->state, __tp->packets_out, skb_queue_len(&sk->write_queue), __tp->send_head); \
-		} \
-	} } while (0)
-#else
+static inline int tcp_paws_check(struct tcp_opt *tp, int rst)
+{
+	if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0)
+		return 0;
+	if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS)
+		return 0;
+
+	/* RST segments are not recommended to carry timestamp,
+	   and, if they do, it is recommended to ignore PAWS because
+	   "their cleanup function should take precedence over timestamps."
+	   Certainly, it is mistake. It is necessary to understand the reasons
+	   of this constraint to relax it: if peer reboots, clock may go
+	   out-of-sync and half-open connections will not be reset.
+	   Actually, the problem would be not existing if all
+	   the implementations followed draft about maintaining clock
+	   via reboots. Linux-2.2 DOES NOT!
+
+	   However, we can relax time bounds for RST segments to MSL.
+	 */
+	if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL)
+		return 0;
+	return 1;
+}
+
 #define TCP_CHECK_TIMER(sk) do { } while (0);
-#endif
 
 #endif /* _TCP_H */
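
Notes on a few of the new helpers above. First, the rewritten tcp_packets_in_flight() reads as a conservation equation over the retransmit queue. The following is a small standalone C sketch of that arithmetic; the struct here is a simplified stand-in (only the three counters named in the hunk), not the kernel's tcp_opt.

#include <stdio.h>

/* Simplified stand-in for the tcp_opt counters used by
 * tcp_packets_in_flight() in the diff above. */
struct flight_state {
	unsigned int packets_out;	/* segments sent at least once */
	unsigned int left_out;		/* segments that left the network but are
					 * not yet cumulatively ACKed (per the
					 * comment: "left network, but not
					 * honestly ACKed yet") */
	unsigned int retrans_out;	/* segments retransmitted */
};

/* "Packets sent once" MINUS "packets that left the network" PLUS
 * "packets fast retransmitted". */
static unsigned int tcp_packets_in_flight(const struct flight_state *tp)
{
	return tp->packets_out - tp->left_out + tp->retrans_out;
}

int main(void)
{
	/* Example: 10 sent, 3 already out of the network, 1 retransmitted
	 * gives 8 segments still considered in flight. */
	struct flight_state tp = { .packets_out = 10, .left_out = 3, .retrans_out = 1 };

	printf("in flight: %u\n", tcp_packets_in_flight(&tp));
	return 0;
}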
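Second, tcp_win_from_space() replaces the fixed TCP_WINDOW_ADVERTISE_DIVISOR with a tunable fraction of the receive buffer. A userspace sketch of the same arithmetic, for experimenting with values outside the kernel; the scale of 2 is only an assumed example of the new tcp_adv_win_scale sysctl, not a recommendation.

#include <stdio.h>

/* Assumed example value; in the kernel this comes from the
 * tcp_adv_win_scale sysctl introduced by the patch. */
static int sysctl_tcp_adv_win_scale = 2;

/* Mirrors tcp_win_from_space() in the hunk above: a non-positive scale
 * means window = space >> -scale, a positive scale means
 * window = space - space/2^scale. */
static int tcp_win_from_space(int space)
{
	return sysctl_tcp_adv_win_scale <= 0 ?
		space >> (-sysctl_tcp_adv_win_scale) :
		space - (space >> sysctl_tcp_adv_win_scale);
}

int main(void)
{
	int rcvbuf = 65536;	/* hypothetical sk->rcvbuf */

	/* With scale 2, 3/4 of the buffer is advertised and 1/4 is held
	 * back for skb overhead, instead of the old fixed divide-by-2. */
	printf("advertised window: %d of %d bytes\n",
	       tcp_win_from_space(rcvbuf), rcvbuf);
	return 0;
}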
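Third, the new clamp in tcp_select_initial_window() limits the initially advertised window to an RFC 2414 style initial segment count (the patch comment cites it as RFC1414). A minimal sketch of just that sizing rule, with the MSS thresholds taken from the hunk; the wscale precondition from the original code is omitted here.

#include <stdio.h>

/* Initial receive window cap from the tcp_select_initial_window() hunk:
 * larger MSS values get a smaller initial segment count. */
static unsigned int initial_rcv_wnd(unsigned int mss, unsigned int rcv_wnd)
{
	unsigned int init_cwnd = 4;

	if (mss > 1460 * 3)
		init_cwnd = 2;
	else if (mss > 1460)
		init_cwnd = 3;
	if (rcv_wnd > init_cwnd * mss)
		rcv_wnd = init_cwnd * mss;
	return rcv_wnd;
}

int main(void)
{
	/* Example: a 1460-byte MSS allows 4 segments up front (5840 bytes). */
	printf("initial rcv_wnd: %u\n", initial_rcv_wnd(1460, 65535));
	return 0;
}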
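Finally, tcp_paws_check() encodes the RFC 1323 PAWS test with a relaxed bound for RST segments. Below is a compilable userspace sketch of the same decision logic; the struct is a stand-in, and the numeric values of TCP_PAWS_24DAYS and TCP_PAWS_MSL are assumptions for the sketch, since their definitions are not part of the hunks shown.

#include <stdio.h>
#include <time.h>

/* Assumed values for the sketch: 24 days in seconds, and a 60 s MSL
 * relaxation applied only to RST segments. */
#define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
#define TCP_PAWS_MSL	60

struct paws_state {
	unsigned int rcv_tsval;		/* timestamp carried by the arriving segment */
	unsigned int ts_recent;		/* last timestamp accepted from this peer */
	long ts_recent_stamp;		/* when ts_recent was recorded, in seconds */
};

/* Returns 1 if PAWS says to drop the segment, 0 if it passes. */
static int tcp_paws_check(const struct paws_state *tp, int rst, long now)
{
	if ((int)(tp->rcv_tsval - tp->ts_recent) >= 0)
		return 0;		/* timestamp did not move backwards */
	if (now >= tp->ts_recent_stamp + TCP_PAWS_24DAYS)
		return 0;		/* stored timestamp is too old to trust */
	if (rst && now >= tp->ts_recent_stamp + TCP_PAWS_MSL)
		return 0;		/* relaxed bound for RST segments */
	return 1;
}

int main(void)
{
	struct paws_state tp = { .rcv_tsval = 100, .ts_recent = 200,
				 .ts_recent_stamp = time(NULL) };

	printf("data segment dropped: %d\n", tcp_paws_check(&tp, 0, time(NULL)));
	printf("RST after MSL dropped: %d\n",
	       tcp_paws_check(&tp, 1, time(NULL) + TCP_PAWS_MSL));
	return 0;
}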