Diffstat (limited to 'include/net')
-rw-r--r--  include/net/dst.h      |    1
-rw-r--r--  include/net/snmp.h     |   40
-rw-r--r--  include/net/sock.h     |  118
-rw-r--r--  include/net/tcp.h      |  607
-rw-r--r--  include/net/tcp_ecn.h  |  155
5 files changed, 642 insertions(+), 279 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h
index 4bca9c092..253d72a22 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -44,6 +44,7 @@ struct dst_entry
unsigned ssthresh;
unsigned cwnd;
unsigned advmss;
+ unsigned reordering;
unsigned long rate_last; /* rate limiting for ICMP */
unsigned long rate_tokens;
diff --git a/include/net/snmp.h b/include/net/snmp.h
index 8bcb17085..2bd127299 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -199,7 +199,45 @@ struct linux_mib
unsigned long TCPPrequeueDropped;
unsigned long TCPHPHits;
unsigned long TCPHPHitsToUser;
- unsigned long __pad[32-26];
+ unsigned long TCPPureAcks;
+ unsigned long TCPHPAcks;
+ unsigned long TCPRenoRecovery;
+ unsigned long TCPSackRecovery;
+ unsigned long TCPSACKReneging;
+ unsigned long TCPFACKReorder;
+ unsigned long TCPSACKReorder;
+ unsigned long TCPRenoReorder;
+ unsigned long TCPTSReorder;
+ unsigned long TCPFullUndo;
+ unsigned long TCPPartialUndo;
+ unsigned long TCPDSACKUndo;
+ unsigned long TCPLossUndo;
+ unsigned long TCPLoss;
+ unsigned long TCPLostRetransmit;
+ unsigned long TCPRenoFailures;
+ unsigned long TCPSackFailures;
+ unsigned long TCPLossFailures;
+ unsigned long TCPFastRetrans;
+ unsigned long TCPForwardRetrans;
+ unsigned long TCPSlowStartRetrans;
+ unsigned long TCPTimeouts;
+ unsigned long TCPRenoRecoveryFail;
+ unsigned long TCPSackRecoveryFail;
+ unsigned long TCPSchedulerFailed;
+ unsigned long TCPRcvCollapsed;
+ unsigned long TCPDSACKOldSent;
+ unsigned long TCPDSACKOfoSent;
+ unsigned long TCPDSACKRecv;
+ unsigned long TCPDSACKOfoRecv;
+ unsigned long TCPAbortOnSyn;
+ unsigned long TCPAbortOnData;
+ unsigned long TCPAbortOnClose;
+ unsigned long TCPAbortOnMemory;
+ unsigned long TCPAbortOnTimeout;
+ unsigned long TCPAbortOnLinger;
+ unsigned long TCPAbortFailed;
+ unsigned long TCPMemoryPressures;
+ unsigned long __pad[64-64];
};
#define SNMP_INC_STATS(mib, field) ((mib)[2*smp_processor_id()+!in_softirq()].field++)
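A standalone user-space sketch (not part of the patch) of the per-CPU counter layout implied by the SNMP_INC_STATS() macro above: each CPU owns two slots, one for softirq context and one for process context, so an increment needs neither atomics nor locks. NR_CPUS, my_cpu() and my_in_softirq() are illustrative stand-ins for the kernel's smp_processor_id() and in_softirq(); a reader such as /proc/net/netstat would fold all slots together, as sum_timeouts() does here.

    /* Sketch only -- models the indexing scheme, not kernel code. */
    #include <stdio.h>

    #define NR_CPUS 4

    struct linux_mib_sketch {
            unsigned long TCPTimeouts;
            unsigned long TCPFastRetrans;
    };

    static struct linux_mib_sketch net_statistics[NR_CPUS * 2];

    static int my_cpu(void)        { return 0; } /* pretend we run on CPU 0 */
    static int my_in_softirq(void) { return 0; } /* pretend: process context */

    /* Slot 2*cpu is for softirq context, 2*cpu+1 for process context. */
    #define SKETCH_INC_STATS(mib, field) \
            ((mib)[2 * my_cpu() + !my_in_softirq()].field++)

    static unsigned long sum_timeouts(void)
    {
            unsigned long total = 0;
            int i;

            for (i = 0; i < NR_CPUS * 2; i++)
                    total += net_statistics[i].TCPTimeouts;
            return total;
    }

    int main(void)
    {
            SKETCH_INC_STATS(net_statistics, TCPTimeouts);
            SKETCH_INC_STATS(net_statistics, TCPTimeouts);
            printf("TCPTimeouts = %lu\n", sum_timeouts());
            return 0;
    }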
diff --git a/include/net/sock.h b/include/net/sock.h
index 87a8c3941..38b5549d6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -268,10 +268,10 @@ struct tcp_opt {
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock*/
__u32 ato; /* Predicted tick of soft clock */
+ unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet*/
- __u16 last_seg_size; /* Size of last incoming segment */
- __u16 rcv_mss; /* MSS used for delayed ACK decisions */
- __u32 rcv_segs; /* Number of received segments since last ack */
+ __u16 last_seg_size; /* Size of last incoming segment */
+ __u16 rcv_mss; /* MSS used for delayed ACK decisions */
} ack;
/* Data for direct copy to user */
@@ -284,19 +284,18 @@ struct tcp_opt {
} ucopy;
__u32 snd_wl1; /* Sequence for window update */
- __u32 snd_wl2; /* Ack sequence for update */
__u32 snd_wnd; /* The window we expect to receive */
__u32 max_window; /* Maximal window ever seen from peer */
__u32 pmtu_cookie; /* Last pmtu seen by socket */
__u16 mss_cache; /* Cached effective mss, not including SACKS */
__u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
__u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
- __u8 dup_acks; /* Consecutive duplicate acks seen from other end */
- __u8 retransmits;
+ __u8 ca_state; /* State of fast-retransmit machine */
+ __u8 retransmits; /* Number of unrecovered RTO timeouts. */
- __u8 __empty1;
- __u8 sorry;
- __u8 defer_accept;
+ __u8 reordering; /* Packet reordering metric. */
+ __u8 queue_shrunk; /* Write queue has been shrunk recently.*/
+ __u8 defer_accept; /* User waits for some data after accept() */
/* RTT measurement */
__u8 backoff; /* backoff */
@@ -305,9 +304,9 @@ struct tcp_opt {
__u32 rto; /* retransmit timeout */
__u32 packets_out; /* Packets which are "in flight" */
- __u32 fackets_out; /* Non-retrans SACK'd packets */
- __u32 retrans_out; /* Fast-retransmitted packets out */
- __u32 high_seq; /* snd_nxt at onset of congestion */
+ __u32 left_out; /* Packets which have left the network */
+ __u32 retrans_out; /* Retransmitted packets out */
+
/*
* Slow start and congestion control (see also Nagle, and Karn & Partridge)
@@ -316,12 +315,11 @@ struct tcp_opt {
__u32 snd_cwnd; /* Sending congestion window */
__u16 snd_cwnd_cnt; /* Linear increase counter */
__u16 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
-
- __u8 nonagle; /* Disable Nagle algorithm? */
- __u8 syn_retries; /* num of allowed syn retries */
- __u16 user_mss; /* mss requested by user in ioctl */
+ __u32 snd_cwnd_used;
+ __u32 snd_cwnd_stamp;
/* Two commonly used timers in both sender and receiver paths. */
+ unsigned long timeout;
struct timer_list retransmit_timer; /* Resend (no ack) */
struct timer_list delack_timer; /* Ack delay */
@@ -329,16 +327,12 @@ struct tcp_opt {
struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */
struct sk_buff *send_head; /* Front of stuff to transmit */
- struct sk_buff *retrans_head; /* retrans head can be
- * different to the head of
- * write queue if we are doing
- * fast retransmit
- */
__u32 rcv_wnd; /* Current receiver window */
__u32 rcv_wup; /* rcv_nxt on last window update sent */
- __u32 write_seq;
- __u32 copied_seq;
+ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
+ __u32 pushed_seq; /* Last pushed seq, required to talk to windows */
+ __u32 copied_seq; /* Head of yet unread data */
/*
* Options received (usually on last packet, some only on SYN packets).
*/
@@ -348,7 +342,7 @@ struct tcp_opt {
char saw_tstamp; /* Saw TIMESTAMP on last packet */
__u8 snd_wscale; /* Window scaling received from sender */
__u8 rcv_wscale; /* Window scaling to send to receiver */
- __u8 rexmt_done; /* Retransmitted up to send head? */
+ __u8 nonagle; /* Disable Nagle algorithm? */
__u8 keepalive_probes; /* num of allowed keep alive probes */
/* PAWS/RTTM data */
@@ -358,19 +352,37 @@ struct tcp_opt {
long ts_recent_stamp;/* Time we stored ts_recent (for aging) */
/* SACKs data */
+ __u16 user_mss; /* mss requested by user in ioctl */
+ __u8 dsack; /* D-SACK is scheduled */
+ __u8 eff_sacks; /* Size of SACK array to send with next packet */
+ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
- struct timer_list probe_timer; /* Probes */
__u32 window_clamp; /* Maximal window to advertise */
+ __u32 rcv_ssthresh; /* Current window clamp */
__u8 probes_out; /* unanswered 0 window probes */
__u8 num_sacks; /* Number of SACK blocks */
__u16 advmss; /* Advertised MSS */
- __u32 syn_stamp;
- __u32 syn_seq;
- __u32 fin_seq;
- __u32 urg_seq;
- __u32 urg_data;
+ __u8 syn_retries; /* num of allowed syn retries */
+ __u8 ecn_flags; /* ECN status bits. */
+ __u16 prior_ssthresh; /* ssthresh saved at recovery start */
+ __u32 lost_out; /* Lost packets */
+ __u32 sacked_out; /* SACK'd packets */
+ __u32 fackets_out; /* FACK'd packets */
+ __u32 high_seq; /* snd_nxt at onset of congestion */
+
+ __u32 retrans_stamp; /* Timestamp of the last retransmit,
+ * also used in SYN-SENT to remember stamp of
+ * the first SYN. */
+ __u32 undo_marker; /* tracking retrans started here. */
+ int undo_retrans; /* number of undoable retransmissions. */
+ __u32 syn_seq; /* Seq of received SYN. */
+ __u32 fin_seq; /* Seq of received FIN. */
+ __u32 urg_seq; /* Seq of received urgent pointer */
+ __u16 urg_data; /* Saved octet of OOB data and control flags */
+ __u8 pending; /* Scheduled timer event */
+ __u8 __empty;
/* The syn_wait_lock is necessary only to avoid tcp_get_info having
* to grab the main lock sock while browsing the listening hash
@@ -482,8 +494,8 @@ struct sock {
__u16 sport; /* Source port */
unsigned short family; /* Address family */
- unsigned char reuse, /* SO_REUSEADDR setting */
- __unused;
+ unsigned char reuse; /* SO_REUSEADDR setting */
+ unsigned char shutdown;
atomic_t refcnt; /* Reference count */
socket_lock_t lock; /* Synchronizer... */
@@ -497,6 +509,8 @@ struct sock {
atomic_t wmem_alloc; /* Transmit queue bytes committed */
struct sk_buff_head write_queue; /* Packet sending queue */
atomic_t omem_alloc; /* "o" is "option" or "other" */
+ int wmem_queued; /* Persistent queue size */
+ int forward_alloc; /* Space allocated forward. */
__u32 saddr; /* Sending source */
unsigned int allocation; /* Allocation mode */
int sndbuf; /* Size of send buffer in bytes */
@@ -539,8 +553,6 @@ struct sock {
struct proto *prot;
- unsigned short shutdown;
-
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
union {
struct ipv6_pinfo af_inet6;
@@ -734,6 +746,11 @@ static void __inline__ sock_prot_dec_use(struct proto *prot)
#define RCV_SHUTDOWN 1
#define SEND_SHUTDOWN 2
+#define SOCK_SNDBUF_LOCK 1
+#define SOCK_RCVBUF_LOCK 2
+#define SOCK_BINDADDR_LOCK 4
+
+
/* Used by processes to "lock" a socket state, so that
* interrupts and bottom half handlers won't change it
* from under us. It essentially blocks any incoming
@@ -801,8 +818,6 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk,
int priority);
extern void sock_wfree(struct sk_buff *skb);
extern void sock_rfree(struct sk_buff *skb);
-extern void sock_cfree(struct sk_buff *skb);
-extern unsigned long sock_rspace(struct sock *sk);
extern unsigned long sock_wspace(struct sock *sk);
extern int sock_setsockopt(struct socket *sock, int level,
@@ -982,8 +997,6 @@ extern __inline__ void sock_put(struct sock *sk)
* we do not release it in this function, because protocol
* probably wants some additional cleanups or even continuing
* to work with this socket (TCP).
- *
- * NOTE: When softnet goes in replace _irq with _bh!
*/
extern __inline__ void sock_orphan(struct sock *sk)
{
@@ -1003,6 +1016,25 @@ extern __inline__ void sock_graft(struct sock *sk, struct socket *parent)
write_unlock_bh(&sk->callback_lock);
}
+static inline int sock_i_uid(struct sock *sk)
+{
+ int uid;
+
+ read_lock(&sk->callback_lock);
+ uid = sk->socket ? sk->socket->inode->i_uid : 0;
+ read_unlock(&sk->callback_lock);
+ return uid;
+}
+
+static inline unsigned long sock_i_ino(struct sock *sk)
+{
+ unsigned long ino;
+
+ read_lock(&sk->callback_lock);
+ ino = sk->socket ? sk->socket->inode->i_ino : 0;
+ read_unlock(&sk->callback_lock);
+ return ino;
+}
extern __inline__ struct dst_entry *
__sk_dst_get(struct sock *sk)
@@ -1110,14 +1142,6 @@ extern __inline__ void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
atomic_add(skb->truesize, &sk->rmem_alloc);
}
-extern __inline__ void skb_set_owner_c(struct sk_buff *skb, struct sock *sk)
-{
- sock_hold(sk);
- skb->sk = sk;
- skb->destructor = sock_cfree;
-}
-
-
extern __inline__ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
@@ -1194,7 +1218,7 @@ extern __inline__ void sk_wake_async(struct sock *sk, int how, int band)
}
#define SOCK_MIN_SNDBUF 2048
-#define SOCK_MIN_RCVBUF 128
+#define SOCK_MIN_RCVBUF 256
/* Must be less or equal SOCK_MIN_SNDBUF */
#define SOCK_MIN_WRITE_SPACE SOCK_MIN_SNDBUF
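A rough, hedged model (not the patch's code) of how the wmem_queued and forward_alloc fields added to struct sock above are intended to interact with the global TCP memory pool: bytes queued for transmit are charged against forward_alloc, which is refilled from the global pool in quantum-sized chunks, and freed buffers return their space to forward_alloc rather than to the pool. The QUANTUM value, struct and helper names here are assumptions for illustration, loosely mirroring tcp_charge_skb()/tcp_free_skb() defined later in tcp.h.

    /* Sketch only -- models the accounting, not kernel code. */
    #include <stdio.h>

    #define QUANTUM 4096    /* charge the global pool in page-sized chunks */

    struct sock_sketch {
            int wmem_queued;   /* bytes committed to the write queue */
            int forward_alloc; /* bytes pre-charged but currently unused */
    };

    static int global_allocated; /* stand-in for tcp_memory_allocated */

    static void charge_skb(struct sock_sketch *sk, int truesize)
    {
            while (sk->forward_alloc < truesize) { /* refill from global pool */
                    sk->forward_alloc += QUANTUM;
                    global_allocated += QUANTUM;
            }
            sk->forward_alloc -= truesize;
            sk->wmem_queued += truesize;
    }

    static void free_skb(struct sock_sketch *sk, int truesize)
    {
            sk->wmem_queued -= truesize;
            sk->forward_alloc += truesize; /* freed space stays with the socket */
    }

    int main(void)
    {
            struct sock_sketch sk = { 0, 0 };

            charge_skb(&sk, 1500);
            charge_skb(&sk, 1500);
            free_skb(&sk, 1500);
            printf("queued=%d forward=%d global=%d\n",
                   sk.wmem_queued, sk.forward_alloc, global_allocated);
            return 0;
    }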
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7df845895..d3a63962c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -19,9 +19,13 @@
#define _TCP_H
#define TCP_DEBUG 1
+#define FASTRETRANS_DEBUG 1
+
+/* Be paranoid about data immediately beyond right edge of window. */
#undef TCP_FORMAL_WINDOW
-#define TCP_MORE_COARSE_ACKS
-#undef TCP_LESS_COARSE_ACKS
+
+/* Cancel timers, when they are not required. */
+#undef TCP_CLEAR_TIMERS
#include <linux/config.h>
#include <linux/tcp.h>
@@ -173,7 +177,7 @@ extern __inline__ void tcp_tw_put(struct tcp_tw_bucket *tw)
}
extern atomic_t tcp_orphan_count;
-extern int tcp_tw_count;
+extern int tcp_tw_count;
extern void tcp_time_wait(struct sock *sk, int state, int timeo);
extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
extern void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
@@ -242,12 +246,14 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
/* Minimal RCV_MSS. */
#define TCP_MIN_RCVMSS 536
-/*
- * How much of the receive buffer do we advertize
- * (the rest is reserved for headers and driver packet overhead)
- * Use a power of 2.
- */
-#define TCP_WINDOW_ADVERTISE_DIVISOR 2
+/* After receiving this number of duplicate ACKs, fast retransmit starts. */
+#define TCP_FASTRETRANS_THRESH 3
+
+/* Maximal reordering. */
+#define TCP_MAX_REORDERING 127
+
+/* Maximal number of ACKs sent quickly to accelerate slow-start. */
+#define TCP_MAX_QUICKACKS 16
/* urg_data states */
#define TCP_URG_VALID 0x0100
@@ -292,7 +298,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
#define TCP_DELACK_MAX (HZ/5) /* maximal time to delay before sending an ACK */
#define TCP_DELACK_MIN (2) /* minimal time to delay before sending an ACK,
* 2 scheduler ticks, not depending on HZ. */
-#define TCP_ATO_MAX (HZ/2) /* Clamp ATO estimator at his value. */
#define TCP_ATO_MIN 2
#define TCP_RTO_MAX (120*HZ)
#define TCP_RTO_MIN (HZ/5)
@@ -414,6 +419,19 @@ extern int sysctl_tcp_tw_recycle;
extern int sysctl_tcp_abort_on_overflow;
extern int sysctl_tcp_max_orphans;
extern int sysctl_tcp_max_tw_buckets;
+extern int sysctl_tcp_fack;
+extern int sysctl_tcp_reordering;
+extern int sysctl_tcp_ecn;
+extern int sysctl_tcp_dsack;
+extern int sysctl_tcp_mem[3];
+extern int sysctl_tcp_wmem[3];
+extern int sysctl_tcp_rmem[3];
+extern int sysctl_tcp_app_win;
+extern int sysctl_tcp_adv_win_scale;
+
+extern atomic_t tcp_memory_allocated;
+extern atomic_t tcp_sockets_allocated;
+extern int tcp_memory_pressure;
struct open_request;
@@ -606,6 +624,23 @@ extern int tcp_rcv_established(struct sock *sk,
struct tcphdr *th,
unsigned len);
+enum tcp_ack_state_t
+{
+ TCP_ACK_SCHED = 1,
+ TCP_ACK_TIMER = 2,
+ TCP_ACK_PUSHED= 4
+};
+
+static inline void tcp_schedule_ack(struct tcp_opt *tp)
+{
+ tp->ack.pending |= TCP_ACK_SCHED;
+}
+
+static inline int tcp_ack_scheduled(struct tcp_opt *tp)
+{
+ return tp->ack.pending&TCP_ACK_SCHED;
+}
+
static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
{
if (tp->ack.quick && --tp->ack.quick == 0) {
@@ -614,11 +649,27 @@ static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
}
}
+extern void tcp_enter_quickack_mode(struct tcp_opt *tp);
+
static __inline__ void tcp_delack_init(struct tcp_opt *tp)
{
memset(&tp->ack, 0, sizeof(tp->ack));
}
+enum tcp_ca_state
+{
+ TCP_CA_Open = 0,
+#define TCPF_CA_Open (1<<TCP_CA_Open)
+ TCP_CA_Disorder = 1,
+#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
+ TCP_CA_CWR = 2,
+#define TCPF_CA_CWR (1<<TCP_CA_CWR)
+ TCP_CA_Recovery = 3,
+#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
+ TCP_CA_Loss = 4
+#define TCPF_CA_Loss (1<<TCP_CA_Loss)
+};
+
enum tcp_tw_status
{
@@ -640,6 +691,9 @@ extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
extern int tcp_child_process(struct sock *parent,
struct sock *child,
struct sk_buff *skb);
+extern void tcp_enter_loss(struct sock *sk, int how);
+extern void tcp_clear_retrans(struct tcp_opt *tp);
+extern void tcp_update_metrics(struct sock *sk);
extern void tcp_close(struct sock *sk,
long timeout);
@@ -661,8 +715,8 @@ extern int tcp_recvmsg(struct sock *sk,
extern int tcp_listen_start(struct sock *sk);
-extern void tcp_parse_options(struct sock *sk, struct tcphdr *th,
- struct tcp_opt *tp, int no_fancy);
+extern void tcp_parse_options(struct sk_buff *skb,
+ struct tcp_opt *tp);
/*
* TCP v4 functions exported for the inet6 API
@@ -720,7 +774,6 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
extern int tcp_write_xmit(struct sock *);
extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
-extern void tcp_fack_retransmit(struct sock *);
extern void tcp_xmit_retransmit_queue(struct sock *);
extern void tcp_simple_retransmit(struct sock *);
@@ -736,7 +789,6 @@ extern void tcp_send_ack(struct sock *sk);
extern void tcp_send_delayed_ack(struct sock *sk);
/* tcp_timer.c */
-extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long);
extern void tcp_init_xmit_timers(struct sock *);
extern void tcp_clear_xmit_timers(struct sock *);
@@ -744,6 +796,76 @@ extern void tcp_delete_keepalive_timer (struct sock *);
extern void tcp_reset_keepalive_timer (struct sock *, unsigned long);
extern int tcp_sync_mss(struct sock *sk, u32 pmtu);
+extern const char timer_bug_msg[];
+
+
+static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ switch (what) {
+ case TCP_TIME_RETRANS:
+ case TCP_TIME_PROBE0:
+ tp->pending = 0;
+
+#ifdef TCP_CLEAR_TIMERS
+ if (timer_pending(&tp->retransmit_timer) &&
+ del_timer(&tp->retransmit_timer))
+ __sock_put(sk);
+#endif
+ break;
+ case TCP_TIME_DACK:
+ tp->ack.blocked = 0;
+ tp->ack.pending = 0;
+
+#ifdef TCP_CLEAR_TIMERS
+ if (timer_pending(&tp->delack_timer) &&
+ del_timer(&tp->delack_timer))
+ __sock_put(sk);
+#endif
+ break;
+ default:
+ printk(timer_bug_msg);
+ return;
+ };
+
+}
+
+/*
+ * Reset the retransmission timer
+ */
+static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ if (when > TCP_RTO_MAX) {
+#ifdef TCP_DEBUG
+ printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr());
+#endif
+ when = TCP_RTO_MAX;
+ }
+
+ switch (what) {
+ case TCP_TIME_RETRANS:
+ case TCP_TIME_PROBE0:
+ tp->pending = what;
+ tp->timeout = jiffies+when;
+ if (!mod_timer(&tp->retransmit_timer, tp->timeout))
+ sock_hold(sk);
+ break;
+
+ case TCP_TIME_DACK:
+ tp->ack.pending |= TCP_ACK_TIMER;
+ tp->ack.timeout = jiffies+when;
+ if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
+ sock_hold(sk);
+ break;
+
+ default:
+ printk(KERN_DEBUG "bug: unknown timer value\n");
+ };
+}
+
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*/
@@ -757,9 +879,9 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
if (dst && dst->pmtu != tp->pmtu_cookie)
mss_now = tcp_sync_mss(sk, dst->pmtu);
- if(tp->sack_ok && tp->num_sacks)
+ if (tp->eff_sacks)
mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
+ (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
return mss_now;
}
@@ -774,15 +896,8 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
extern __inline__ void tcp_initialize_rcv_mss(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct dst_entry *dst = __sk_dst_get(sk);
- int mss;
- if (dst)
- mss = dst->advmss;
- else
- mss = tp->mss_cache;
-
- tp->ack.rcv_mss = max(min(mss, TCP_MIN_RCVMSS), TCP_MIN_MSS);
+ tp->ack.rcv_mss = max(min(tp->advmss, TCP_MIN_RCVMSS), TCP_MIN_MSS);
}
static __inline__ void __tcp_fast_path_on(struct tcp_opt *tp, u32 snd_wnd)
@@ -797,9 +912,6 @@ static __inline__ void tcp_fast_path_on(struct tcp_opt *tp)
__tcp_fast_path_on(tp, tp->snd_wnd>>tp->snd_wscale);
}
-
-
-
/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
@@ -819,52 +931,6 @@ static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
*/
extern u32 __tcp_select_window(struct sock *sk);
-/* Chose a new window to advertise, update state in tcp_opt for the
- * socket, and return result with RFC1323 scaling applied. The return
- * value can be stuffed directly into th->window for an outgoing
- * frame.
- */
-extern __inline__ u16 tcp_select_window(struct sock *sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- u32 cur_win = tcp_receive_window(tp);
- u32 new_win = __tcp_select_window(sk);
-
- /* Never shrink the offered window */
- if(new_win < cur_win) {
- /* Danger Will Robinson!
- * Don't update rcv_wup/rcv_wnd here or else
- * we will not be able to advertise a zero
- * window in time. --DaveM
- *
- * Relax Will Robinson.
- */
- new_win = cur_win;
- }
- tp->rcv_wnd = new_win;
- tp->rcv_wup = tp->rcv_nxt;
-
- /* RFC1323 scaling applied */
- new_win >>= tp->rcv_wscale;
-
-#ifdef TCP_FORMAL_WINDOW
- if (new_win == 0) {
- /* If we advertise zero window, disable fast path. */
- tp->pred_flags = 0;
- } else if (cur_win == 0 && tp->pred_flags == 0 &&
- skb_queue_len(&tp->out_of_order_queue) == 0 &&
- !tp->urg_data) {
- /* If we open zero window, enable fast path.
- Without this it will be open by the first data packet,
- it is too late to merge checksumming to copy.
- */
- tcp_fast_path_on(tp);
- }
-#endif
-
- return new_win;
-}
-
/* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot
* of jiffies in the buffer control blocks below. We decidely
@@ -907,6 +973,12 @@ struct tcp_skb_cb {
__u8 sacked; /* State flags for SACK/FACK. */
#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
+#define TCPCB_LOST 0x04 /* SKB is lost */
+#define TCPCB_TAGBITS 0x07 /* All tag bits */
+
+#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
+#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
+
__u16 urg_ptr; /* Valid w/URG flags is set. */
__u32 ack_seq; /* Sequence number ACK'd */
@@ -914,11 +986,28 @@ struct tcp_skb_cb {
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
+#define for_retrans_queue(skb, sk, tp) \
+ for (skb = (sk)->write_queue.next; \
+ (skb != (tp)->send_head) && \
+ (skb != (struct sk_buff *)&(sk)->write_queue); \
+ skb=skb->next)
+
+
+#include <net/tcp_ecn.h>
+
+
/*
* Compute minimal free write space needed to queue new packets.
*/
-#define tcp_min_write_space(__sk) \
- (atomic_read(&(__sk)->wmem_alloc) / 2)
+static inline int tcp_min_write_space(struct sock *sk)
+{
+ return sk->wmem_queued/2;
+}
+
+static inline int tcp_wspace(struct sock *sk)
+{
+ return sk->sndbuf - sk->wmem_queued;
+}
/* This determines how many packets are "in the network" to the best
@@ -932,89 +1021,97 @@ struct tcp_skb_cb {
* Read this equation as:
*
* "Packets sent once on transmission queue" MINUS
- * "Packets acknowledged by FACK information" PLUS
+ * "Packets left network, but not honestly ACKed yet" PLUS
* "Packets fast retransmitted"
*/
static __inline__ int tcp_packets_in_flight(struct tcp_opt *tp)
{
- return tp->packets_out - tp->fackets_out + tp->retrans_out;
+ return tp->packets_out - tp->left_out + tp->retrans_out;
}
/* Recalculate snd_ssthresh, we want to set it to:
*
* one half the current congestion window, but no
* less than two segments
- *
- * We must take into account the current send window
- * as well, however we keep track of that using different
- * units so a conversion is necessary. -DaveM
- *
- * RED-PEN.
- * RFC 2581: "an easy mistake to make is to simply use cwnd,
- * rather than FlightSize"
- * I see no references to FlightSize here. snd_wnd is not FlightSize,
- * it is also apriory characteristics.
- *
- * FlightSize = min((snd_nxt-snd_una)/mss, packets_out) ?
*/
extern __inline__ __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
{
- u32 FlightSize = (tp->snd_nxt - tp->snd_una)/tp->mss_cache;
+ return max(tp->snd_cwnd>>1, 2);
+}
+
+/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
+ * The exception is rate halving phase, when cwnd is decreasing towards
+ * ssthresh.
+ */
+extern __inline__ __u32 tcp_current_ssthresh(struct tcp_opt *tp)
+{
+ if ((1<<tp->ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery))
+ return tp->snd_ssthresh;
+ else
+ return max(tp->snd_ssthresh, (tp->snd_cwnd>>1)+(tp->snd_cwnd>>2));
+}
+
+extern void tcp_cwnd_application_limited(struct sock *sk);
- FlightSize = min(FlightSize, tcp_packets_in_flight(tp));
+/* Congestion window validation. (RFC2861) */
- return max(min(FlightSize, tp->snd_cwnd) >> 1, 2);
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp)
+{
+ if (tp->packets_out >= tp->snd_cwnd) {
+ /* Network is fed fully. */
+ tp->snd_cwnd_used = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else {
+ /* Network starves. */
+ if (tp->packets_out > tp->snd_cwnd_used)
+ tp->snd_cwnd_used = tp->packets_out;
+
+ if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+ tcp_cwnd_application_limited(sk);
+ }
}
/* Set slow start threshold and cwnd, not falling into slow start */
-extern __inline__ void __tcp_enter_cong_avoid(struct tcp_opt *tp)
+extern __inline__ void __tcp_enter_cwr(struct tcp_opt *tp)
{
+ tp->undo_marker = 0;
tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
tp->snd_cwnd_cnt = 0;
tp->high_seq = tp->snd_nxt;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ TCP_ECN_queue_cwr(tp);
}
-extern __inline__ void tcp_enter_cong_avoid(struct tcp_opt *tp)
+extern __inline__ void tcp_enter_cwr(struct tcp_opt *tp)
{
- if (!tp->high_seq || after(tp->snd_nxt, tp->high_seq))
- __tcp_enter_cong_avoid(tp);
+ tp->prior_ssthresh = 0;
+ if (tp->ca_state < TCP_CA_CWR) {
+ __tcp_enter_cwr(tp);
+ tp->ca_state = TCP_CA_CWR;
+ }
}
+extern __u32 tcp_init_cwnd(struct tcp_opt *tp);
-/* Increase initial CWND conservatively, i.e. only if estimated
- RTT is low enough. It is not quite correct, we should use
- POWER i.e. RTT*BANDWIDTH, but we still cannot estimate this.
-
- Numbers are taken from RFC1414.
+/* Slow start with delayed ACK produces a burst of 3 packets, so
+ * it is safe "de facto".
*/
-static __inline__ __u32 tcp_init_cwnd(struct tcp_opt *tp)
+static __inline__ __u32 tcp_max_burst(struct tcp_opt *tp)
{
- __u32 cwnd;
-
- if (!tp->srtt || tp->srtt > ((HZ/50)<<3) || tp->mss_cache > 1460)
- cwnd = 2;
- else if (tp->mss_cache > 1095)
- cwnd = 3;
- else
- cwnd = 4;
-
- return min(cwnd, tp->snd_cwnd_clamp);
+ return 3;
}
-
static __inline__ int tcp_minshall_check(struct tcp_opt *tp)
{
return after(tp->snd_sml,tp->snd_una) &&
!after(tp->snd_sml, tp->snd_nxt);
}
-static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, int len)
+static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, struct sk_buff *skb)
{
- if (len < mss)
- tp->snd_sml = tp->snd_nxt;
+ if (skb->len < mss)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}
/* Return 0, if packet can be sent now without violation Nagle's rules:
@@ -1041,17 +1138,6 @@ static __inline__ int tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, u
static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
unsigned cur_mss, int tail)
{
- /*
- * Reset CWND after idle period longer RTO to "restart window".
- * It is "side" effect of the function, which is _not_ good
- * from viewpoint of clarity. But we have to make it before
- * checking congestion window below. Alternative is to prepend
- * all the calls with this test.
- */
- if (tp->packets_out==0 &&
- (s32)(tcp_time_stamp - tp->lsndtime) > tp->rto)
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_init_cwnd(tp));
-
/* RFC 1122 - section 4.2.3.4
*
* We must queue if
@@ -1062,8 +1148,7 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
* (part of SWS is done on packetization)
* Minshall version sounds: there are no _small_
* segments in flight. (tcp_nagle_check)
- * c) We are retransmiting [Nagle]
- * d) We have too many packets 'in flight'
+ * c) We have too many packets 'in flight'
*
* Don't use the nagle rule for urgent data (or
* for the final FIN -DaveM).
@@ -1081,13 +1166,12 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
skb_tailroom(skb) < 32) &&
((tcp_packets_in_flight(tp) < tp->snd_cwnd) ||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
- !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
- tp->retransmits == 0);
+ !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
}
static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp)
{
- if (!tp->packets_out && !timer_pending(&tp->probe_timer))
+ if (!tp->packets_out && !tp->pending)
tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
}
@@ -1111,6 +1195,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk,
tcp_write_xmit(sk))
tcp_check_probe_timer(sk, tp);
}
+ tcp_cwnd_validate(sk, tp);
}
static __inline__ void tcp_push_pending_frames(struct sock *sk,
@@ -1119,6 +1204,24 @@ static __inline__ void tcp_push_pending_frames(struct sock *sk,
__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk));
}
+static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
+{
+ struct sk_buff *skb = tp->send_head;
+
+ return (skb &&
+ tcp_snd_test(tp, skb, tcp_current_mss(sk), tcp_skb_is_last(sk, skb)));
+}
+
+static __inline__ void tcp_init_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+{
+ tp->snd_wl1 = seq;
+}
+
+static __inline__ void tcp_update_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+{
+ tp->snd_wl1 = seq;
+}
+
extern void tcp_destroy_sock(struct sock *sk);
@@ -1143,7 +1246,6 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
__tcp_checksum_complete(skb);
}
-
/* Prequeue for VJ style copy to user, combined with checksumming. */
static __inline__ void tcp_prequeue_init(struct tcp_opt *tp)
@@ -1167,12 +1269,15 @@ static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
if (tp->ucopy.task) {
if ((tp->ucopy.memory += skb->truesize) <= (sk->rcvbuf<<1)) {
__skb_queue_tail(&tp->ucopy.prequeue, skb);
- if (skb_queue_len(&tp->ucopy.prequeue) == 1)
+ if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
wake_up_interruptible(sk->sleep);
+ if (!tcp_ack_scheduled(tp))
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
+ }
} else {
NET_INC_STATS_BH(TCPPrequeueDropped);
tp->ucopy.memory -= skb->truesize;
- kfree_skb(skb);
+ __kfree_skb(skb);
}
return 1;
}
@@ -1231,6 +1336,13 @@ static __inline__ void tcp_done(struct sock *sk)
tcp_destroy_sock(sk);
}
+static __inline__ void tcp_sack_reset(struct tcp_opt *tp)
+{
+ tp->dsack = 0;
+ tp->eff_sacks = 0;
+ tp->num_sacks = 0;
+}
+
static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp)
{
if (tp->tstamp_ok) {
@@ -1241,17 +1353,22 @@ static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *
*ptr++ = htonl(tstamp);
*ptr++ = htonl(tp->ts_recent);
}
- if(tp->sack_ok && tp->num_sacks) {
+ if (tp->eff_sacks) {
+ struct tcp_sack_block *sp = tp->dsack ? tp->duplicate_sack : tp->selective_acks;
int this_sack;
*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)));
- for(this_sack = 0; this_sack < tp->num_sacks; this_sack++) {
- *ptr++ = htonl(tp->selective_acks[this_sack].start_seq);
- *ptr++ = htonl(tp->selective_acks[this_sack].end_seq);
+ (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)));
+ for(this_sack = 0; this_sack < tp->eff_sacks; this_sack++) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+ if (tp->dsack) {
+ tp->dsack = 0;
+ tp->eff_sacks--;
}
}
}
@@ -1330,42 +1447,44 @@ extern __inline__ void tcp_select_initial_window(int space, __u32 mss,
space >>= 1;
(*rcv_wscale)++;
}
+ if (*rcv_wscale && sysctl_tcp_app_win && space>=mss &&
+ space - max((space>>sysctl_tcp_app_win), mss>>*rcv_wscale) < 65536/2)
+ (*rcv_wscale)--;
+ }
+
+ /* Set the initial window to a value large enough for senders
+ * following RFC 2414. Senders not following this RFC
+ * will be satisfied with 2.
+ */
+ if (mss > (1<<*rcv_wscale)) {
+ int init_cwnd = 4;
+ if (mss > 1460*3)
+ init_cwnd = 2;
+ else if (mss > 1460)
+ init_cwnd = 3;
+ if (*rcv_wnd > init_cwnd*mss)
+ *rcv_wnd = init_cwnd*mss;
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min(65535<<(*rcv_wscale),*window_clamp);
}
+static inline int tcp_win_from_space(int space)
+{
+ return sysctl_tcp_adv_win_scale<=0 ?
+ (space>>(-sysctl_tcp_adv_win_scale)) :
+ space - (space>>sysctl_tcp_adv_win_scale);
+}
+
/* Note: caller must be prepared to deal with negative returns */
extern __inline__ int tcp_space(struct sock *sk)
{
- return (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) /
- TCP_WINDOW_ADVERTISE_DIVISOR;
+ return tcp_win_from_space(sk->rcvbuf - atomic_read(&sk->rmem_alloc));
}
extern __inline__ int tcp_full_space( struct sock *sk)
{
- return sk->rcvbuf / TCP_WINDOW_ADVERTISE_DIVISOR;
-}
-
-extern __inline__ void tcp_init_buffer_space(struct sock *sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int rcvbuf = tp->advmss+MAX_TCP_HEADER+16+sizeof(struct sk_buff);
- int sndbuf = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff);
-
- if (sk->rcvbuf < 3*rcvbuf)
- sk->rcvbuf = min (3*rcvbuf, sysctl_rmem_max);
-
- /* Reserve slack space to reduce jitter of advertised window. */
- if (tp->window_clamp >= tcp_full_space(sk)) {
- int nwin = tcp_full_space(sk) - tp->mss_clamp;
-
- if (nwin >= MAX_TCP_WINDOW && nwin >= 2*tp->advmss)
- tp->window_clamp = nwin;
- }
-
- if (sk->sndbuf < 3*sndbuf)
- sk->sndbuf = min (3*sndbuf, sysctl_wmem_max);
+ return tcp_win_from_space(sk->rcvbuf);
}
extern __inline__ void tcp_acceptq_removed(struct sock *sk)
@@ -1473,61 +1592,85 @@ static __inline__ void tcp_openreq_init(struct open_request *req,
req->snd_wscale = tp->snd_wscale;
req->wscale_ok = tp->wscale_ok;
req->acked = 0;
+ req->ecn_ok = 0;
req->rmt_port = skb->h.th->source;
}
-extern const char timer_bug_msg[];
+#define TCP_MEM_QUANTUM ((int)PAGE_SIZE)
-static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
+static inline void tcp_free_skb(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct timer_list *timer;
-
- switch (what) {
- case TCP_TIME_RETRANS:
- timer = &tp->retransmit_timer;
- break;
- case TCP_TIME_DACK:
- tp->ack.blocked = 0;
- timer = &tp->delack_timer;
- break;
- case TCP_TIME_PROBE0:
- timer = &tp->probe_timer;
- break;
- default:
- printk(timer_bug_msg);
- return;
- };
+ sk->tp_pinfo.af_tcp.queue_shrunk = 1;
+ sk->wmem_queued -= skb->truesize;
+ sk->forward_alloc += skb->truesize;
+ __kfree_skb(skb);
+}
- if (timer_pending(timer) && del_timer(timer))
- __sock_put(sk);
+static inline void tcp_charge_skb(struct sock *sk, struct sk_buff *skb)
+{
+ sk->wmem_queued += skb->truesize;
+ sk->forward_alloc -= skb->truesize;
}
-/* This function does not return reliable answer. Use it only as advice.
- */
+extern void __tcp_mem_reclaim(struct sock *sk);
+extern int tcp_mem_schedule(struct sock *sk, int size, int kind);
-static inline int tcp_timer_is_set(struct sock *sk, int what)
+static inline void tcp_mem_reclaim(struct sock *sk)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- int ret;
+ if (sk->forward_alloc >= TCP_MEM_QUANTUM)
+ __tcp_mem_reclaim(sk);
+}
- switch (what) {
- case TCP_TIME_RETRANS:
- ret = timer_pending(&tp->retransmit_timer);
- break;
- case TCP_TIME_DACK:
- ret = timer_pending(&tp->delack_timer);
- break;
- case TCP_TIME_PROBE0:
- ret = timer_pending(&tp->probe_timer);
- break;
- default:
- ret = 0;
- printk(timer_bug_msg);
- };
- return ret;
+static inline void tcp_enter_memory_pressure(void)
+{
+ if (!tcp_memory_pressure) {
+ NET_INC_STATS(TCPMemoryPressures);
+ tcp_memory_pressure = 1;
+ }
}
+static inline void tcp_moderate_sndbuf(struct sock *sk)
+{
+ if (!(sk->userlocks&SOCK_SNDBUF_LOCK)) {
+ sk->sndbuf = min(sk->sndbuf, sk->wmem_queued/2);
+ sk->sndbuf = max(sk->sndbuf, SOCK_MIN_SNDBUF);
+ }
+}
+
+static inline struct sk_buff *tcp_alloc_skb(struct sock *sk, int size, int gfp)
+{
+ struct sk_buff *skb = alloc_skb(size, gfp);
+
+ if (skb) {
+ if (sk->forward_alloc >= (int)skb->truesize ||
+ tcp_mem_schedule(sk, skb->truesize, 0))
+ return skb;
+ __kfree_skb(skb);
+ } else {
+ tcp_enter_memory_pressure();
+ tcp_moderate_sndbuf(sk);
+ }
+ return NULL;
+}
+
+static inline void tcp_writequeue_purge(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sk->write_queue)) != NULL)
+ tcp_free_skb(sk, skb);
+ tcp_mem_reclaim(sk);
+}
+
+extern void tcp_rfree(struct sk_buff *skb);
+
+static inline void tcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+ skb->sk = sk;
+ skb->destructor = tcp_rfree;
+ atomic_add(skb->truesize, &sk->rmem_alloc);
+ sk->forward_alloc -= skb->truesize;
+}
extern void tcp_listen_wlock(void);
@@ -1570,28 +1713,30 @@ static inline int tcp_fin_time(struct tcp_opt *tp)
return fin_timeout;
}
-#if 0 /* TCP_DEBUG */
-#define TCP_CHECK_TIMER(sk) \
-do { struct tcp_opt *__tp = &sk->tp_pinfo.af_tcp; \
- if (sk->state != TCP_CLOSE) { \
- if (__tp->packets_out) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS) && !timer_is_running(&__tp->retransmit_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p RETRANS" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } else if (__tp->send_head) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_PROBE0) && !timer_is_running(&__tp->probe_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p PROBE0" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } \
- if (__tp->ack.pending) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_DACK) && !timer_is_running(&__tp->delack_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p DACK" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } \
- if (__tp->packets_out > skb_queue_len(&sk->write_queue) || \
- (__tp->send_head && skb_queue_len(&sk->write_queue) == 0)) { \
- printk(KERN_DEBUG "sk=%p QUEUE" __FUNCTION__ "(%d) %d %d %d %p\n", sk, __LINE__, sk->state, __tp->packets_out, skb_queue_len(&sk->write_queue), __tp->send_head); \
- } \
- } } while (0)
-#else
+static inline int tcp_paws_check(struct tcp_opt *tp, int rst)
+{
+ if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0)
+ return 0;
+ if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS)
+ return 0;
+
+ /* RST segments are not recommended to carry timestamps,
+ and, if they do, it is recommended to ignore PAWS because
+ "their cleanup function should take precedence over timestamps."
+ Certainly, this is a mistake. To relax this constraint we must
+ understand why it exists: if a peer reboots, its clock may go
+ out of sync and half-open connections would never be reset.
+ The problem would not exist at all if every implementation
+ followed the draft about maintaining clocks across reboots.
+ Linux-2.2 DOES NOT!
+
+ However, we can relax time bounds for RST segments to MSL.
+ */
+ if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL)
+ return 0;
+ return 1;
+}
+
#define TCP_CHECK_TIMER(sk) do { } while (0);
-#endif
#endif /* _TCP_H */
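A small self-contained example (assumptions: a user-space build and a tcp_adv_win_scale default of 2) of the tcp_win_from_space() arithmetic that replaces the old fixed TCP_WINDOW_ADVERTISE_DIVISOR: a positive scale reserves space>>scale bytes of the receive buffer for overhead, while a negative scale advertises only space>>(-scale).

    /* Sketch only -- same arithmetic as tcp_win_from_space() above. */
    #include <stdio.h>

    static int sysctl_tcp_adv_win_scale = 2; /* assumed default */

    static int win_from_space(int space)
    {
            return sysctl_tcp_adv_win_scale <= 0 ?
                    space >> (-sysctl_tcp_adv_win_scale) :
                    space - (space >> sysctl_tcp_adv_win_scale);
    }

    int main(void)
    {
            int rcvbuf = 65536;

            /* scale = 2: advertise 3/4 of the buffer, keep 1/4 for overhead */
            printf("scale=%2d  window=%d\n",
                   sysctl_tcp_adv_win_scale, win_from_space(rcvbuf));

            /* scale = -1: advertise only half the buffer, like the old divisor */
            sysctl_tcp_adv_win_scale = -1;
            printf("scale=%2d  window=%d\n",
                   sysctl_tcp_adv_win_scale, win_from_space(rcvbuf));
            return 0;
    }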
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
new file mode 100644
index 000000000..db004249f
--- /dev/null
+++ b/include/net/tcp_ecn.h
@@ -0,0 +1,155 @@
+#ifndef _NET_TCP_ECN_H_
+#define _NET_TCP_ECN_H_ 1
+
+#include <linux/config.h>
+
+#ifdef CONFIG_INET_ECN
+
+#include <net/inet_ecn.h>
+
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)|TCP_FLAG_ECE|TCP_FLAG_CWR)
+
+#define TCP_ECN_OK 1
+#define TCP_ECN_QUEUE_CWR 2
+#define TCP_ECN_DEMAND_CWR 4
+
+static __inline__ void
+TCP_ECN_queue_cwr(struct tcp_opt *tp)
+{
+ if (tp->ecn_flags&TCP_ECN_OK)
+ tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+
+/* Output functions */
+
+static __inline__ void
+TCP_ECN_send_synack(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
+ if (!(tp->ecn_flags&TCP_ECN_OK))
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
+}
+
+static __inline__ void
+TCP_ECN_send_syn(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ tp->ecn_flags = 0;
+ if (sysctl_tcp_ecn) {
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR;
+ tp->ecn_flags = TCP_ECN_OK;
+ }
+}
+
+static __inline__ void
+TCP_ECN_make_synack(struct open_request *req, struct tcphdr *th)
+{
+ if (req->ecn_ok)
+ th->ece = 1;
+}
+
+static __inline__ void
+TCP_ECN_send(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb, int tcp_header_len)
+{
+ if (tp->ecn_flags & TCP_ECN_OK) {
+ /* Not-retransmitted data segment: set ECT and inject CWR. */
+ if (skb->len != tcp_header_len &&
+ !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
+ INET_ECN_xmit(sk);
+ if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) {
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+ skb->h.th->cwr = 1;
+ }
+ } else {
+ /* ACK or retransmitted segment: clear ECT|CE */
+ INET_ECN_dontxmit(sk);
+ }
+ if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
+ skb->h.th->ece = 1;
+ }
+}
+
+/* Input functions */
+
+static __inline__ void
+TCP_ECN_accept_cwr(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ if (skb->h.th->cwr)
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static __inline__ void
+TCP_ECN_check_ce(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ if (tp->ecn_flags&TCP_ECN_OK) {
+ if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ /* Funny extension: if ECT is not set on a segment,
+ * it is surely a retransmit. This is not in the ECN RFC,
+ * but Linux follows this rule. */
+ else if (!INET_ECN_is_capable((TCP_SKB_CB(skb)->flags)))
+ tcp_enter_quickack_mode(tp);
+ }
+}
+
+static __inline__ void
+TCP_ECN_rcv_synack(struct tcp_opt *tp, struct tcphdr *th)
+{
+ if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static __inline__ void
+TCP_ECN_rcv_syn(struct tcp_opt *tp, struct tcphdr *th)
+{
+ if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static __inline__ int
+TCP_ECN_rcv_ecn_echo(struct tcp_opt *tp, struct tcphdr *th)
+{
+ if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK))
+ return 1;
+ return 0;
+}
+
+static __inline__ void
+TCP_ECN_openreq_child(struct tcp_opt *tp, struct open_request *req)
+{
+ tp->ecn_flags = req->ecn_ok ? TCP_ECN_OK : 0;
+}
+
+static __inline__ void
+TCP_ECN_create_request(struct open_request *req, struct tcphdr *th)
+{
+ if (sysctl_tcp_ecn && th->ece && th->cwr)
+ req->ecn_ok = 1;
+}
+
+
+
+#else
+
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+
+#define TCP_ECN_send_syn(x...) do { } while (0)
+#define TCP_ECN_send_synack(x...) do { } while (0)
+#define TCP_ECN_make_synack(x...) do { } while (0)
+#define TCP_ECN_send(x...) do { } while (0)
+
+#define TCP_ECN_queue_cwr(x...) do { } while (0)
+
+#define TCP_ECN_accept_cwr(x...) do { } while (0)
+#define TCP_ECN_check_ce(x...) do { } while (0)
+#define TCP_ECN_rcv_synack(x...) do { } while (0)
+#define TCP_ECN_rcv_syn(x...) do { } while (0)
+#define TCP_ECN_rcv_ecn_echo(x...) (0)
+#define TCP_ECN_openreq_child(x...) do { } while (0)
+#define TCP_ECN_create_request(x...) do { } while (0)
+
+
+#endif
+
+#endif
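Finally, a toy model (not kernel code) of the receiver-side ECE/CWR handshake that TCP_ECN_check_ce() and TCP_ECN_accept_cwr() implement: once a CE-marked segment arrives, ECE is echoed on every outgoing ACK until the sender answers with CWR. The flag values mirror the header above; the segment struct and function names are illustrative.

    /* Sketch only -- models the flag logic, not kernel code. */
    #include <stdio.h>

    #define ECN_OK         1
    #define ECN_DEMAND_CWR 4

    struct seg { int ce; int cwr; }; /* IP CE mark, TCP CWR flag */

    static void rx_check_ce(int *ecn_flags, const struct seg *s)
    {
            if ((*ecn_flags & ECN_OK) && s->ce)
                    *ecn_flags |= ECN_DEMAND_CWR;  /* start demanding CWR */
    }

    static void rx_accept_cwr(int *ecn_flags, const struct seg *s)
    {
            if (s->cwr)
                    *ecn_flags &= ~ECN_DEMAND_CWR; /* sender reacted, stop echoing */
    }

    static int tx_ece(int ecn_flags)
    {
            return (ecn_flags & ECN_DEMAND_CWR) != 0; /* ECE bit on outgoing ACK */
    }

    int main(void)
    {
            int flags = ECN_OK;
            struct seg marked = { 1, 0 }, cwr_seg = { 0, 1 };

            rx_check_ce(&flags, &marked);
            printf("after CE:  send ECE=%d\n", tx_ece(flags));  /* 1 */

            rx_accept_cwr(&flags, &cwr_seg);
            rx_check_ce(&flags, &cwr_seg);
            printf("after CWR: send ECE=%d\n", tx_ece(flags));  /* 0 */
            return 0;
    }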