Diffstat (limited to 'include/net')
-rw-r--r--  include/net/dst.h      |    1
-rw-r--r--  include/net/snmp.h     |   40
-rw-r--r--  include/net/sock.h     |  118
-rw-r--r--  include/net/tcp.h      |  607
-rw-r--r--  include/net/tcp_ecn.h  |  155
5 files changed, 642 insertions(+), 279 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h
index 4bca9c092..253d72a22 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -44,6 +44,7 @@ struct dst_entry
unsigned ssthresh;
unsigned cwnd;
unsigned advmss;
+ unsigned reordering;
unsigned long rate_last; /* rate limiting for ICMP */
unsigned long rate_tokens;
diff --git a/include/net/snmp.h b/include/net/snmp.h
index 8bcb17085..2bd127299 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -199,7 +199,45 @@ struct linux_mib
unsigned long TCPPrequeueDropped;
unsigned long TCPHPHits;
unsigned long TCPHPHitsToUser;
- unsigned long __pad[32-26];
+ unsigned long TCPPureAcks;
+ unsigned long TCPHPAcks;
+ unsigned long TCPRenoRecovery;
+ unsigned long TCPSackRecovery;
+ unsigned long TCPSACKReneging;
+ unsigned long TCPFACKReorder;
+ unsigned long TCPSACKReorder;
+ unsigned long TCPRenoReorder;
+ unsigned long TCPTSReorder;
+ unsigned long TCPFullUndo;
+ unsigned long TCPPartialUndo;
+ unsigned long TCPDSACKUndo;
+ unsigned long TCPLossUndo;
+ unsigned long TCPLoss;
+ unsigned long TCPLostRetransmit;
+ unsigned long TCPRenoFailures;
+ unsigned long TCPSackFailures;
+ unsigned long TCPLossFailures;
+ unsigned long TCPFastRetrans;
+ unsigned long TCPForwardRetrans;
+ unsigned long TCPSlowStartRetrans;
+ unsigned long TCPTimeouts;
+ unsigned long TCPRenoRecoveryFail;
+ unsigned long TCPSackRecoveryFail;
+ unsigned long TCPSchedulerFailed;
+ unsigned long TCPRcvCollapsed;
+ unsigned long TCPDSACKOldSent;
+ unsigned long TCPDSACKOfoSent;
+ unsigned long TCPDSACKRecv;
+ unsigned long TCPDSACKOfoRecv;
+ unsigned long TCPAbortOnSyn;
+ unsigned long TCPAbortOnData;
+ unsigned long TCPAbortOnClose;
+ unsigned long TCPAbortOnMemory;
+ unsigned long TCPAbortOnTimeout;
+ unsigned long TCPAbortOnLinger;
+ unsigned long TCPAbortFailed;
+ unsigned long TCPMemoryPressures;
+ unsigned long __pad[64-64];
};
#define SNMP_INC_STATS(mib, field) ((mib)[2*smp_processor_id()+!in_softirq()].field++)
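A standalone user-space sketch (not part of the patch) of the per-CPU counter layout implied by the SNMP_INC_STATS() macro above: each CPU owns two slots, one for softirq context and one for process context, so an increment needs neither atomics nor locks. NR_CPUS, my_cpu() and my_in_softirq() are illustrative stand-ins for the kernel's smp_processor_id() and in_softirq(); a reader such as /proc/net/netstat would fold all slots together, as sum_timeouts() does here.

    /* Sketch only -- models the indexing scheme, not kernel code. */
    #include <stdio.h>

    #define NR_CPUS 4

    struct linux_mib_sketch {
            unsigned long TCPTimeouts;
            unsigned long TCPFastRetrans;
    };

    static struct linux_mib_sketch net_statistics[NR_CPUS * 2];

    static int my_cpu(void)        { return 0; } /* pretend we run on CPU 0 */
    static int my_in_softirq(void) { return 0; } /* pretend: process context */

    /* Slot 2*cpu is for softirq context, 2*cpu+1 for process context. */
    #define SKETCH_INC_STATS(mib, field) \
            ((mib)[2 * my_cpu() + !my_in_softirq()].field++)

    static unsigned long sum_timeouts(void)
    {
            unsigned long total = 0;
            int i;

            for (i = 0; i < NR_CPUS * 2; i++)
                    total += net_statistics[i].TCPTimeouts;
            return total;
    }

    int main(void)
    {
            SKETCH_INC_STATS(net_statistics, TCPTimeouts);
            SKETCH_INC_STATS(net_statistics, TCPTimeouts);
            printf("TCPTimeouts = %lu\n", sum_timeouts());
            return 0;
    }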
diff --git a/include/net/sock.h b/include/net/sock.h
index 87a8c3941..38b5549d6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -268,10 +268,10 @@ struct tcp_opt {
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock*/
__u32 ato; /* Predicted tick of soft clock */
+ unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet*/
- __u16 last_seg_size; /* Size of last incoming segment */
- __u16 rcv_mss; /* MSS used for delayed ACK decisions */
- __u32 rcv_segs; /* Number of received segments since last ack */
+ __u16 last_seg_size; /* Size of last incoming segment */
+ __u16 rcv_mss; /* MSS used for delayed ACK decisions */
} ack;
/* Data for direct copy to user */
@@ -284,19 +284,18 @@ struct tcp_opt {
} ucopy;
__u32 snd_wl1; /* Sequence for window update */
- __u32 snd_wl2; /* Ack sequence for update */
__u32 snd_wnd; /* The window we expect to receive */
__u32 max_window; /* Maximal window ever seen from peer */
__u32 pmtu_cookie; /* Last pmtu seen by socket */
__u16 mss_cache; /* Cached effective mss, not including SACKS */
__u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
__u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
- __u8 dup_acks; /* Consecutive duplicate acks seen from other end */
- __u8 retransmits;
+ __u8 ca_state; /* State of fast-retransmit machine */
+ __u8 retransmits; /* Number of unrecovered RTO timeouts. */
- __u8 __empty1;
- __u8 sorry;
- __u8 defer_accept;
+ __u8 reordering; /* Packet reordering metric. */
+ __u8 queue_shrunk; /* Write queue has been shrunk recently.*/
+ __u8 defer_accept; /* User waits for some data after accept() */
/* RTT measurement */
__u8 backoff; /* backoff */
@@ -305,9 +304,9 @@ struct tcp_opt {
__u32 rto; /* retransmit timeout */
__u32 packets_out; /* Packets which are "in flight" */
- __u32 fackets_out; /* Non-retrans SACK'd packets */
- __u32 retrans_out; /* Fast-retransmitted packets out */
- __u32 high_seq; /* snd_nxt at onset of congestion */
+ __u32 left_out; /* Packets which have left the network */
+ __u32 retrans_out; /* Retransmitted packets out */
+
/*
* Slow start and congestion control (see also Nagle, and Karn & Partridge)
@@ -316,12 +315,11 @@ struct tcp_opt {
__u32 snd_cwnd; /* Sending congestion window */
__u16 snd_cwnd_cnt; /* Linear increase counter */
__u16 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
-
- __u8 nonagle; /* Disable Nagle algorithm? */
- __u8 syn_retries; /* num of allowed syn retries */
- __u16 user_mss; /* mss requested by user in ioctl */
+ __u32 snd_cwnd_used;
+ __u32 snd_cwnd_stamp;
/* Two commonly used timers in both sender and receiver paths. */
+ unsigned long timeout;
struct timer_list retransmit_timer; /* Resend (no ack) */
struct timer_list delack_timer; /* Ack delay */
@@ -329,16 +327,12 @@ struct tcp_opt {
struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */
struct sk_buff *send_head; /* Front of stuff to transmit */
- struct sk_buff *retrans_head; /* retrans head can be
- * different to the head of
- * write queue if we are doing
- * fast retransmit
- */
__u32 rcv_wnd; /* Current receiver window */
__u32 rcv_wup; /* rcv_nxt on last window update sent */
- __u32 write_seq;
- __u32 copied_seq;
+ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
+ __u32 pushed_seq; /* Last pushed seq, required to talk to windows */
+ __u32 copied_seq; /* Head of yet unread data */
/*
* Options received (usually on last packet, some only on SYN packets).
*/
@@ -348,7 +342,7 @@ struct tcp_opt {
char saw_tstamp; /* Saw TIMESTAMP on last packet */
__u8 snd_wscale; /* Window scaling received from sender */
__u8 rcv_wscale; /* Window scaling to send to receiver */
- __u8 rexmt_done; /* Retransmitted up to send head? */
+ __u8 nonagle; /* Disable Nagle algorithm? */
__u8 keepalive_probes; /* num of allowed keep alive probes */
/* PAWS/RTTM data */
@@ -358,19 +352,37 @@ struct tcp_opt {
long ts_recent_stamp;/* Time we stored ts_recent (for aging) */
/* SACKs data */
+ __u16 user_mss; /* mss requested by user in ioctl */
+ __u8 dsack; /* D-SACK is scheduled */
+ __u8 eff_sacks; /* Size of SACK array to send with next packet */
+ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
- struct timer_list probe_timer; /* Probes */
__u32 window_clamp; /* Maximal window to advertise */
+ __u32 rcv_ssthresh; /* Current window clamp */
__u8 probes_out; /* unanswered 0 window probes */
__u8 num_sacks; /* Number of SACK blocks */
__u16 advmss; /* Advertised MSS */
- __u32 syn_stamp;
- __u32 syn_seq;
- __u32 fin_seq;
- __u32 urg_seq;
- __u32 urg_data;
+ __u8 syn_retries; /* num of allowed syn retries */
+ __u8 ecn_flags; /* ECN status bits. */
+ __u16 prior_ssthresh; /* ssthresh saved at recovery start */
+ __u32 lost_out; /* Lost packets */
+ __u32 sacked_out; /* SACK'd packets */
+ __u32 fackets_out; /* FACK'd packets */
+ __u32 high_seq; /* snd_nxt at onset of congestion */
+
+ __u32 retrans_stamp; /* Timestamp of the last retransmit,
+ * also used in SYN-SENT to remember stamp of
+ * the first SYN. */
+ __u32 undo_marker; /* tracking retrans started here. */
+ int undo_retrans; /* number of undoable retransmissions. */
+ __u32 syn_seq; /* Seq of received SYN. */
+ __u32 fin_seq; /* Seq of received FIN. */
+ __u32 urg_seq; /* Seq of received urgent pointer */
+ __u16 urg_data; /* Saved octet of OOB data and control flags */
+ __u8 pending; /* Scheduled timer event */
+ __u8 __empty;
/* The syn_wait_lock is necessary only to avoid tcp_get_info having
* to grab the main lock sock while browsing the listening hash
@@ -482,8 +494,8 @@ struct sock {
__u16 sport; /* Source port */
unsigned short family; /* Address family */
- unsigned char reuse, /* SO_REUSEADDR setting */
- __unused;
+ unsigned char reuse; /* SO_REUSEADDR setting */
+ unsigned char shutdown;
atomic_t refcnt; /* Reference count */
socket_lock_t lock; /* Synchronizer... */
@@ -497,6 +509,8 @@ struct sock {
atomic_t wmem_alloc; /* Transmit queue bytes committed */
struct sk_buff_head write_queue; /* Packet sending queue */
atomic_t omem_alloc; /* "o" is "option" or "other" */
+ int wmem_queued; /* Persistent queue size */
+ int forward_alloc; /* Space allocated forward. */
__u32 saddr; /* Sending source */
unsigned int allocation; /* Allocation mode */
int sndbuf; /* Size of send buffer in bytes */
@@ -539,8 +553,6 @@ struct sock {
struct proto *prot;
- unsigned short shutdown;
-
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
union {
struct ipv6_pinfo af_inet6;
@@ -734,6 +746,11 @@ static void __inline__ sock_prot_dec_use(struct proto *prot)
#define RCV_SHUTDOWN 1
#define SEND_SHUTDOWN 2
+#define SOCK_SNDBUF_LOCK 1
+#define SOCK_RCVBUF_LOCK 2
+#define SOCK_BINDADDR_LOCK 4
+
+
/* Used by processes to "lock" a socket state, so that
* interrupts and bottom half handlers won't change it
* from under us. It essentially blocks any incoming
@@ -801,8 +818,6 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk,
int priority);
extern void sock_wfree(struct sk_buff *skb);
extern void sock_rfree(struct sk_buff *skb);
-extern void sock_cfree(struct sk_buff *skb);
-extern unsigned long sock_rspace(struct sock *sk);
extern unsigned long sock_wspace(struct sock *sk);
extern int sock_setsockopt(struct socket *sock, int level,
@@ -982,8 +997,6 @@ extern __inline__ void sock_put(struct sock *sk)
* we do not release it in this function, because protocol
* probably wants some additional cleanups or even continuing
* to work with this socket (TCP).
- *
- * NOTE: When softnet goes in replace _irq with _bh!
*/
extern __inline__ void sock_orphan(struct sock *sk)
{
@@ -1003,6 +1016,25 @@ extern __inline__ void sock_graft(struct sock *sk, struct socket *parent)
write_unlock_bh(&sk->callback_lock);
}
+static inline int sock_i_uid(struct sock *sk)
+{
+ int uid;
+
+ read_lock(&sk->callback_lock);
+ uid = sk->socket ? sk->socket->inode->i_uid : 0;
+ read_unlock(&sk->callback_lock);
+ return uid;
+}
+
+static inline unsigned long sock_i_ino(struct sock *sk)
+{
+ unsigned long ino;
+
+ read_lock(&sk->callback_lock);
+ ino = sk->socket ? sk->socket->inode->i_ino : 0;
+ read_unlock(&sk->callback_lock);
+ return ino;
+}
extern __inline__ struct dst_entry *
__sk_dst_get(struct sock *sk)
@@ -1110,14 +1142,6 @@ extern __inline__ void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
atomic_add(skb->truesize, &sk->rmem_alloc);
}
-extern __inline__ void skb_set_owner_c(struct sk_buff *skb, struct sock *sk)
-{
- sock_hold(sk);
- skb->sk = sk;
- skb->destructor = sock_cfree;
-}
-
-
extern __inline__ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
@@ -1194,7 +1218,7 @@ extern __inline__ void sk_wake_async(struct sock *sk, int how, int band)
}
#define SOCK_MIN_SNDBUF 2048
-#define SOCK_MIN_RCVBUF 128
+#define SOCK_MIN_RCVBUF 256
/* Must be less or equal SOCK_MIN_SNDBUF */
#define SOCK_MIN_WRITE_SPACE SOCK_MIN_SNDBUF
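A rough, hedged model (not the patch's code) of how the wmem_queued and forward_alloc fields added to struct sock above are intended to interact with the global TCP memory pool: bytes queued for transmit are charged against forward_alloc, which is refilled from the global pool in quantum-sized chunks, and freed buffers return their space to forward_alloc rather than to the pool. The QUANTUM value, struct and helper names here are assumptions for illustration, loosely mirroring tcp_charge_skb()/tcp_free_skb() defined later in tcp.h.

    /* Sketch only -- models the accounting, not kernel code. */
    #include <stdio.h>

    #define QUANTUM 4096    /* charge the global pool in page-sized chunks */

    struct sock_sketch {
            int wmem_queued;   /* bytes committed to the write queue */
            int forward_alloc; /* bytes pre-charged but currently unused */
    };

    static int global_allocated; /* stand-in for tcp_memory_allocated */

    static void charge_skb(struct sock_sketch *sk, int truesize)
    {
            while (sk->forward_alloc < truesize) { /* refill from global pool */
                    sk->forward_alloc += QUANTUM;
                    global_allocated += QUANTUM;
            }
            sk->forward_alloc -= truesize;
            sk->wmem_queued += truesize;
    }

    static void free_skb(struct sock_sketch *sk, int truesize)
    {
            sk->wmem_queued -= truesize;
            sk->forward_alloc += truesize; /* freed space stays with the socket */
    }

    int main(void)
    {
            struct sock_sketch sk = { 0, 0 };

            charge_skb(&sk, 1500);
            charge_skb(&sk, 1500);
            free_skb(&sk, 1500);
            printf("queued=%d forward=%d global=%d\n",
                   sk.wmem_queued, sk.forward_alloc, global_allocated);
            return 0;
    }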
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7df845895..d3a63962c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -19,9 +19,13 @@
#define _TCP_H
#define TCP_DEBUG 1
+#define FASTRETRANS_DEBUG 1
+
+/* Be paranoid about data immediately beyond right edge of window. */
#undef TCP_FORMAL_WINDOW
-#define TCP_MORE_COARSE_ACKS
-#undef TCP_LESS_COARSE_ACKS
+
+/* Cancel timers, when they are not required. */
+#undef TCP_CLEAR_TIMERS
#include <linux/config.h>
#include <linux/tcp.h>
@@ -173,7 +177,7 @@ extern __inline__ void tcp_tw_put(struct tcp_tw_bucket *tw)
}
extern atomic_t tcp_orphan_count;
-extern int tcp_tw_count;
+extern int tcp_tw_count;
extern void tcp_time_wait(struct sock *sk, int state, int timeo);
extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
extern void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
@@ -242,12 +246,14 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
/* Minimal RCV_MSS. */
#define TCP_MIN_RCVMSS 536
-/*
- * How much of the receive buffer do we advertize
- * (the rest is reserved for headers and driver packet overhead)
- * Use a power of 2.
- */
-#define TCP_WINDOW_ADVERTISE_DIVISOR 2
+/* After receiving this number of duplicate ACKs, fast retransmit starts. */
+#define TCP_FASTRETRANS_THRESH 3
+
+/* Maximal reordering. */
+#define TCP_MAX_REORDERING 127
+
+/* Maximal number of ACKs sent quickly to accelerate slow-start. */
+#define TCP_MAX_QUICKACKS 16
/* urg_data states */
#define TCP_URG_VALID 0x0100
@@ -292,7 +298,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
#define TCP_DELACK_MAX (HZ/5) /* maximal time to delay before sending an ACK */
#define TCP_DELACK_MIN (2) /* minimal time to delay before sending an ACK,
* 2 scheduler ticks, not depending on HZ. */
-#define TCP_ATO_MAX (HZ/2) /* Clamp ATO estimator at his value. */
#define TCP_ATO_MIN 2
#define TCP_RTO_MAX (120*HZ)
#define TCP_RTO_MIN (HZ/5)
@@ -414,6 +419,19 @@ extern int sysctl_tcp_tw_recycle;
extern int sysctl_tcp_abort_on_overflow;
extern int sysctl_tcp_max_orphans;
extern int sysctl_tcp_max_tw_buckets;
+extern int sysctl_tcp_fack;
+extern int sysctl_tcp_reordering;
+extern int sysctl_tcp_ecn;
+extern int sysctl_tcp_dsack;
+extern int sysctl_tcp_mem[3];
+extern int sysctl_tcp_wmem[3];
+extern int sysctl_tcp_rmem[3];
+extern int sysctl_tcp_app_win;
+extern int sysctl_tcp_adv_win_scale;
+
+extern atomic_t tcp_memory_allocated;
+extern atomic_t tcp_sockets_allocated;
+extern int tcp_memory_pressure;
struct open_request;
@@ -606,6 +624,23 @@ extern int tcp_rcv_established(struct sock *sk,
struct tcphdr *th,
unsigned len);
+enum tcp_ack_state_t
+{
+ TCP_ACK_SCHED = 1,
+ TCP_ACK_TIMER = 2,
+ TCP_ACK_PUSHED= 4
+};
+
+static inline void tcp_schedule_ack(struct tcp_opt *tp)
+{
+ tp->ack.pending |= TCP_ACK_SCHED;
+}
+
+static inline int tcp_ack_scheduled(struct tcp_opt *tp)
+{
+ return tp->ack.pending&TCP_ACK_SCHED;
+}
+
static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
{
if (tp->ack.quick && --tp->ack.quick == 0) {
@@ -614,11 +649,27 @@ static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
}
}
+extern void tcp_enter_quickack_mode(struct tcp_opt *tp);
+
static __inline__ void tcp_delack_init(struct tcp_opt *tp)
{
memset(&tp->ack, 0, sizeof(tp->ack));
}
+enum tcp_ca_state
+{
+ TCP_CA_Open = 0,
+#define TCPF_CA_Open (1<<TCP_CA_Open)
+ TCP_CA_Disorder = 1,
+#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
+ TCP_CA_CWR = 2,
+#define TCPF_CA_CWR (1<<TCP_CA_CWR)
+ TCP_CA_Recovery = 3,
+#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
+ TCP_CA_Loss = 4
+#define TCPF_CA_Loss (1<<TCP_CA_Loss)
+};
+
enum tcp_tw_status
{
@@ -640,6 +691,9 @@ extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
extern int tcp_child_process(struct sock *parent,
struct sock *child,
struct sk_buff *skb);
+extern void tcp_enter_loss(struct sock *sk, int how);
+extern void tcp_clear_retrans(struct tcp_opt *tp);
+extern void tcp_update_metrics(struct sock *sk);
extern void tcp_close(struct sock *sk,
long timeout);
@@ -661,8 +715,8 @@ extern int tcp_recvmsg(struct sock *sk,
extern int tcp_listen_start(struct sock *sk);
-extern void tcp_parse_options(struct sock *sk, struct tcphdr *th,
- struct tcp_opt *tp, int no_fancy);
+extern void tcp_parse_options(struct sk_buff *skb,
+ struct tcp_opt *tp);
/*
* TCP v4 functions exported for the inet6 API
@@ -720,7 +774,6 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
extern int tcp_write_xmit(struct sock *);
extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
-extern void tcp_fack_retransmit(struct sock *);
extern void tcp_xmit_retransmit_queue(struct sock *);
extern void tcp_simple_retransmit(struct sock *);
@@ -736,7 +789,6 @@ extern void tcp_send_ack(struct sock *sk);
extern void tcp_send_delayed_ack(struct sock *sk);
/* tcp_timer.c */
-extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long);
extern void tcp_init_xmit_timers(struct sock *);
extern void tcp_clear_xmit_timers(struct sock *);
@@ -744,6 +796,76 @@ extern void tcp_delete_keepalive_timer (struct sock *);
extern void tcp_reset_keepalive_timer (struct sock *, unsigned long);
extern int tcp_sync_mss(struct sock *sk, u32 pmtu);
+extern const char timer_bug_msg[];
+
+
+static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ switch (what) {
+ case TCP_TIME_RETRANS:
+ case TCP_TIME_PROBE0:
+ tp->pending = 0;
+
+#ifdef TCP_CLEAR_TIMERS
+ if (timer_pending(&tp->retransmit_timer) &&
+ del_timer(&tp->retransmit_timer))
+ __sock_put(sk);
+#endif
+ break;
+ case TCP_TIME_DACK:
+ tp->ack.blocked = 0;
+ tp->ack.pending = 0;
+
+#ifdef TCP_CLEAR_TIMERS
+ if (timer_pending(&tp->delack_timer) &&
+ del_timer(&tp->delack_timer))
+ __sock_put(sk);
+#endif
+ break;
+ default:
+ printk(timer_bug_msg);
+ return;
+ };
+
+}
+
+/*
+ * Reset the retransmission timer
+ */
+static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ if (when > TCP_RTO_MAX) {
+#ifdef TCP_DEBUG
+ printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr());
+#endif
+ when = TCP_RTO_MAX;
+ }
+
+ switch (what) {
+ case TCP_TIME_RETRANS:
+ case TCP_TIME_PROBE0:
+ tp->pending = what;
+ tp->timeout = jiffies+when;
+ if (!mod_timer(&tp->retransmit_timer, tp->timeout))
+ sock_hold(sk);
+ break;
+
+ case TCP_TIME_DACK:
+ tp->ack.pending |= TCP_ACK_TIMER;
+ tp->ack.timeout = jiffies+when;
+ if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
+ sock_hold(sk);
+ break;
+
+ default:
+ printk(KERN_DEBUG "bug: unknown timer value\n");
+ };
+}
+
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*/
@@ -757,9 +879,9 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
if (dst && dst->pmtu != tp->pmtu_cookie)
mss_now = tcp_sync_mss(sk, dst->pmtu);
- if(tp->sack_ok && tp->num_sacks)
+ if (tp->eff_sacks)
mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
+ (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
return mss_now;
}
@@ -774,15 +896,8 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
extern __inline__ void tcp_initialize_rcv_mss(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct dst_entry *dst = __sk_dst_get(sk);
- int mss;
- if (dst)
- mss = dst->advmss;
- else
- mss = tp->mss_cache;
-
- tp->ack.rcv_mss = max(min(mss, TCP_MIN_RCVMSS), TCP_MIN_MSS);
+ tp->ack.rcv_mss = max(min(tp->advmss, TCP_MIN_RCVMSS), TCP_MIN_MSS);
}
static __inline__ void __tcp_fast_path_on(struct tcp_opt *tp, u32 snd_wnd)
@@ -797,9 +912,6 @@ static __inline__ void tcp_fast_path_on(struct tcp_opt *tp)
__tcp_fast_path_on(tp, tp->snd_wnd>>tp->snd_wscale);
}
-
-
-
/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
@@ -819,52 +931,6 @@ static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
*/
extern u32 __tcp_select_window(struct sock *sk);
-/* Chose a new window to advertise, update state in tcp_opt for the
- * socket, and return result with RFC1323 scaling applied. The return
- * value can be stuffed directly into th->window for an outgoing
- * frame.
- */
-extern __inline__ u16 tcp_select_window(struct sock *sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- u32 cur_win = tcp_receive_window(tp);
- u32 new_win = __tcp_select_window(sk);
-
- /* Never shrink the offered window */
- if(new_win < cur_win) {
- /* Danger Will Robinson!
- * Don't update rcv_wup/rcv_wnd here or else
- * we will not be able to advertise a zero
- * window in time. --DaveM
- *
- * Relax Will Robinson.
- */
- new_win = cur_win;
- }
- tp->rcv_wnd = new_win;
- tp->rcv_wup = tp->rcv_nxt;
-
- /* RFC1323 scaling applied */
- new_win >>= tp->rcv_wscale;
-
-#ifdef TCP_FORMAL_WINDOW
- if (new_win == 0) {
- /* If we advertise zero window, disable fast path. */
- tp->pred_flags = 0;
- } else if (cur_win == 0 && tp->pred_flags == 0 &&
- skb_queue_len(&tp->out_of_order_queue) == 0 &&
- !tp->urg_data) {
- /* If we open zero window, enable fast path.
- Without this it will be open by the first data packet,
- it is too late to merge checksumming to copy.
- */
- tcp_fast_path_on(tp);
- }
-#endif
-
- return new_win;
-}
-
/* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot
* of jiffies in the buffer control blocks below. We decidely
@@ -907,6 +973,12 @@ struct tcp_skb_cb {
__u8 sacked; /* State flags for SACK/FACK. */
#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
+#define TCPCB_LOST 0x04 /* SKB is lost */
+#define TCPCB_TAGBITS 0x07 /* All tag bits */
+
+#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
+#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
+
__u16 urg_ptr; /* Valid w/URG flags is set. */
__u32 ack_seq; /* Sequence number ACK'd */
@@ -914,11 +986,28 @@ struct tcp_skb_cb {
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
+#define for_retrans_queue(skb, sk, tp) \
+ for (skb = (sk)->write_queue.next; \
+ (skb != (tp)->send_head) && \
+ (skb != (struct sk_buff *)&(sk)->write_queue); \
+ skb=skb->next)
+
+
+#include <net/tcp_ecn.h>
+
+
/*
* Compute minimal free write space needed to queue new packets.
*/
-#define tcp_min_write_space(__sk) \
- (atomic_read(&(__sk)->wmem_alloc) / 2)
+static inline int tcp_min_write_space(struct sock *sk)
+{
+ return sk->wmem_queued/2;
+}
+
+static inline int tcp_wspace(struct sock *sk)
+{
+ return sk->sndbuf - sk->wmem_queued;
+}
/* This determines how many packets are "in the network" to the best
@@ -932,89 +1021,97 @@ struct tcp_skb_cb {
* Read this equation as:
*
* "Packets sent once on transmission queue" MINUS
- * "Packets acknowledged by FACK information" PLUS
+ * "Packets left network, but not honestly ACKed yet" PLUS
* "Packets fast retransmitted"
*/
static __inline__ int tcp_packets_in_flight(struct tcp_opt *tp)
{
- return tp->packets_out - tp->fackets_out + tp->retrans_out;
+ return tp->packets_out - tp->left_out + tp->retrans_out;
}
/* Recalculate snd_ssthresh, we want to set it to:
*
* one half the current congestion window, but no
* less than two segments
- *
- * We must take into account the current send window
- * as well, however we keep track of that using different
- * units so a conversion is necessary. -DaveM
- *
- * RED-PEN.
- * RFC 2581: "an easy mistake to make is to simply use cwnd,
- * rather than FlightSize"
- * I see no references to FlightSize here. snd_wnd is not FlightSize,
- * it is also apriory characteristics.
- *
- * FlightSize = min((snd_nxt-snd_una)/mss, packets_out) ?
*/
extern __inline__ __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
{
- u32 FlightSize = (tp->snd_nxt - tp->snd_una)/tp->mss_cache;
+ return max(tp->snd_cwnd>>1, 2);
+}
+
+/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
+ * The exception is rate halving phase, when cwnd is decreasing towards
+ * ssthresh.
+ */
+extern __inline__ __u32 tcp_current_ssthresh(struct tcp_opt *tp)
+{
+ if ((1<<tp->ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery))
+ return tp->snd_ssthresh;
+ else
+ return max(tp->snd_ssthresh, (tp->snd_cwnd>>1)+(tp->snd_cwnd>>2));
+}
+
+extern void tcp_cwnd_application_limited(struct sock *sk);
- FlightSize = min(FlightSize, tcp_packets_in_flight(tp));
+/* Congestion window validation. (RFC2861) */
- return max(min(FlightSize, tp->snd_cwnd) >> 1, 2);
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp)
+{
+ if (tp->packets_out >= tp->snd_cwnd) {
+ /* Network is fed fully. */
+ tp->snd_cwnd_used = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else {
+ /* Network starves. */
+ if (tp->packets_out > tp->snd_cwnd_used)
+ tp->snd_cwnd_used = tp->packets_out;
+
+ if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+ tcp_cwnd_application_limited(sk);
+ }
}
/* Set slow start threshold and cwnd, not falling into slow start */
-extern __inline__ void __tcp_enter_cong_avoid(struct tcp_opt *tp)
+extern __inline__ void __tcp_enter_cwr(struct tcp_opt *tp)
{
+ tp->undo_marker = 0;
tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
tp->snd_cwnd_cnt = 0;
tp->high_seq = tp->snd_nxt;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ TCP_ECN_queue_cwr(tp);
}
-extern __inline__ void tcp_enter_cong_avoid(struct tcp_opt *tp)
+extern __inline__ void tcp_enter_cwr(struct tcp_opt *tp)
{
- if (!tp->high_seq || after(tp->snd_nxt, tp->high_seq))
- __tcp_enter_cong_avoid(tp);
+ tp->prior_ssthresh = 0;
+ if (tp->ca_state < TCP_CA_CWR) {
+ __tcp_enter_cwr(tp);
+ tp->ca_state = TCP_CA_CWR;
+ }
}
+extern __u32 tcp_init_cwnd(struct tcp_opt *tp);
-/* Increase initial CWND conservatively, i.e. only if estimated
- RTT is low enough. It is not quite correct, we should use
- POWER i.e. RTT*BANDWIDTH, but we still cannot estimate this.
-
- Numbers are taken from RFC1414.
+/* Slow start with delayed ACK produces a burst of 3 packets, so
+ * it is safe "de facto".
*/
-static __inline__ __u32 tcp_init_cwnd(struct tcp_opt *tp)
+static __inline__ __u32 tcp_max_burst(struct tcp_opt *tp)
{
- __u32 cwnd;
-
- if (!tp->srtt || tp->srtt > ((HZ/50)<<3) || tp->mss_cache > 1460)
- cwnd = 2;
- else if (tp->mss_cache > 1095)
- cwnd = 3;
- else
- cwnd = 4;
-
- return min(cwnd, tp->snd_cwnd_clamp);
+ return 3;
}
-
static __inline__ int tcp_minshall_check(struct tcp_opt *tp)
{
return after(tp->snd_sml,tp->snd_una) &&
!after(tp->snd_sml, tp->snd_nxt);
}
-static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, int len)
+static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, struct sk_buff *skb)
{
- if (len < mss)
- tp->snd_sml = tp->snd_nxt;
+ if (skb->len < mss)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}
/* Return 0, if packet can be sent now without violation Nagle's rules:
@@ -1041,17 +1138,6 @@ static __inline__ int tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, u
static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
unsigned cur_mss, int tail)
{
- /*
- * Reset CWND after idle period longer RTO to "restart window".
- * It is "side" effect of the function, which is _not_ good
- * from viewpoint of clarity. But we have to make it before
- * checking congestion window below. Alternative is to prepend
- * all the calls with this test.
- */
- if (tp->packets_out==0 &&
- (s32)(tcp_time_stamp - tp->lsndtime) > tp->rto)
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_init_cwnd(tp));
-
/* RFC 1122 - section 4.2.3.4
*
* We must queue if
@@ -1062,8 +1148,7 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
* (part of SWS is done on packetization)
* Minshall version sounds: there are no _small_
* segments in flight. (tcp_nagle_check)
- * c) We are retransmiting [Nagle]
- * d) We have too many packets 'in flight'
+ * c) We have too many packets 'in flight'
*
* Don't use the nagle rule for urgent data (or
* for the final FIN -DaveM).
@@ -1081,13 +1166,12 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
skb_tailroom(skb) < 32) &&
((tcp_packets_in_flight(tp) < tp->snd_cwnd) ||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
- !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
- tp->retransmits == 0);
+ !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
}
static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp)
{
- if (!tp->packets_out && !timer_pending(&tp->probe_timer))
+ if (!tp->packets_out && !tp->pending)
tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
}
@@ -1111,6 +1195,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk,
tcp_write_xmit(sk))
tcp_check_probe_timer(sk, tp);
}
+ tcp_cwnd_validate(sk, tp);
}
static __inline__ void tcp_push_pending_frames(struct sock *sk,
@@ -1119,6 +1204,24 @@ static __inline__ void tcp_push_pending_frames(struct sock *sk,
__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk));
}
+static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
+{
+ struct sk_buff *skb = tp->send_head;
+
+ return (skb &&
+ tcp_snd_test(tp, skb, tcp_current_mss(sk), tcp_skb_is_last(sk, skb)));
+}
+
+static __inline__ void tcp_init_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+{
+ tp->snd_wl1 = seq;
+}
+
+static __inline__ void tcp_update_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+{
+ tp->snd_wl1 = seq;
+}
+
extern void tcp_destroy_sock(struct sock *sk);
@@ -1143,7 +1246,6 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
__tcp_checksum_complete(skb);
}
-
/* Prequeue for VJ style copy to user, combined with checksumming. */
static __inline__ void tcp_prequeue_init(struct tcp_opt *tp)
@@ -1167,12 +1269,15 @@ static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
if (tp->ucopy.task) {
if ((tp->ucopy.memory += skb->truesize) <= (sk->rcvbuf<<1)) {
__skb_queue_tail(&tp->ucopy.prequeue, skb);
- if (skb_queue_len(&tp->ucopy.prequeue) == 1)
+ if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
wake_up_interruptible(sk->sleep);
+ if (!tcp_ack_scheduled(tp))
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
+ }
} else {
NET_INC_STATS_BH(TCPPrequeueDropped);
tp->ucopy.memory -= skb->truesize;
- kfree_skb(skb);
+ __kfree_skb(skb);
}
return 1;
}
@@ -1231,6 +1336,13 @@ static __inline__ void tcp_done(struct sock *sk)
tcp_destroy_sock(sk);
}
+static __inline__ void tcp_sack_reset(struct tcp_opt *tp)
+{
+ tp->dsack = 0;
+ tp->eff_sacks = 0;
+ tp->num_sacks = 0;
+}
+
static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp)
{
if (tp->tstamp_ok) {
@@ -1241,17 +1353,22 @@ static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *
*ptr++ = htonl(tstamp);
*ptr++ = htonl(tp->ts_recent);
}
- if(tp->sack_ok && tp->num_sacks) {
+ if (tp->eff_sacks) {
+ struct tcp_sack_block *sp = tp->dsack ? tp->duplicate_sack : tp->selective_acks;
int this_sack;
*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)));
- for(this_sack = 0; this_sack < tp->num_sacks; this_sack++) {
- *ptr++ = htonl(tp->selective_acks[this_sack].start_seq);
- *ptr++ = htonl(tp->selective_acks[this_sack].end_seq);
+ (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)));
+ for(this_sack = 0; this_sack < tp->eff_sacks; this_sack++) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+ if (tp->dsack) {
+ tp->dsack = 0;
+ tp->eff_sacks--;
}
}
}
@@ -1330,42 +1447,44 @@ extern __inline__ void tcp_select_initial_window(int space, __u32 mss,
space >>= 1;
(*rcv_wscale)++;
}
+ if (*rcv_wscale && sysctl_tcp_app_win && space>=mss &&
+ space - max((space>>sysctl_tcp_app_win), mss>>*rcv_wscale) < 65536/2)
+ (*rcv_wscale)--;
+ }
+
+ /* Set the initial window to a value large enough for senders
+ * following RFC 2414. Senders not following this RFC
+ * will be satisfied with 2.
+ */
+ if (mss > (1<<*rcv_wscale)) {
+ int init_cwnd = 4;
+ if (mss > 1460*3)
+ init_cwnd = 2;
+ else if (mss > 1460)
+ init_cwnd = 3;
+ if (*rcv_wnd > init_cwnd*mss)
+ *rcv_wnd = init_cwnd*mss;
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min(65535<<(*rcv_wscale),*window_clamp);
}
+static inline int tcp_win_from_space(int space)
+{
+ return sysctl_tcp_adv_win_scale<=0 ?
+ (space>>(-sysctl_tcp_adv_win_scale)) :
+ space - (space>>sysctl_tcp_adv_win_scale);
+}
+
/* Note: caller must be prepared to deal with negative returns */
extern __inline__ int tcp_space(struct sock *sk)
{
- return (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) /
- TCP_WINDOW_ADVERTISE_DIVISOR;
+ return tcp_win_from_space(sk->rcvbuf - atomic_read(&sk->rmem_alloc));
}
extern __inline__ int tcp_full_space( struct sock *sk)
{
- return sk->rcvbuf / TCP_WINDOW_ADVERTISE_DIVISOR;
-}
-
-extern __inline__ void tcp_init_buffer_space(struct sock *sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int rcvbuf = tp->advmss+MAX_TCP_HEADER+16+sizeof(struct sk_buff);
- int sndbuf = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff);
-
- if (sk->rcvbuf < 3*rcvbuf)
- sk->rcvbuf = min (3*rcvbuf, sysctl_rmem_max);
-
- /* Reserve slack space to reduce jitter of advertised window. */
- if (tp->window_clamp >= tcp_full_space(sk)) {
- int nwin = tcp_full_space(sk) - tp->mss_clamp;
-
- if (nwin >= MAX_TCP_WINDOW && nwin >= 2*tp->advmss)
- tp->window_clamp = nwin;
- }
-
- if (sk->sndbuf < 3*sndbuf)
- sk->sndbuf = min (3*sndbuf, sysctl_wmem_max);
+ return tcp_win_from_space(sk->rcvbuf);
}
extern __inline__ void tcp_acceptq_removed(struct sock *sk)
@@ -1473,61 +1592,85 @@ static __inline__ void tcp_openreq_init(struct open_request *req,
req->snd_wscale = tp->snd_wscale;
req->wscale_ok = tp->wscale_ok;
req->acked = 0;
+ req->ecn_ok = 0;
req->rmt_port = skb->h.th->source;
}
-extern const char timer_bug_msg[];
+#define TCP_MEM_QUANTUM ((int)PAGE_SIZE)
-static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
+static inline void tcp_free_skb(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct timer_list *timer;
-
- switch (what) {
- case TCP_TIME_RETRANS:
- timer = &tp->retransmit_timer;
- break;
- case TCP_TIME_DACK:
- tp->ack.blocked = 0;
- timer = &tp->delack_timer;
- break;
- case TCP_TIME_PROBE0:
- timer = &tp->probe_timer;
- break;
- default:
- printk(timer_bug_msg);
- return;
- };
+ sk->tp_pinfo.af_tcp.queue_shrunk = 1;
+ sk->wmem_queued -= skb->truesize;
+ sk->forward_alloc += skb->truesize;
+ __kfree_skb(skb);
+}
- if (timer_pending(timer) && del_timer(timer))
- __sock_put(sk);
+static inline void tcp_charge_skb(struct sock *sk, struct sk_buff *skb)
+{
+ sk->wmem_queued += skb->truesize;
+ sk->forward_alloc -= skb->truesize;
}
-/* This function does not return reliable answer. Use it only as advice.
- */
+extern void __tcp_mem_reclaim(struct sock *sk);
+extern int tcp_mem_schedule(struct sock *sk, int size, int kind);
-static inline int tcp_timer_is_set(struct sock *sk, int what)
+static inline void tcp_mem_reclaim(struct sock *sk)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- int ret;
+ if (sk->forward_alloc >= TCP_MEM_QUANTUM)
+ __tcp_mem_reclaim(sk);
+}
- switch (what) {
- case TCP_TIME_RETRANS:
- ret = timer_pending(&tp->retransmit_timer);
- break;
- case TCP_TIME_DACK:
- ret = timer_pending(&tp->delack_timer);
- break;
- case TCP_TIME_PROBE0:
- ret = timer_pending(&tp->probe_timer);
- break;
- default:
- ret = 0;
- printk(timer_bug_msg);
- };
- return ret;
+static inline void tcp_enter_memory_pressure(void)
+{
+ if (!tcp_memory_pressure) {
+ NET_INC_STATS(TCPMemoryPressures);
+ tcp_memory_pressure = 1;
+ }
}
+static inline void tcp_moderate_sndbuf(struct sock *sk)
+{
+ if (!(sk->userlocks&SOCK_SNDBUF_LOCK)) {
+ sk->sndbuf = min(sk->sndbuf, sk->wmem_queued/2);
+ sk->sndbuf = max(sk->sndbuf, SOCK_MIN_SNDBUF);
+ }
+}
+
+static inline struct sk_buff *tcp_alloc_skb(struct sock *sk, int size, int gfp)
+{
+ struct sk_buff *skb = alloc_skb(size, gfp);
+
+ if (skb) {
+ if (sk->forward_alloc >= (int)skb->truesize ||
+ tcp_mem_schedule(sk, skb->truesize, 0))
+ return skb;
+ __kfree_skb(skb);
+ } else {
+ tcp_enter_memory_pressure();
+ tcp_moderate_sndbuf(sk);
+ }
+ return NULL;
+}
+
+static inline void tcp_writequeue_purge(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sk->write_queue)) != NULL)
+ tcp_free_skb(sk, skb);
+ tcp_mem_reclaim(sk);
+}
+
+extern void tcp_rfree(struct sk_buff *skb);
+
+static inline void tcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+ skb->sk = sk;
+ skb->destructor = tcp_rfree;
+ atomic_add(skb->truesize, &sk->rmem_alloc);
+ sk->forward_alloc -= skb->truesize;
+}
extern void tcp_listen_wlock(void);
@@ -1570,28 +1713,30 @@ static inline int tcp_fin_time(struct tcp_opt *tp)
return fin_timeout;
}
-#if 0 /* TCP_DEBUG */
-#define TCP_CHECK_TIMER(sk) \
-do { struct tcp_opt *__tp = &sk->tp_pinfo.af_tcp; \
- if (sk->state != TCP_CLOSE) { \
- if (__tp->packets_out) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS) && !timer_is_running(&__tp->retransmit_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p RETRANS" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } else if (__tp->send_head) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_PROBE0) && !timer_is_running(&__tp->probe_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p PROBE0" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } \
- if (__tp->ack.pending) { \
- if (!tcp_timer_is_set(sk, TCP_TIME_DACK) && !timer_is_running(&__tp->delack_timer) && net_ratelimit()) \
- printk(KERN_DEBUG "sk=%p DACK" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \
- } \
- if (__tp->packets_out > skb_queue_len(&sk->write_queue) || \
- (__tp->send_head && skb_queue_len(&sk->write_queue) == 0)) { \
- printk(KERN_DEBUG "sk=%p QUEUE" __FUNCTION__ "(%d) %d %d %d %p\n", sk, __LINE__, sk->state, __tp->packets_out, skb_queue_len(&sk->write_queue), __tp->send_head); \
- } \
- } } while (0)
-#else
+static inline int tcp_paws_check(struct tcp_opt *tp, int rst)
+{
+ if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0)
+ return 0;
+ if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS)
+ return 0;
+
+ /* RST segments are not recommended to carry timestamps,
+ and, if they do, it is recommended to ignore PAWS because
+ "their cleanup function should take precedence over timestamps."
+ Certainly, this is a mistake. To relax this constraint we must
+ understand why it exists: if a peer reboots, its clock may go
+ out of sync and half-open connections would never be reset.
+ The problem would not exist at all if every implementation
+ followed the draft about maintaining clocks across reboots.
+ Linux-2.2 DOES NOT!
+
+ However, we can relax time bounds for RST segments to MSL.
+ */
+ if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL)
+ return 0;
+ return 1;
+}
+
#define TCP_CHECK_TIMER(sk) do { } while (0);
-#endif
#endif /* _TCP_H */
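A small self-contained example (assumptions: a user-space build and a tcp_adv_win_scale default of 2) of the tcp_win_from_space() arithmetic that replaces the old fixed TCP_WINDOW_ADVERTISE_DIVISOR: a positive scale reserves space>>scale bytes of the receive buffer for overhead, while a negative scale advertises only space>>(-scale).

    /* Sketch only -- same arithmetic as tcp_win_from_space() above. */
    #include <stdio.h>

    static int sysctl_tcp_adv_win_scale = 2; /* assumed default */

    static int win_from_space(int space)
    {
            return sysctl_tcp_adv_win_scale <= 0 ?
                    space >> (-sysctl_tcp_adv_win_scale) :
                    space - (space >> sysctl_tcp_adv_win_scale);
    }

    int main(void)
    {
            int rcvbuf = 65536;

            /* scale = 2: advertise 3/4 of the buffer, keep 1/4 for overhead */
            printf("scale=%2d  window=%d\n",
                   sysctl_tcp_adv_win_scale, win_from_space(rcvbuf));

            /* scale = -1: advertise only half the buffer, like the old divisor */
            sysctl_tcp_adv_win_scale = -1;
            printf("scale=%2d  window=%d\n",
                   sysctl_tcp_adv_win_scale, win_from_space(rcvbuf));
            return 0;
    }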
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
new file mode 100644
index 000000000..db004249f
--- /dev/null
+++ b/include/net/tcp_ecn.h
@@ -0,0 +1,155 @@
+#ifndef _NET_TCP_ECN_H_
+#define _NET_TCP_ECN_H_ 1
+
+#include <linux/config.h>
+
+#ifdef CONFIG_INET_ECN
+
+#include <net/inet_ecn.h>
+
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)|TCP_FLAG_ECE|TCP_FLAG_CWR)
+
+#define TCP_ECN_OK 1
+#define TCP_ECN_QUEUE_CWR 2
+#define TCP_ECN_DEMAND_CWR 4
+
+static __inline__ void
+TCP_ECN_queue_cwr(struct tcp_opt *tp)
+{
+ if (tp->ecn_flags&TCP_ECN_OK)
+ tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+
+/* Output functions */
+
+static __inline__ void
+TCP_ECN_send_synack(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
+ if (!(tp->ecn_flags&TCP_ECN_OK))
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
+}
+
+static __inline__ void
+TCP_ECN_send_syn(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ tp->ecn_flags = 0;
+ if (sysctl_tcp_ecn) {
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR;
+ tp->ecn_flags = TCP_ECN_OK;
+ }
+}
+
+static __inline__ void
+TCP_ECN_make_synack(struct open_request *req, struct tcphdr *th)
+{
+ if (req->ecn_ok)
+ th->ece = 1;
+}
+
+static __inline__ void
+TCP_ECN_send(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb, int tcp_header_len)
+{
+ if (tp->ecn_flags & TCP_ECN_OK) {
+ /* Not-retransmitted data segment: set ECT and inject CWR. */
+ if (skb->len != tcp_header_len &&
+ !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
+ INET_ECN_xmit(sk);
+ if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) {
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+ skb->h.th->cwr = 1;
+ }
+ } else {
+ /* ACK or retransmitted segment: clear ECT|CE */
+ INET_ECN_dontxmit(sk);
+ }
+ if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
+ skb->h.th->ece = 1;
+ }
+}
+
+/* Input functions */
+
+static __inline__ void
+TCP_ECN_accept_cwr(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ if (skb->h.th->cwr)
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static __inline__ void
+TCP_ECN_check_ce(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ if (tp->ecn_flags&TCP_ECN_OK) {
+ if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ /* Funny extension: if ECT is not set on a segment,
+ * it is surely a retransmit. This is not in the ECN RFC,
+ * but Linux follows this rule. */
+ else if (!INET_ECN_is_capable((TCP_SKB_CB(skb)->flags)))
+ tcp_enter_quickack_mode(tp);
+ }
+}
+
+static __inline__ void
+TCP_ECN_rcv_synack(struct tcp_opt *tp, struct tcphdr *th)
+{
+ if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static __inline__ void
+TCP_ECN_rcv_syn(struct tcp_opt *tp, struct tcphdr *th)
+{
+ if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static __inline__ int
+TCP_ECN_rcv_ecn_echo(struct tcp_opt *tp, struct tcphdr *th)
+{
+ if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK))
+ return 1;
+ return 0;
+}
+
+static __inline__ void
+TCP_ECN_openreq_child(struct tcp_opt *tp, struct open_request *req)
+{
+ tp->ecn_flags = req->ecn_ok ? TCP_ECN_OK : 0;
+}
+
+static __inline__ void
+TCP_ECN_create_request(struct open_request *req, struct tcphdr *th)
+{
+ if (sysctl_tcp_ecn && th->ece && th->cwr)
+ req->ecn_ok = 1;
+}
+
+
+
+#else
+
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+
+#define TCP_ECN_send_syn(x...) do { } while (0)
+#define TCP_ECN_send_synack(x...) do { } while (0)
+#define TCP_ECN_make_synack(x...) do { } while (0)
+#define TCP_ECN_send(x...) do { } while (0)
+
+#define TCP_ECN_queue_cwr(x...) do { } while (0)
+
+#define TCP_ECN_accept_cwr(x...) do { } while (0)
+#define TCP_ECN_check_ce(x...) do { } while (0)
+#define TCP_ECN_rcv_synack(x...) do { } while (0)
+#define TCP_ECN_rcv_syn(x...) do { } while (0)
+#define TCP_ECN_rcv_ecn_echo(x...) (0)
+#define TCP_ECN_openreq_child(x...) do { } while (0)
+#define TCP_ECN_create_request(x...) do { } while (0)
+
+
+#endif
+
+#endif
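Finally, a toy model (not kernel code) of the receiver-side ECE/CWR handshake that TCP_ECN_check_ce() and TCP_ECN_accept_cwr() implement: once a CE-marked segment arrives, ECE is echoed on every outgoing ACK until the sender answers with CWR. The flag values mirror the header above; the segment struct and function names are illustrative.

    /* Sketch only -- models the flag logic, not kernel code. */
    #include <stdio.h>

    #define ECN_OK         1
    #define ECN_DEMAND_CWR 4

    struct seg { int ce; int cwr; }; /* IP CE mark, TCP CWR flag */

    static void rx_check_ce(int *ecn_flags, const struct seg *s)
    {
            if ((*ecn_flags & ECN_OK) && s->ce)
                    *ecn_flags |= ECN_DEMAND_CWR;  /* start demanding CWR */
    }

    static void rx_accept_cwr(int *ecn_flags, const struct seg *s)
    {
            if (s->cwr)
                    *ecn_flags &= ~ECN_DEMAND_CWR; /* sender reacted, stop echoing */
    }

    static int tx_ece(int ecn_flags)
    {
            return (ecn_flags & ECN_DEMAND_CWR) != 0; /* ECE bit on outgoing ACK */
    }

    int main(void)
    {
            int flags = ECN_OK;
            struct seg marked = { 1, 0 }, cwr_seg = { 0, 1 };

            rx_check_ce(&flags, &marked);
            printf("after CE:  send ECE=%d\n", tx_ece(flags));  /* 1 */

            rx_accept_cwr(&flags, &cwr_seg);
            rx_check_ce(&flags, &cwr_seg);
            printf("after CWR: send ECE=%d\n", tx_ece(flags));  /* 0 */
            return 0;
    }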