/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_input.c,v 1.98 1998/03/23 22:54:48 davem Exp $
 *
 * Authors:	Ross Biro,
 *		Fred N. van Kempen,
 *		Mark Evans,
 *		Corey Minyard
 *		Florian La Roche,
 *		Charles Hedrick,
 *		Linus Torvalds,
 *		Alan Cox,
 *		Matthew Dillon,
 *		Arnt Gulbrandsen,
 *		Jorge Cwik,
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
#include <linux/ipsec.h>

typedef void (*tcp_sys_cong_ctl_t)(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt);

static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt);
static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt);

#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

extern int sysctl_tcp_fin_timeout;

/* These are on by default so the code paths get tested.
 * For the final 2.2 this may be undone at our discretion. -DaveM
 */
int sysctl_tcp_timestamps = 1;
int sysctl_tcp_window_scaling = 1;
int sysctl_tcp_sack = 1;
int sysctl_tcp_hoe_retransmits = 1;

int sysctl_tcp_cong_avoidance;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
int sysctl_tcp_rfc1337;

static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_delack_estimator(struct tcp_opt *tp)
{
	if(tp->ato == 0) {
		tp->lrcvtime = jiffies;

		/* Help sender leave slow start quickly,
		 * this sets our initial ato value.
		 */
		tcp_enter_quickack_mode(tp);
	} else {
		int m = jiffies - tp->lrcvtime;

		tp->lrcvtime = jiffies;
		if(m <= 0)
			m = 1;
		if(m > tp->rto)
			tp->ato = tp->rto;
		else
			tp->ato = (tp->ato >> 1) + m;

		/* We are not in "quick ack" mode. */
		if(tp->ato <= (HZ/100))
			tp->ato = ((HZ/100)*2);
	}
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
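 * (They are tcp_rtt_estimator(), tcp_set_rto() and tcp_bound_rto(), below.)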
* To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) { long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev */ if(m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) m = -m; /* m is now abs(error) */ m -= (tp->mdev >> 2); /* similar update on mdev */ tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ } else { /* no previous measure. */ tp->srtt = m<<3; /* take the measured time to be rtt */ tp->mdev = m<<2; /* make sure rto = 3*rtt */ } } /* Calculate rto without backoff. This is the second half of Van Jacobsons * routine refered to above. */ static __inline__ void tcp_set_rto(struct tcp_opt *tp) { tp->rto = (tp->srtt >> 3) + tp->mdev; tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); } /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound * on packet lifetime in the internet. We need the HZ/5 lower * bound to behave correctly against BSD stacks with a fixed * delayed ack. * FIXME: It's not entirely clear this lower bound is the best * way to avoid the problem. Is it possible to drop the lower * bound and still avoid trouble with BSD stacks? Perhaps * some modification to the RTO calculation that takes delayed * ack bais into account? This needs serious thought. -- erics */ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { if (tp->rto > 120*HZ) tp->rto = 120*HZ; if (tp->rto < HZ/5) tp->rto = HZ/5; } /* WARNING: this must not be called if tp->saw_timestamp was false. */ extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq) { /* From draft-ietf-tcplw-high-performance: the correct * test is last_ack_sent <= end_seq. * (RFC1323 stated last_ack_sent < end_seq.) */ if (!before(end_seq,tp->last_ack_sent)) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = jiffies; } } #define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) extern __inline__ int tcp_paws_discard(struct tcp_opt *tp) { /* ts_recent must be younger than 24 days */ return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) || ((s32)(tp->rcv_tsval-tp->ts_recent) < 0)); } static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { u32 end_window = tp->rcv_wup + tp->rcv_wnd; if (tp->rcv_wnd) { if (!before(seq, tp->rcv_nxt) && before(seq, end_window)) return 1; if ((end_seq - seq) && after(end_seq, tp->rcv_nxt) && !after(end_seq, end_window)) return 1; } return 0; } /* This functions checks to see if the tcp header is actually acceptable. */ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { if (seq == tp->rcv_nxt) return (tp->rcv_wnd || (end_seq == seq)); return __tcp_sequence(tp, seq, end_seq); } /* When we get a reset we do this. */ static void tcp_reset(struct sock *sk, struct sk_buff *skb) { sk->zapped = 1; /* We want the right error as BSD sees it (and indeed as we do). 
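 * (A reset in SYN_SENT means our connection attempt was refused, so
 *  ECONNREFUSED; in CLOSE_WAIT the peer is tearing down a connection we
 *  may still be writing to, so EPIPE; every other state reports
 *  ECONNRESET.)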
*/ switch (sk->state) { case TCP_SYN_SENT: sk->err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: sk->err = EPIPE; break; default: sk->err = ECONNRESET; }; tcp_set_state(sk,TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); } /* This tags the retransmission queue when SACKs arrive. */ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int i = nsacks; while(i--) { struct sk_buff *skb = skb_peek(&sk->write_queue); __u32 start_seq = ntohl(sp->start_seq); __u32 end_seq = ntohl(sp->end_seq); while((skb != NULL) && (skb != tp->send_head) && (skb != (struct sk_buff *)&sk->write_queue)) { /* We play conservative, we don't allow SACKS to partially * tag a sequence space. */ if(!after(start_seq, skb->seq) && !before(end_seq, skb->end_seq)) TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; skb = skb->next; } sp++; /* Move on to the next SACK block. */ } } /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. */ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); tp->saw_tstamp = 0; while(length>0) { int opcode=*ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) break; /* don't parse partial options */ switch(opcode) { case TCPOPT_MSS: if(opsize==TCPOLEN_MSS && th->syn) { tp->in_mss = ntohs(*(__u16 *)ptr); if (tp->in_mss == 0) tp->in_mss = 536; } break; case TCPOPT_WINDOW: if(opsize==TCPOLEN_WINDOW && th->syn) if (!no_fancy && sysctl_tcp_window_scaling) { tp->wscale_ok = 1; tp->snd_wscale = *(__u8 *)ptr; } break; case TCPOPT_TIMESTAMP: if(opsize==TCPOLEN_TIMESTAMP) { if (sysctl_tcp_timestamps && !no_fancy) { tp->tstamp_ok = 1; tp->saw_tstamp = 1; tp->rcv_tsval = ntohl(*(__u32 *)ptr); tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); } } break; case TCPOPT_SACK_PERM: if(opsize==TCPOLEN_SACK_PERM && th->syn) { if (sysctl_tcp_sack && !no_fancy) { tp->sack_ok = 1; tp->num_sacks = 0; } } break; case TCPOPT_SACK: if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && sysctl_tcp_sack && (sk != NULL) && !th->syn) { int sack_bytes = opsize - TCPOLEN_SACK_BASE; if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) { int num_sacks = sack_bytes >> 3; struct tcp_sack_block *sackp; sackp = (struct tcp_sack_block *)ptr; tcp_sacktag_write_queue(sk, sackp, num_sacks); } } }; ptr+=opsize-2; length-=opsize; }; } } /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp) { /* If we didn't send out any options ignore them all. 
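 * Otherwise the fast path below only recognizes two cases: a bare header
 * with no options at all, or the aligned RFC 1323 layout (NOP, NOP,
 * TIMESTAMP, length 10, then TSval and TSecr); any other option mix is
 * handed to the full tcp_parse_options().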
*/ if (tp->tcp_header_len == sizeof(struct tcphdr)) return 0; if (th->doff == sizeof(struct tcphdr)>>2) { tp->saw_tstamp = 0; return 0; } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { __u32 *ptr = (__u32 *)(th + 1); if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { tp->saw_tstamp = 1; tp->rcv_tsval = ntohl(*++ptr); tp->rcv_tsecr = ntohl(*++ptr); return 1; } } tcp_parse_options(sk, th, tp, 0); return 1; } #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ static __inline__ void clear_fast_retransmit(struct sock *sk) { struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); if (tp->dup_acks > 3) { tp->retrans_head = NULL; tp->snd_cwnd = max(tp->snd_ssthresh, 1); } tp->dup_acks = 0; } /* NOTE: This code assumes that tp->dup_acks gets cleared when a * retransmit timer fires. */ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) { struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); /* * Note: If not_dup is set this implies we got a * data carrying packet or a window update. * This carries no new information about possible * lost packets, so we have to ignore it for the purposes * of counting duplicate acks. Ideally this does not imply we * should stop our fast retransmit phase, more acks may come * later without data to help us. Unfortunately this would make * the code below much more complex. For now if I see such * a packet I clear the fast retransmit phase. */ if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { /* This is the standard reno style fast retransmit branch. */ /* 1. When the third duplicate ack is received, set ssthresh * to one half the current congestion window, but no less * than two segments. Retransmit the missing segment. */ if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; if (tp->dup_acks == 3) { tp->dup_acks++; tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); tp->snd_cwnd = tp->snd_ssthresh + 3; tp->high_seq = tp->snd_nxt; tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } /* 2. Each time another duplicate ACK arrives, increment * cwnd by the segment size. [...] Transmit a packet... * * Packet transmission will be done on normal flow processing * since we're not in "retransmit mode" */ if (tp->dup_acks > 3) tp->snd_cwnd++; } else if (tp->high_seq != 0) { /* In this branch we deal with clearing the Floyd style * block on duplicate fast retransmits, and if requested * we do Hoe style secondary fast retransmits. */ if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) { /* Once we have acked all the packets up to high_seq * we are done this fast retransmit phase. * Alternatively data arrived. In this case we * Have to abort the fast retransmit attempt. * Note that we do want to accept a window * update since this is expected with Hoe's algorithm. */ clear_fast_retransmit(sk); /* After we have cleared up to high_seq we can * clear the Floyd style block. */ if (after(ack, tp->high_seq)) tp->high_seq = 0; } else if (tp->dup_acks >= 3) { if (sysctl_tcp_hoe_retransmits) { /* Hoe Style. We didn't ack the whole * window. Take this as a cue that * another packet was lost and retransmit it. * Don't muck with the congestion window here. 
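 * (This is Hoe's partial-ACK heuristic: during recovery an ACK that
 *  advances snd_una but stops short of high_seq is taken as evidence of
 *  another hole, so the segment at the head of the retransmit queue is
 *  resent at once rather than waiting for three more duplicate ACKs.)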
* Note that we have to be careful not to * act if this was a window update and it * didn't ack new data, since this does * not indicate a packet left the system. * We can test this by just checking * if ack changed from snd_una, since * the only way to get here without advancing * from snd_una is if this was a window update. */ if (ack != tp->snd_una && before(ack, tp->high_seq)) { tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } else { /* Reno style. We didn't ack the whole * window, now we have to drop out of * fast retransmit and wait for a timeout. */ clear_fast_retransmit(sk); } } } } /* * TCP slow start and congestion avoidance in two flavors: * RFC 1122 and TCP Vegas. * * This is a /proc/sys configurable option. */ #define SHIFT_FACTOR 16 static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int actual, expected; unsigned int inv_rtt, inv_basertt, inv_basebd; u32 snt_bytes; /* From: * TCP Vegas: New Techniques for Congestion * Detection and Avoidance. * * Warning: This code is a scratch implementation taken * from the paper only. The code they distribute seams * to have improved several things over the initial spec. */ if (!seq_rtt) seq_rtt = 1; if (tp->basertt) tp->basertt = min(seq_rtt, tp->basertt); else tp->basertt = seq_rtt; /* actual = throughput for this segment. * expected = number_of_bytes in transit / BaseRTT */ snt_bytes = ack - seq; inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt; inv_basertt = (1 << SHIFT_FACTOR) / tp->basertt; actual = snt_bytes * inv_rtt; expected = (tp->snd_nxt - tp->snd_una) * inv_basertt; inv_basebd = sk->mss * inv_basertt; /* Slow Start */ if (tp->snd_cwnd < tp->snd_ssthresh && (seq == tp->snd_nxt || (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd))) { /* "Vegas allows exponential growth only every other RTT" */ if (tp->snd_cwnd_cnt++) { tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } else { /* Congestion Avoidance */ if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd) { /* Increase Linearly */ if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } if (expected - actual >= TCP_VEGAS_BETA * inv_basebd) { /* Decrease Linearly */ if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { tp->snd_cwnd--; tp->snd_cwnd_cnt = 0; } /* Never less than 2 segments. */ if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; } } } static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. Because we keep cong_window in * integral mss's, we can't do cwnd += 1 / cwnd. * Instead, maintain a counter and increment it once every * cwnd times. * FIXME: Check to be sure the mathematics works out right * on this trick when we have to reduce the congestion window. * The snd_cwnd_cnt has to be reset properly when reduction events * happen. * FIXME: What happens when the congestion window gets larger * than the maximum receiver window by some large factor * Suppose the pipeline never looses packets for a long * period of time, then traffic increases causing packet loss. * The congestion window should be reduced, but what it should * be reduced to is not clear, since 1/2 the old window may * still be larger than the maximum sending rate we ever achieved. */ if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ tp->snd_cwnd++; } else { /* In dangerous area, increase slowly. 
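 * (Roughly one extra segment per window's worth of ACKs, i.e. about one
 *  per round trip.)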
In theory this is * tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } else tp->snd_cwnd_cnt++; } } /* Remove acknowledged frames from the retransmission queue. */ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, __u32 *seq_rtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; unsigned long now = jiffies; int acked = 0; while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { /* If our packet is before the ack sequence we can * discard it as it's confirmed to have arrived at * the other end. */ if (after(skb->end_seq, ack)) break; /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not * true data, and if we misinform our callers that * this ACK acks real data, we will erroneously exit * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) acked |= FLAG_RETRANS_DATA_ACKED; } else { tp->retrans_head = NULL; } tp->packets_out--; *seq = skb->seq; *seq_rtt = now - skb->when; skb_unlink(skb); kfree_skb(skb); } if (acked) tp->retrans_head = NULL; return acked; } static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Our probe was answered. */ tp->probes_out = 0; /* Was it a usable window open? */ /* should always be non-null */ if (tp->send_head != NULL && !before (ack + tp->snd_wnd, tp->send_head->end_seq)) { tp->backoff = 0; tp->pending = 0; tcp_clear_xmit_timer(sk, TIME_PROBE0); } else { tcp_reset_xmit_timer(sk, TIME_PROBE0, min(tp->rto << tp->backoff, 120*HZ)); } } /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, u32 seq, u32 ack, int flag) { __u32 seq_rtt = (jiffies-tp->rcv_tsecr); tcp_rtt_estimator(tp, seq_rtt); if (tp->retransmits) { if (tp->packets_out == 0) { tp->retransmits = 0; tp->backoff = 0; tcp_set_rto(tp); } else { /* Still retransmitting, use backoff */ tcp_set_rto(tp); tp->rto = tp->rto << tp->backoff; } } else { tcp_set_rto(tp); if (flag & FLAG_DATA_ACKED) (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); } /* NOTE: safe here so long as cong_ctl doesn't use rto */ tcp_bound_rto(tp); } static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); long when = tp->rto - (jiffies - skb->when); /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. The * congestion window is handled properly by that code. */ if (tp->retransmits) { tp->retrans_head = NULL; tcp_xmit_retransmit_queue(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } else { tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } } /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack_seq, u32 ack, int len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int flag = 0; u32 seq = 0; u32 seq_rtt = 0; if(sk->zapped) return(1); /* Dead, can't ack any more so why bother */ if (tp->pending == TIME_KEEPOPEN) tp->probes_out = 0; tp->rcv_tstamp = jiffies; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. 
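 * ("Newer than sent" means after snd_nxt, "older than previous acks"
 *  means before snd_una; either way it tells us nothing about the data
 *  currently in flight.)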
*/ if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) goto uninteresting_ack; dst_confirm(sk->dst_cache); /* If there is data set flag 1 */ if (len != th->doff*4) { flag |= FLAG_DATA; tcp_delack_estimator(tp); } /* Update our send window. */ /* This is the window update code as per RFC 793 * snd_wl{1,2} are used to prevent unordered * segments from shrinking the window */ if (before(tp->snd_wl1, ack_seq) || (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { u32 nwin = ntohs(th->window) << tp->snd_wscale; if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { flag |= FLAG_WIN_UPDATE; tp->snd_wnd = nwin; tp->snd_wl1 = ack_seq; tp->snd_wl2 = ack; if (nwin > tp->max_window) tp->max_window = nwin; } } /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->err_soft = 0; /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ if (tp->pending == TIME_PROBE0) tcp_ack_probe(sk, ack); /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If we have a timestamp, we always do rtt estimates. */ if (tp->saw_tstamp) { tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); } else { /* If we were retransmiting don't count rtt estimate. */ if (tp->retransmits) { if (tp->packets_out == 0) tp->retransmits = 0; } else { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine * rtt estimates. Also, we must not reset the * backoff for rto until we get a non-retransmitted * packet. This allows us to deal with a situation * where the network delay has increased suddenly. * I.e. Karn's algorithm. (SIGCOMM '87, p5.) */ if ((flag & FLAG_DATA_ACKED) && !(flag & FLAG_RETRANS_DATA_ACKED)) { tp->backoff = 0; tcp_rtt_estimator(tp, seq_rtt); tcp_set_rto(tp); tcp_bound_rto(tp); (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); } } } if (tp->packets_out) { if (flag & FLAG_DATA_ACKED) tcp_ack_packets_out(sk, tp); } else { tcp_clear_xmit_timer(sk, TIME_RETRANS); } flag &= (FLAG_DATA | FLAG_WIN_UPDATE); if ((ack == tp->snd_una && tp->packets_out && flag == 0) || (tp->high_seq != 0)) { tcp_fast_retrans(sk, ack, flag); } else { /* Clear any aborted fast retransmit starts. */ tp->dup_acks = 0; } /* Remember the highest ack received. */ tp->snd_una = ack; return 1; uninteresting_ack: SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); return 0; } /* New-style handling of TIME_WAIT sockets. */ static void tcp_timewait_kill(unsigned long __arg) { struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg; /* Zap the timer. */ del_timer(&tw->timer); /* Unlink from various places. */ if(tw->bind_next) tw->bind_next->bind_pprev = tw->bind_pprev; *(tw->bind_pprev) = tw->bind_next; if(tw->tb->owners == NULL) tcp_inc_slow_timer(TCP_SLT_BUCKETGC); if(tw->next) tw->next->pprev = tw->pprev; *tw->pprev = tw->next; /* We decremented the prot->inuse count when we entered TIME_WAIT * and the sock from which this came was destroyed. */ tw->sklist_next->sklist_prev = tw->sklist_prev; tw->sklist_prev->sklist_next = tw->sklist_next; /* Ok, now free it up. */ kmem_cache_free(tcp_timewait_cachep, tw); } /* We come here as a special case from the AF specific TCP input processing, * and the SKB has no owner. Essentially handling this is very simple, * we just keep silently eating rx'd packets until none show up for the * entire timeout period. 
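 * (The timeout is TCP_TIMEWAIT_LEN; a plain ACK arriving in this state
 *  simply restarts the timer below.)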
The only special cases are for BSD TIME_WAIT * reconnects and SYN/RST bits being set in the TCP header. */ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct tcphdr *th, void *opt, __u16 len) { /* RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) { struct sock *sk; struct tcp_func *af_specific = tw->af_specific; __u32 isn; isn = tw->rcv_nxt + 128000; if(isn == 0) isn++; tcp_timewait_kill((unsigned long)tw); sk = af_specific->get_sock(skb, th); if(sk == NULL || !ipsec_sk_policy(sk,skb)) return 0; skb_set_owner_r(skb, sk); af_specific = sk->tp_pinfo.af_tcp.af_specific; if(af_specific->conn_request(sk, skb, opt, isn) < 0) return 1; /* Toss a reset back. */ return 0; /* Discard the frame. */ } /* Check RST or SYN */ if(th->rst || th->syn) { /* This is TIME_WAIT assasination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if(sysctl_tcp_rfc1337 == 0) tcp_timewait_kill((unsigned long)tw); if(!th->rst) return 1; /* toss a reset back */ } else { if(th->ack) { /* In this case we must reset the TIMEWAIT timer. */ mod_timer(&tw->timer, jiffies + TCP_TIMEWAIT_LEN); } } return 0; /* Discard the frame. */ } /* Enter the time wait state. This is always called from BH * context. Essentially we whip up a timewait bucket, copy the * relevant info into it from the SK, and mess with hash chains * and list linkage. */ static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { struct sock **head, *sktw; /* Step 1: Remove SK from established hash. */ if(sk->next) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; tcp_reg_zap(sk); /* Step 2: Put TW into bind hash where SK was. */ tw->tb = (struct tcp_bind_bucket *)sk->prev; if((tw->bind_next = sk->bind_next) != NULL) sk->bind_next->bind_pprev = &tw->bind_next; tw->bind_pprev = sk->bind_pprev; *sk->bind_pprev = (struct sock *)tw; /* Step 3: Same for the protocol sklist. */ (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw; (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw; sk->sklist_next = NULL; sk->prot->inuse--; /* Step 4: Hash TW into TIMEWAIT half of established hash table. */ head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)]; sktw = (struct sock *)tw; if((sktw->next = *head) != NULL) (*head)->pprev = &sktw->next; *head = sktw; sktw->pprev = head; } void tcp_time_wait(struct sock *sk) { struct tcp_tw_bucket *tw; tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); if(tw != NULL) { /* Give us an identity. 
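 * Copy the addressing and sequence state the TIME_WAIT bucket needs to
 * answer or reject segments on its own: addresses, ports, family,
 * rcv_nxt and the af_specific ops.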
*/ tw->daddr = sk->daddr; tw->rcv_saddr = sk->rcv_saddr; tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; tw->sport = sk->sport; tw->dport = sk->dport; tw->family = sk->family; tw->reuse = sk->reuse; tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if(tw->family == AF_INET6) { memcpy(&tw->v6_daddr, &sk->net_pinfo.af_inet6.daddr, sizeof(struct in6_addr)); memcpy(&tw->v6_rcv_saddr, &sk->net_pinfo.af_inet6.rcv_saddr, sizeof(struct in6_addr)); } #endif /* Linkage updates. */ tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ init_timer(&tw->timer); tw->timer.function = tcp_timewait_kill; tw->timer.data = (unsigned long) tw; tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN; add_timer(&tw->timer); /* CLOSE the SK. */ if(sk->state == TCP_ESTABLISHED) tcp_statistics.TcpCurrEstab--; sk->state = TCP_CLOSE; net_reset_timer(sk, TIME_DONE, min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME)); } else { /* Sorry, we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ tcp_set_state(sk, TCP_CLOSE); } /* Prevent rcvmsg/sndmsg calls, and wake people up. */ sk->shutdown = SHUTDOWN_MASK; if(!sk->dead) sk->state_change(sk); } /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence * space. Not before when we get holes. * * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT * (and thence onto LAST-ACK and finally, CLOSE, we never enter * TIME-WAIT) * * If we are in FINWAIT-1, a received FIN indicates simultaneous * close and we go into CLOSING (and later onto TIME-WAIT) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq; tcp_send_ack(sk); if (!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 1); } switch(sk->state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); if (th->rst) sk->shutdown = SHUTDOWN_MASK; break; case TCP_CLOSE_WAIT: case TCP_CLOSING: /* Received a retransmission of the FIN, do * nothing. */ break; case TCP_LAST_ACK: /* RFC793: Remain in the LAST-ACK state. */ break; case TCP_FIN_WAIT1: /* This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. * * This causes a WRITE timeout, which will either * move on to TIME_WAIT when we timeout, or resend * the FIN properly (maybe we get rid of that annoying * FIN lost hang). The TIME_WRITE code is already * correct for handling this timeout. */ tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_time_wait(sk); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these * cases we should never reach this piece of code. */ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); break; }; } /* These routines update the SACK block as out-of-order packets arrive or * in-order packets close up the sequence space. */ static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp) { int this_sack, num_sacks = tp->num_sacks; struct tcp_sack_block *swalk = &tp->selective_acks[0]; /* If more than one SACK block, see if the recent change to SP eats into * or hits the sequence space of other SACK blocks, if so coalesce. 
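 * For illustration: if SP covers 200-300 and an existing SWALK block
 * covers 100-250, SP's start falls inside SWALK, so SP is widened to
 * 100-300 and SWALK is zapped below.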
*/ if(num_sacks != 1) { for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) { if(swalk == sp) continue; /* First case, bottom of SP moves into top of the * sequence space of SWALK. */ if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) { sp->start_seq = swalk->start_seq; goto coalesce; } /* Second case, top of SP moves into bottom of the * sequence space of SWALK. */ if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) { sp->end_seq = swalk->end_seq; goto coalesce; } } } /* SP is the only SACK, or no coalescing cases found. */ return; coalesce: /* Zap SWALK, by moving every further SACK up by one slot. * Decrease num_sacks. */ for(this_sack += 1; this_sack < num_sacks; this_sack++, swalk++) { struct tcp_sack_block *next = (swalk + 1); swalk->start_seq = next->start_seq; swalk->end_seq = next->end_seq; } tp->num_sacks--; } static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) { __u32 tmp; tmp = sack1->start_seq; sack1->start_seq = sack2->start_seq; sack2->start_seq = tmp; tmp = sack1->end_seq; sack1->end_seq = sack2->end_seq; sack2->end_seq = tmp; } static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct tcp_sack_block *sp = &tp->selective_acks[0]; /* Optimize for the common case, new ofo frames arrive * "in order". ;-) This also satisfies the requirements * of RFC2018 about ordering of SACKs. */ if(sp->end_seq == skb->seq) { sp->end_seq = skb->end_seq; tcp_sack_maybe_coalesce(tp, sp); } else if(sp->start_seq == skb->end_seq) { /* Re-ordered arrival, in this case, can be optimized * as well. */ sp->start_seq = skb->seq; tcp_sack_maybe_coalesce(tp, sp); } else { int cur_sacks = tp->num_sacks; int max_sacks = (tp->tstamp_ok ? 3 : 4); /* Oh well, we have to move things around. * Try to find a SACK we can tack this onto. */ if(cur_sacks > 1) { struct tcp_sack_block *swap = sp + 1; int this_sack; for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { if((swap->end_seq == skb->seq) || (swap->start_seq == skb->end_seq)) { if(swap->end_seq == skb->seq) swap->end_seq = skb->end_seq; else swap->start_seq = skb->seq; tcp_sack_swap(sp, swap); tcp_sack_maybe_coalesce(tp, sp); return; } } } /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. */ while(cur_sacks >= 1) { struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; struct tcp_sack_block *prev = (this - 1); this->start_seq = prev->start_seq; this->end_seq = prev->end_seq; cur_sacks--; } /* Build head SACK, and we're done. */ sp->start_seq = skb->seq; sp->end_seq = skb->end_seq; if(tp->num_sacks < max_sacks) tp->num_sacks++; } } static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int num_sacks = tp->num_sacks; int this_sack; /* We know this removed SKB will eat from the front of a SACK. */ for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { if(sp->start_seq == skb->seq) break; } /* This should only happen if so many SACKs get built that some get * pushed out before we get here, or we eat some in sequence packets * which are before the first SACK block. */ if(this_sack >= num_sacks) return; sp->start_seq = skb->end_seq; if(!before(sp->start_seq, sp->end_seq)) { /* Zap this SACK, by moving forward any other SACKS. 
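 * (Its start has caught up with or passed its end, so the block no
 *  longer covers any sequence space.)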
*/ for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { struct tcp_sack_block *next = (sp + 1); sp->start_seq = next->start_seq; sp->end_seq = next->end_seq; } tp->num_sacks--; } } static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int num_sacks = tp->num_sacks; int this_sack; for(this_sack = 0; this_sack < num_sacks; this_sack++, tp++) { if(sp->end_seq == old_skb->end_seq) break; } if(this_sack >= num_sacks) return; sp->end_seq = new_skb->end_seq; } /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ static void tcp_ofo_queue(struct sock *sk) { struct sk_buff *skb; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); while ((skb = skb_peek(&tp->out_of_order_queue))) { if (after(skb->seq, tp->rcv_nxt)) break; if (!after(skb->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); skb_unlink(skb); kfree_skb(skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); if(tp->sack_ok) tcp_sack_remove_skb(tp, skb); skb_unlink(skb); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); } } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct sk_buff *skb1; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. * Out of sequence packets to out_of_order_queue. */ if (skb->seq == tp->rcv_nxt) { /* Ok. In sequence. */ queue_and_out: dst_confirm(sk->dst_cache); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; if(skb->h.th->fin) { tcp_fin(skb, sk, skb->h.th); } else { tp->delayed_acks++; /* Tiny-grams with PSH set make us ACK quickly. */ if(skb->h.th->psh && (skb->len < (sk->mss >> 1))) tp->ato = HZ/50; } /* This may have eaten into a SACK block. */ if(tp->sack_ok && tp->num_sacks) tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); if (skb_queue_len(&tp->out_of_order_queue) == 0) tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | (0x10 << 16) | tp->snd_wnd); return; } /* An old packet, either a retransmit or some packet got lost. */ if (!after(skb->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an imediate ack. */ SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq); tcp_enter_quickack_mode(tp); kfree_skb(skb); return; } if (before(skb->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); goto queue_and_out; } /* Ok. This is an out_of_order segment, force an ack. */ tp->delayed_acks++; tcp_enter_quickack_mode(tp); /* Disable header predition. */ tp->pred_flags = 0; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. */ if(tp->sack_ok) { tp->num_sacks = 1; tp->selective_acks[0].start_seq = skb->seq; tp->selective_acks[0].end_seq = skb->end_seq; } skb_queue_head(&tp->out_of_order_queue,skb); } else { for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { /* Already there. 
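 * The out-of-order queue is kept sorted, so we walk it backwards from
 * the tail (highest sequence numbers first).  An exact sequence match
 * means the new segment duplicates a queued one; we keep whichever of
 * the two is longer.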
*/ if (skb->seq == skb1->seq) { if (skb->len >= skb1->len) { if(tp->sack_ok) tcp_sack_extend(tp, skb1, skb); skb_append(skb1, skb); skb_unlink(skb1); kfree_skb(skb1); } else { /* A duplicate, smaller than what is in the * out-of-order queue right now, toss it. */ kfree_skb(skb); } break; } if (after(skb->seq, skb1->seq)) { skb_append(skb1,skb); if(tp->sack_ok) tcp_sack_new_ofo_skb(sk, skb); break; } /* See if we've hit the start. If so insert. */ if (skb1 == skb_peek(&tp->out_of_order_queue)) { skb_queue_head(&tp->out_of_order_queue,skb); if(tp->sack_ok) tcp_sack_new_ofo_skb(sk, skb); break; } } } } /* * This routine handles the data. If there is room in the buffer, * it will be have already been moved into it. If there is no * room, then we will just have to discard the packet. */ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); th = skb->h.th; skb_pull(skb, th->doff*4); skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) return(0); /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); if (before(tp->rcv_nxt, tp->copied_seq)) { printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = tp->copied_seq; } /* Above, tcp_data_queue() increments delayed_acks appropriately. * Now tell the user we may have some data. */ if (!sk->dead) { SOCK_DEBUG(sk, "Data wakeup.\n"); sk->data_ready(sk,0); } return(1); } static void tcp_data_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; if ((skb = tp->send_head)) { if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && tp->packets_out < tp->snd_cwnd ) { /* Put more data onto the wire. */ tcp_write_xmit(sk); } else if (tp->packets_out == 0 && !tp->pending) { /* Start probing the receivers window. */ tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); } } } /* * Check if sending an ack is needed. */ static __inline__ void __tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* This also takes care of updating the window. * This if statement needs to be simplified. * * Rules for delaying an ack: * - delay time <= 0.5 HZ * - we don't have a window update to send * - must send at least every 2 full sized packets * - must send an ACK if we have any SACKs * * With an extra heuristic to handle loss of packet * situations and also helping the sender leave slow * start in an expediant manner. */ /* Two full frames received or... */ if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) || /* We will update the window "significantly" or... */ tcp_raise_window(sk) || /* We entered "quick ACK" mode or... */ tcp_in_quickack_mode(tp) || /* We have pending SACKs */ (tp->sack_ok && tp->num_sacks)) { /* Then ack it now */ tcp_send_ack(sk); } else { /* Else, send delayed ack. */ tcp_send_delayed_ack(tp, HZ/2); } } static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); if (tp->delayed_acks == 0) { /* We sent a data segment already. */ return; } __tcp_ack_snd_check(sk); } /* * This routine is only called when we have urgent data * signalled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. * For 1003.1g we should support a new option TCP_STDURG to permit * either form (or just set the sysctl tcp_stdurg). 
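 * (With tcp_stdurg clear we follow BSD and treat the urgent pointer as
 *  pointing one byte past the last urgent byte, hence the ptr-- below;
 *  with it set, the pointer itself marks the last urgent byte, which is
 *  the RFC 1122 reading.)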
*/ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 ptr = ntohs(th->urg_ptr); if (ptr && !sysctl_tcp_stdurg) ptr--; ptr += ntohl(th->seq); /* Ignore urgent data that we've already seen and read. */ if (after(tp->copied_seq, ptr)) return; /* Do we already have a newer (or duplicate) urgent pointer? */ if (tp->urg_data && !after(ptr, tp->urg_seq)) return; /* Tell the world about our new urgent pointer. */ if (sk->proc != 0) { if (sk->proc > 0) kill_proc(sk->proc, SIGURG, 1); else kill_pg(-sk->proc, SIGURG, 1); } /* We may be adding urgent data when the last byte read was * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives * or we break the sematics of SIOCATMARK (and thus sockatmark()) */ if (tp->urg_seq == tp->copied_seq) tp->copied_seq++; /* Move the copied sequence on correctly */ tp->urg_data = URG_NOTYET; tp->urg_seq = ptr; /* Disable header prediction. */ tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ if (tp->urg_data == URG_NOTYET) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } } } /* * Clean first the out_of_order queue, then the receive queue until * the socket is in its memory limits again. */ static void prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff * skb; SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); /* First Clean the out_of_order queue. */ /* Start with the end because there are probably the least * useful packets (crossing fingers). */ while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) { kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) return; } /* Now continue with the receive queue if it wasn't enough */ while ((skb = skb_peek_tail(&sk->receive_queue))) { /* Never toss anything when we've seen the FIN. * It's just too complex to recover from it. */ if(skb->h.th->fin) break; /* Never remove packets that have been already acked */ if (before(skb->end_seq, tp->last_ack_sent+1)) { printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n", tp->copied_seq, skb->end_seq, tp->last_ack_sent); break; } skb_unlink(skb); tp->rcv_nxt = skb->seq; SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n", skb->seq, skb->end_seq, tp->copied_seq); kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) break; } } int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, __u16 len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; u32 flg; /* * Header prediction. * The code follows the one in the famous * "30 instruction TCP receive" Van Jacobson mail. * * Van's trick is to deposit buffers into socket queue * on a device interrupt, to call tcp_recv function * on the receive process context and checksum and copy * the buffer to user space. smart... * * Our current scheme is not silly either but we take the * extra cost of the net_bh soft interrupt processing... 
* We do checksum and copy also but from device to kernel. */ /* * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(sk, th, tp)) { if (tp->saw_tstamp) { if (tcp_paws_discard(tp)) { if (!th->rst) { tcp_send_ack(sk); goto discard; } } tcp_replace_ts_recent(tp,skb->end_seq); } } flg = *(((u32 *)th) + 3); /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_predition is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 else it will be !0 * (when there are holes in the receive * space for instance) */ if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) { if (len <= th->doff*4) { /* Bulk data transfer: sender */ if (len == th->doff*4) { tcp_ack(sk, th, skb->seq, skb->ack_seq, len); kfree_skb(skb); tcp_data_snd_check(sk); return 0; } else { /* Header too small */ tcp_statistics.TcpInErrs++; goto discard; } } else if (skb->ack_seq == tp->snd_una) { /* Bulk data transfer: receiver */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) goto discard; skb_pull(skb,th->doff*4); /* DO NOT notify forward progress here. * It saves dozen of CPU instructions in fast path. --ANK */ skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; /* FIN bit check is not done since if FIN is set in * this frame, the pred_flags won't match up. -DaveM */ sk->data_ready(sk, 0); tcp_delack_estimator(tp); /* Tiny-grams with PSH set make us ACK quickly. */ if(th->psh && (skb->len < (sk->mss >> 1))) tp->ato = HZ/50; tp->delayed_acks++; __tcp_ack_snd_check(sk); return 0; } } if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { if (!th->rst) { if (after(skb->seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", skb->seq, skb->end_seq, tp->rcv_wup, tp->rcv_wnd); } tcp_send_ack(sk); goto discard; } } if(th->syn && skb->seq != tp->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); tcp_statistics.TcpInErrs++; tcp_reset(sk, skb); return 1; } if(th->rst) { tcp_reset(sk,skb); goto discard; } if(th->ack) tcp_ack(sk, th, skb->seq, skb->ack_seq, len); /* Process urgent data. */ tcp_urg(sk, th, len); /* step 7: process the segment text */ queued = tcp_data(skb, sk, len); tcp_data_snd_check(sk); /* If our receive queue has grown past its limits shrink it */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) prune_queue(sk); tcp_ack_snd_check(sk); if (!queued) { discard: kfree_skb(skb); } return 0; } /* Shared between IPv4 and IPv6 now. */ struct sock * tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* assumption: the socket is not in use. * as we checked the user count on tcp_rcv and we're * running from a soft interrupt. */ if (req->sk) { /* socket already created but not * yet accepted()... */ sk = req->sk; } else { u32 flg; /* Check for syn retransmission */ flg = *(((u32 *)skb->h.th) + 3); flg &= __constant_htonl(0x00170000); /* Only SYN set? */ if (flg == __constant_htonl(0x00020000)) { if (!after(skb->seq, req->rcv_isn)) { /* retransmited syn. */ req->class->rtx_syn_ack(sk, req); return NULL; } else { return sk; /* New SYN */ } } /* We know it's an ACK here */ /* In theory the packet could be for a cookie, but * TIME_WAIT should guard us against this. * XXX: Nevertheless check for cookies? * This sequence number check is done again later, * but we do it here to prevent syn flood attackers * from creating big SYN_RECV sockets. 
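 * (Essentially the ACK must acknowledge our SYN-ACK, i.e. be snt_isn+1,
 *  and the segment's sequence number must fall within the window we
 *  offered; anything else gets a reset straight away.)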
*/ if (!between(skb->ack_seq, req->snt_isn, req->snt_isn+1) || !between(skb->seq, req->rcv_isn, req->rcv_isn+1+req->rcv_wnd)) { req->class->send_reset(skb); return NULL; } sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); tcp_dec_slow_timer(TCP_SLT_SYNACK); if (sk == NULL) return NULL; req->expires = 0UL; req->sk = sk; } skb_orphan(skb); skb_set_owner_r(skb, sk); return sk; } /* * This function implements the receiving procedure of RFC 793. * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be * address independent. */ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, void *opt, __u16 len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */ switch (sk->state) { case TCP_LISTEN: /* These use the socket TOS.. * might want to be the received TOS */ if(th->ack) return 1; if(th->syn) { if(tp->af_specific->conn_request(sk, skb, opt, 0) < 0) return 1; /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the * syn up to the [to be] advertised window and * Solaris 2.1 gives you a protocol error. For now * we just ignore it, that fits the spec precisely * and avoids incompatibilities. It would be nice in * future to drop through and process the data. * * Now that TTCP is starting to be used we ought to * queue this data. * But, this leaves one open to an easy denial of * service attack, and SYN cookies can't defend * against this problem. So, we drop the data * in the interest of security over speed. */ goto discard; } goto discard; break; case TCP_SYN_SENT: /* SYN sent means we have to look for a suitable ack and * either reset for bad matches or go to connected. * The SYN_SENT case is unusual and should * not be in line code. [AC] */ if(th->ack) { tp->snd_wl1 = skb->seq; /* We got an ack, but it's not a good ack. */ if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) { tcp_statistics.TcpAttemptFails++; return 1; } if(th->rst) { tcp_reset(sk,skb); goto discard; } if(!th->syn) { /* A valid ack from a different connection * start. Shouldn't happen but cover it. */ tcp_statistics.TcpAttemptFails++; return 1; } /* Ok.. it's good. Set up sequence numbers and * move to established. */ tp->rcv_nxt = skb->seq+1; tp->rcv_wup = skb->seq+1; tp->snd_wnd = htons(th->window) << tp->snd_wscale; tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; tp->fin_seq = skb->seq; tcp_set_state(sk, TCP_ESTABLISHED); tcp_parse_options(sk, th, tp, 0); if (tp->wscale_ok == 0) { tp->snd_wscale = tp->rcv_wscale = 0; tp->window_clamp = min(tp->window_clamp,65535); } if (tp->tstamp_ok) { tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else tp->tcp_header_len = sizeof(struct tcphdr); if (tp->saw_tstamp) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = jiffies; } /* Can't be earlier, doff would be wrong. */ tcp_send_ack(sk); /* Check for the case where we tried to advertise * a window including timestamp options, but did not * end up using them for this connection. */ if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps) sk->mss += TCPOLEN_TSTAMP_ALIGNED; /* Now limit it if the other end negotiated a smaller * value. */ if (tp->in_mss) { int real_mss = tp->in_mss; /* We store MSS locally with the timestamp bytes * subtracted, TCP's advertise it with them * included. Account for this fact. 
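 * (TCPOLEN_TSTAMP_ALIGNED is 12 bytes, so with timestamps in use a peer
 *  that advertises an MSS of 1460 is worth 1460 - 12 = 1448 bytes of
 *  data per segment by our accounting.)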
*/ if(tp->tstamp_ok) real_mss -= TCPOLEN_TSTAMP_ALIGNED; sk->mss = min(sk->mss, real_mss); } sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; if(!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 0); } /* Drop through step 6 */ goto step6; } else { if(th->syn && !th->rst) { /* The previous version of the code * checked for "connecting to self" * here. that check is done now in * tcp_connect. */ tcp_set_state(sk, TCP_SYN_RECV); tcp_parse_options(sk, th, tp, 0); if (tp->saw_tstamp) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = jiffies; } tp->rcv_nxt = skb->seq + 1; tp->rcv_wup = skb->seq + 1; tp->snd_wnd = htons(th->window); tp->snd_wl1 = skb->seq; tcp_send_synack(sk); goto discard; } } break; } /* Parse the tcp_options present on this header. * By this point we really only expect timestamps. * Note that this really has to be here and not later for PAWS * (RFC1323) to work. */ if (tcp_fast_parse_options(sk, th, tp)) { /* NOTE: assumes saw_tstamp is never set if we didn't * negotiate the option. tcp_fast_parse_options() must * guarantee this. */ if (tp->saw_tstamp) { if (tcp_paws_discard(tp)) { if (!th->rst) { tcp_send_ack(sk); goto discard; } } tcp_replace_ts_recent(tp,skb->end_seq); } } /* step 1: check sequence number */ if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { if (!th->rst) { tcp_send_ack(sk); goto discard; } } /* step 2: check RST bit */ if(th->rst) { tcp_reset(sk,skb); goto discard; } /* step 3: check security and precedence [ignored] */ /* step 4: * * Check for a SYN, and ensure it matches the SYN we were * first sent. We have to handle the rather unusual (but valid) * sequence that KA9Q derived products may generate of * * SYN * SYN|ACK Data * ACK (lost) * SYN|ACK Data + More Data * .. we must ACK not RST... * * We keep syn_seq as the sequence space occupied by the * original syn. */ if (th->syn && skb->seq!=tp->syn_seq) { tcp_reset(sk, skb); return 1; } /* step 5: check the ACK field */ if (th->ack) { int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len); switch(sk->state) { case TCP_SYN_RECV: if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; if(!sk->dead) sk->state_change(sk); tp->snd_una = skb->ack_seq; tp->snd_wnd = htons(th->window) << tp->snd_wscale; tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; } else { SOCK_DEBUG(sk, "bad ack\n"); return 1; } break; case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); if (!sk->dead) sk->state_change(sk); else tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); } break; case TCP_CLOSING: if (tp->snd_una == tp->write_seq) tcp_time_wait(sk); break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { sk->shutdown = SHUTDOWN_MASK; tcp_set_state(sk,TCP_CLOSE); if (!sk->dead) sk->state_change(sk); goto discard; } break; } } else goto discard; step6: /* step 6: check the URG bit */ tcp_urg(sk, th, len); /* step 7: process the segment text */ switch (sk->state) { case TCP_CLOSE_WAIT: case TCP_CLOSING: if (!before(skb->seq, tp->fin_seq)) break; case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. 
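 * We only do so when the receive side has been shut down and the socket
 * is already dead, and only for segments that carry new data beyond
 * rcv_nxt.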
*/ if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { if (after(skb->end_seq - th->fin, tp->rcv_nxt)) { tcp_reset(sk, skb); return 1; } } case TCP_ESTABLISHED: queued = tcp_data(skb, sk, len); /* This can only happen when MTU+skbheader > rcvbuf */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) prune_queue(sk); break; } tcp_data_snd_check(sk); tcp_ack_snd_check(sk); if (!queued) { discard: kfree_skb(skb); } return 0; } int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, void *buffer, size_t *lenp) { int val = sysctl_tcp_cong_avoidance; int retv; static tcp_sys_cong_ctl_t tab[] = { tcp_cong_avoid_vanj, tcp_cong_avoid_vegas }; retv = proc_dointvec(ctl, write, filp, buffer, lenp); if (write) { if ((unsigned)sysctl_tcp_cong_avoidance > 1) { retv = -EINVAL; sysctl_tcp_cong_avoidance = val; } else { tcp_sys_cong_ctl_f = tab[sysctl_tcp_cong_avoidance]; } } return retv; }