| Field | Value | Date |
|---|---|---|
| author | Ralf Baechle <ralf@linux-mips.org> | 2000-08-28 22:00:09 +0000 |
| committer | Ralf Baechle <ralf@linux-mips.org> | 2000-08-28 22:00:09 +0000 |
| commit | 1a1d77dd589de5a567fa95e36aa6999c704ceca4 (patch) | |
| tree | 141e31f89f18b9fe0831f31852e0435ceaccafc5 /net/ipv4/tcp_timer.c | |
| parent | fb9c690a18b3d66925a65b17441c37fa14d4370b (diff) | |
Merge with 2.4.0-test7.
Diffstat (limited to 'net/ipv4/tcp_timer.c')
-rw-r--r-- | net/ipv4/tcp_timer.c | 502

1 file changed, 160 insertions(+), 342 deletions(-)
```diff
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4ed38175b..d98376840 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -5,7 +5,7 @@
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:	$Id: tcp_timer.c,v 1.77 2000/06/30 10:18:38 davem Exp $
+ * Version:	$Id: tcp_timer.c,v 1.79 2000/08/11 00:13:36 davem Exp $
  *
  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -29,13 +29,11 @@
 int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
 int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
 int sysctl_tcp_retries1 = TCP_RETR1;
 int sysctl_tcp_retries2 = TCP_RETR2;
-int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES;
+int sysctl_tcp_orphan_retries = 0;
 
-static void tcp_retransmit_timer(unsigned long);
+static void tcp_write_timer(unsigned long);
 static void tcp_delack_timer(unsigned long);
-static void tcp_probe_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
-static void tcp_twkill(unsigned long);
 
 const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
 
@@ -50,73 +48,35 @@ void tcp_init_xmit_timers(struct sock *sk)
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
 	init_timer(&tp->retransmit_timer);
-	tp->retransmit_timer.function=&tcp_retransmit_timer;
+	tp->retransmit_timer.function=&tcp_write_timer;
 	tp->retransmit_timer.data = (unsigned long) sk;
+	tp->pending = 0;
 
 	init_timer(&tp->delack_timer);
 	tp->delack_timer.function=&tcp_delack_timer;
 	tp->delack_timer.data = (unsigned long) sk;
-
-	init_timer(&tp->probe_timer);
-	tp->probe_timer.function=&tcp_probe_timer;
-	tp->probe_timer.data = (unsigned long) sk;
+	tp->ack.pending = 0;
 
 	init_timer(&sk->timer);
 	sk->timer.function=&tcp_keepalive_timer;
 	sk->timer.data = (unsigned long) sk;
 }
 
-/*
- *	Reset the retransmission timer
- */
-
-void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
-{
-	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
-	switch (what) {
-	case TCP_TIME_RETRANS:
-		/* When seting the transmit timer the probe timer
-		 * should not be set.
-		 * The delayed ack timer can be set if we are changing the
-		 * retransmit timer when removing acked frames.
-		 */
-		if (timer_pending(&tp->probe_timer) && del_timer(&tp->probe_timer))
-			__sock_put(sk);
-		if (when > TCP_RTO_MAX) {
-			printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk));
-			when = TCP_RTO_MAX;
-		}
-		if (!mod_timer(&tp->retransmit_timer, jiffies+when))
-			sock_hold(sk);
-		break;
-
-	case TCP_TIME_DACK:
-		if (!mod_timer(&tp->delack_timer, jiffies+when))
-			sock_hold(sk);
-		break;
-
-	case TCP_TIME_PROBE0:
-		if (!mod_timer(&tp->probe_timer, jiffies+when))
-			sock_hold(sk);
-		break;
-
-	default:
-		printk(KERN_DEBUG "bug: unknown timer value\n");
-	};
-}
-
 void tcp_clear_xmit_timers(struct sock *sk)
-{ 
+{
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	if(timer_pending(&tp->retransmit_timer) && del_timer(&tp->retransmit_timer))
-		__sock_put(sk);
-	if(timer_pending(&tp->delack_timer) && del_timer(&tp->delack_timer))
+	tp->pending = 0;
+	if (timer_pending(&tp->retransmit_timer) &&
+	    del_timer(&tp->retransmit_timer))
 		__sock_put(sk);
+
+	tp->ack.pending = 0;
+	tp->ack.blocked = 0;
+	if (timer_pending(&tp->delack_timer) &&
+	    del_timer(&tp->delack_timer))
 		__sock_put(sk);
+
 	if(timer_pending(&sk->timer) && del_timer(&sk->timer))
 		__sock_put(sk);
 }
@@ -127,6 +87,7 @@ static void tcp_write_err(struct sock *sk)
 	sk->error_report(sk);
 
 	tcp_done(sk);
+	NET_INC_STATS_BH(TCPAbortOnTimeout);
 }
 
 /* Do not allow orphaned sockets to eat all our resources.
@@ -138,26 +99,60 @@ static void tcp_write_err(struct sock *sk)
  * We kill the socket, if:
  * 1. If number of orphaned sockets exceeds an administratively configured
  *    limit.
- * 2. Under pessimistic assumption that all the orphans eat memory not
- *    less than this one, total consumed memory exceeds all
- *    the available memory.
+ * 2. If we have strong memory pressure.
  */
 static int tcp_out_of_resources(struct sock *sk, int do_reset)
 {
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	int orphans = atomic_read(&tcp_orphan_count);
 
+	/* If peer does not open window for long time, or did not transmit
+	 * anything for long time, penalize it. */
+	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+		orphans <<= 1;
+
+	/* If some dubious ICMP arrived, penalize even more. */
+	if (sk->err_soft)
+		orphans <<= 1;
+
 	if (orphans >= sysctl_tcp_max_orphans ||
-	    ((orphans*atomic_read(&sk->wmem_alloc))>>PAGE_SHIFT) >= num_physpages) {
+	    (sk->wmem_queued > SOCK_MIN_SNDBUF &&
+	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
 		if (net_ratelimit())
 			printk(KERN_INFO "Out of socket memory\n");
+
+		/* Catch exceptional cases, when connection requires reset.
+		 *      1. Last segment was sent recently. */
+		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+		    /*  2. Window is closed. */
+		    (!tp->snd_wnd && !tp->packets_out))
+			do_reset = 1;
 		if (do_reset)
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 		tcp_done(sk);
+		NET_INC_STATS_BH(TCPAbortOnMemory);
 		return 1;
 	}
 	return 0;
 }
+
+/* Calculate maximal number or retries on an orphaned socket. */
+static int tcp_orphan_retries(struct sock *sk, int alive)
+{
+	int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+
+	/* We know from an ICMP that something is wrong. */
+	if (sk->err_soft && !alive)
+		retries = 0;
+
+	/* However, if socket sent something recently, select some safe
+	 * number of retries. 8 corresponds to >100 seconds with minimal
+	 * RTO of 200msec. */
+	if (retries == 0 && alive)
+		retries = 8;
+	return retries;
+}
+
 /* A write timeout has occurred. Process the after effects.
  */
 static int tcp_write_timeout(struct sock *sk)
 {
@@ -195,10 +190,12 @@ static int tcp_write_timeout(struct sock *sk)
 		retry_until = sysctl_tcp_retries2;
 
 		if (sk->dead) {
-			if (tcp_out_of_resources(sk, tp->retransmits < retry_until))
-				return 1;
+			int alive = (tp->rto < TCP_RTO_MAX);
+
+			retry_until = tcp_orphan_retries(sk, alive);
 
-			retry_until = sysctl_tcp_orphan_retries;
+			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+				return 1;
 		}
 	}
@@ -220,14 +217,38 @@ static void tcp_delack_timer(unsigned long data)
 		/* Try again later. */
 		tp->ack.blocked = 1;
 		NET_INC_STATS_BH(DelayedACKLocked);
-		tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN);
+		if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN))
+			sock_hold(sk);
 		goto out_unlock;
 	}
 
-	if (tp->ack.pending) {
+	tcp_mem_reclaim(sk);
+
+	if (sk->state == TCP_CLOSE || !(tp->ack.pending&TCP_ACK_TIMER))
+		goto out;
+
+	if ((long)(tp->ack.timeout - jiffies) > 0) {
+		if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
+			sock_hold(sk);
+		goto out;
+	}
+	tp->ack.pending &= ~TCP_ACK_TIMER;
+
+	if (skb_queue_len(&tp->ucopy.prequeue)) {
+		struct sk_buff *skb;
+
+		net_statistics[smp_processor_id()*2].TCPSchedulerFailed += skb_queue_len(&tp->ucopy.prequeue);
+
+		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+			sk->backlog_rcv(sk, skb);
+
+		tp->ucopy.memory = 0;
+	}
+
+	if (tcp_ack_scheduled(tp)) {
 		if (!tp->ack.pingpong) {
 			/* Delayed ACK missed: inflate ATO. */
-			tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX);
+			tp->ack.ato = min(tp->ack.ato<<1, tp->rto);
 		} else {
 			/* Delayed ACK missed: leave pingpong mode and
 			 * deflate ATO.
@@ -240,30 +261,22 @@ static void tcp_delack_timer(unsigned long data)
 	}
 	TCP_CHECK_TIMER(sk);
 
+out:
+	if (tcp_memory_pressure)
+		tcp_mem_reclaim(sk);
 out_unlock:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-static void tcp_probe_timer(unsigned long data)
+static void tcp_probe_timer(struct sock *sk)
 {
-	struct sock *sk = (struct sock*)data;
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 	int max_probes;
 
-	bh_lock_sock(sk);
-	if (sk->lock.users) {
-		/* Try again later. */
-		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5);
-		goto out_unlock;
-	}
-
-	if (sk->state == TCP_CLOSE)
-		goto out_unlock;
-
 	if (tp->packets_out || !tp->send_head) {
 		tp->probes_out = 0;
-		goto out_unlock;
+		return;
 	}
 
 	/* *WARNING* RFC 1122 forbids this
@@ -284,10 +297,12 @@
 	max_probes = sysctl_tcp_retries2;
 
 	if (sk->dead) {
-		if (tcp_out_of_resources(sk, tp->probes_out <= max_probes))
-			goto out_unlock;
+		int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+
+		max_probes = tcp_orphan_retries(sk, alive);
 
-		max_probes = sysctl_tcp_orphan_retries;
+		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+			return;
 	}
 
 	if (tp->probes_out > max_probes) {
@@ -295,284 +310,47 @@
 	} else {
 		/* Only send another probe if we didn't close things up. */
 		tcp_send_probe0(sk);
-		TCP_CHECK_TIMER(sk);
 	}
-out_unlock:
-	bh_unlock_sock(sk);
-	sock_put(sk);
 }
-
-/* Kill off TIME_WAIT sockets once their lifetime has expired.
- */
-static int tcp_tw_death_row_slot = 0;
-int tcp_tw_count = 0;
-
-static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS];
-static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
-static struct timer_list tcp_tw_timer = { function: tcp_twkill };
-
-static void SMP_TIMER_NAME(tcp_twkill)(unsigned long dummy)
-{
-	struct tcp_tw_bucket *tw;
-	int killed = 0;
-
-	/* NOTE: compare this to previous version where lock
-	 * was released after detaching chain. It was racy,
-	 * because tw buckets are scheduled in not serialized context
-	 * in 2.3 (with netfilter), and with softnet it is common, because
-	 * soft irqs are not sequenced.
-	 */
-	spin_lock(&tw_death_lock);
-
-	if (tcp_tw_count == 0)
-		goto out;
-
-	while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) {
-		tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death;
-		tw->pprev_death = NULL;
-		spin_unlock(&tw_death_lock);
-
-		tcp_timewait_kill(tw);
-		tcp_tw_put(tw);
-
-		killed++;
-
-		spin_lock(&tw_death_lock);
-	}
-	tcp_tw_death_row_slot =
-		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
-
-	if ((tcp_tw_count -= killed) != 0)
-		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
-	net_statistics[smp_processor_id()*2].TimeWaited += killed;
-out:
-	spin_unlock(&tw_death_lock);
-}
-
-SMP_TIMER_DEFINE(tcp_twkill, tcp_twkill_task);
-
-/* These are always called from BH context. See callers in
- * tcp_input.c to verify this.
- */
-
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
-{
-	spin_lock(&tw_death_lock);
-	if (tw->pprev_death) {
-		if(tw->next_death)
-			tw->next_death->pprev_death = tw->pprev_death;
-		*tw->pprev_death = tw->next_death;
-		tw->pprev_death = NULL;
-		tcp_tw_put(tw);
-		if (--tcp_tw_count == 0)
-			del_timer(&tcp_tw_timer);
-	}
-	spin_unlock(&tw_death_lock);
-}
-
-/* Short-time timewait calendar */
-
-static int tcp_twcal_hand = -1;
-static int tcp_twcal_jiffie;
-static void tcp_twcal_tick(unsigned long);
-static struct timer_list tcp_twcal_timer = {function: tcp_twcal_tick};
-static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
-
-void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
-{
-	struct tcp_tw_bucket **tpp;
-	int slot;
-
-	/* timeout := RTO * 3.5
-	 *
-	 * 3.5 = 1+2+0.5 to wait for two retransmits.
-	 *
-	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
-	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
-	 * FINs (or previous seqments) are lost (probability of such event
-	 * is p^(N+1), where p is probability to lose single packet and
-	 * time to detect the loss is about RTO*(2^N - 1) with exponential
-	 * backoff). Normal timewait length is calculated so, that we
-	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
-	 * [ BTW Linux. following BSD, violates this requirement waiting
-	 *   only for 60sec, we should wait at least for 240 secs.
-	 *   Well, 240 consumes too much of resources 8)
-	 * ]
-	 * This interval is not reduced to catch old duplicate and
-	 * responces to our wandering segments living for two MSLs.
-	 * However, if we use PAWS to detect
-	 * old duplicates, we can reduce the interval to bounds required
-	 * by RTO, rather than MSL. So, if peer understands PAWS, we
-	 * kill tw bucket after 3.5*RTO (it is important that this number
-	 * is greater than TS tick!) and detect old duplicates with help
-	 * of PAWS.
-	 */
-	slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
-
-	spin_lock(&tw_death_lock);
-
-	/* Unlink it, if it was scheduled */
-	if (tw->pprev_death) {
-		if(tw->next_death)
-			tw->next_death->pprev_death = tw->pprev_death;
-		*tw->pprev_death = tw->next_death;
-		tw->pprev_death = NULL;
-		tcp_tw_count--;
-	} else
-		atomic_inc(&tw->refcnt);
-
-	if (slot >= TCP_TW_RECYCLE_SLOTS) {
-		/* Schedule to slow timer */
-		if (timeo >= TCP_TIMEWAIT_LEN) {
-			slot = TCP_TWKILL_SLOTS-1;
-		} else {
-			slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
-			if (slot >= TCP_TWKILL_SLOTS)
-				slot = TCP_TWKILL_SLOTS-1;
-		}
-		tw->ttd = jiffies + timeo;
-		slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
-		tpp = &tcp_tw_death_row[slot];
-	} else {
-		tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK);
-
-		if (tcp_twcal_hand < 0) {
-			tcp_twcal_hand = 0;
-			tcp_twcal_jiffie = jiffies;
-			tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
-			add_timer(&tcp_twcal_timer);
-		} else {
-			if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK))
-				mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
-			slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
-		}
-		tpp = &tcp_twcal_row[slot];
-	}
-
-	if((tw->next_death = *tpp) != NULL)
-		(*tpp)->pprev_death = &tw->next_death;
-	*tpp = tw;
-	tw->pprev_death = tpp;
-
-	if (tcp_tw_count++ == 0)
-		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
-	spin_unlock(&tw_death_lock);
-}
-
-void SMP_TIMER_NAME(tcp_twcal_tick)(unsigned long dummy)
-{
-	int n, slot;
-	unsigned long j;
-	unsigned long now = jiffies;
-	int killed = 0;
-	int adv = 0;
-
-	spin_lock(&tw_death_lock);
-	if (tcp_twcal_hand < 0)
-		goto out;
-
-	slot = tcp_twcal_hand;
-	j = tcp_twcal_jiffie;
-
-	for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
-		if ((long)(j - now) <= 0) {
-			struct tcp_tw_bucket *tw;
-
-			while((tw = tcp_twcal_row[slot]) != NULL) {
-				tcp_twcal_row[slot] = tw->next_death;
-				tw->pprev_death = NULL;
-
-				tcp_timewait_kill(tw);
-				tcp_tw_put(tw);
-				killed++;
-			}
-		} else {
-			if (!adv) {
-				adv = 1;
-				tcp_twcal_jiffie = j;
-				tcp_twcal_hand = slot;
-			}
-
-			if (tcp_twcal_row[slot] != NULL) {
-				mod_timer(&tcp_twcal_timer, j);
-				goto out;
-			}
-		}
-		j += (1<<TCP_TW_RECYCLE_TICK);
-		slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
-	}
-	tcp_twcal_hand = -1;
-
-out:
-	if ((tcp_tw_count -= killed) == 0)
-		del_timer(&tcp_tw_timer);
-	net_statistics[smp_processor_id()*2].TimeWaitKilled += killed;
-	spin_unlock(&tw_death_lock);
-}
-
-SMP_TIMER_DEFINE(tcp_twcal_tick, tcp_twcal_tasklet);
-
 /*
  *	The TCP retransmit timer.
  */
 
-static void tcp_retransmit_timer(unsigned long data)
+static void tcp_retransmit_timer(struct sock *sk)
 {
-	struct sock *sk = (struct sock*)data;
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	bh_lock_sock(sk);
-	if (sk->lock.users) {
-		/* Try again later */
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20);
-		goto out_unlock;
-	}
-
-	if (sk->state == TCP_CLOSE || tp->packets_out == 0)
-		goto out_unlock;
+	if (tp->packets_out == 0)
+		goto out;
 
 	BUG_TRAP(!skb_queue_empty(&sk->write_queue));
 
 	if (tcp_write_timeout(sk))
-		goto out_unlock;
+		goto out;
 
-	/* RFC 2018, clear all 'sacked' flags in retransmission queue,
-	 * the sender may have dropped out of order frames and we must
-	 * send them out should this timer fire on us.
-	 */
-	if(tp->sack_ok) {
-		struct sk_buff *skb = skb_peek(&sk->write_queue);
-
-		while((skb != NULL) &&
-		      (skb != tp->send_head) &&
-		      (skb != (struct sk_buff *)&sk->write_queue)) {
-			TCP_SKB_CB(skb)->sacked &=
-				~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
-			skb = skb->next;
-		}
+	if (tp->retransmits == 0) {
+		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+			if (tp->sack_ok) {
+				if (tp->ca_state == TCP_CA_Recovery)
+					NET_INC_STATS_BH(TCPSackRecoveryFail);
+				else
+					NET_INC_STATS_BH(TCPSackFailures);
+			} else {
+				if (tp->ca_state == TCP_CA_Recovery)
+					NET_INC_STATS_BH(TCPRenoRecoveryFail);
+				else
+					NET_INC_STATS_BH(TCPRenoFailures);
+			}
+		} else if (tp->ca_state == TCP_CA_Loss) {
+			NET_INC_STATS_BH(TCPLossFailures);
+		} else {
+			NET_INC_STATS_BH(TCPTimeouts);
+		}
 	}
 
-	/* Retransmission. */
-	tp->retrans_head = NULL;
-	tp->rexmt_done = 0;
-	tp->fackets_out = 0;
-	tp->retrans_out = 0;
-	if (tp->retransmits == 0) {
-		/* Remember window where we lost:
-		 * "one half of the current window but at least 2 segments"
-		 *
-		 * Here "current window" means the effective one, which
-		 * means it must be an accurate representation of our current
-		 * sending rate _and_ the snd_wnd.
-		 */
-		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
-		tp->snd_cwnd_cnt = 0;
-		tp->snd_cwnd = 1;
-	}
+	tcp_enter_loss(sk, 0);
 
-	tp->dup_acks = 0;
-	tp->high_seq = tp->snd_nxt;
 	if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
@@ -581,8 +359,7 @@ static void tcp_retransmit_timer(unsigned long data)
 		tp->retransmits=1;
 		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
 				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
-		TCP_CHECK_TIMER(sk);
-		goto out_unlock;
+		goto out;
 	}
 
 	/* Increase the timeout each time we retransmit.  Note that
@@ -606,8 +383,48 @@
 	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 	if (tp->retransmits > sysctl_tcp_retries1)
 		__sk_dst_reset(sk);
+
+out:;
+}
+
+static void tcp_write_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock*)data;
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+	int event;
+
+	bh_lock_sock(sk);
+	if (sk->lock.users) {
+		/* Try again later */
+		if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20)))
+			sock_hold(sk);
+		goto out_unlock;
+	}
+
+	if (sk->state == TCP_CLOSE || !tp->pending)
+		goto out;
+
+	if ((long)(tp->timeout - jiffies) > 0) {
+		if (!mod_timer(&tp->retransmit_timer, tp->timeout))
+			sock_hold(sk);
+		goto out;
+	}
+
+	event = tp->pending;
+	tp->pending = 0;
+
+	switch (event) {
+	case TCP_TIME_RETRANS:
+		tcp_retransmit_timer(sk);
+		break;
+	case TCP_TIME_PROBE0:
+		tcp_probe_timer(sk);
+		break;
+	}
 	TCP_CHECK_TIMER(sk);
+out:
+	tcp_mem_reclaim(sk);
 out_unlock:
 	bh_unlock_sock(sk);
 	sock_put(sk);
@@ -794,6 +611,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	}
 
 	TCP_CHECK_TIMER(sk);
+	tcp_mem_reclaim(sk);
 resched:
 	tcp_reset_keepalive_timer (sk, elapsed);
```
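The core restructuring in this diff folds the separate retransmit and zero-window-probe timer callbacks into a single `tcp_write_timer()`, which records the logical event that is due in `tp->pending` and dispatches on it when the one physical timer fires. A minimal standalone sketch of that pattern, assuming nothing from the kernel (all names here — `struct conn`, `EV_*`, `write_timer` — are illustrative stand-ins, not kernel API):

```c
#include <stdio.h>

enum { EV_NONE, EV_RETRANS, EV_PROBE0 };

struct conn {
	int pending;    /* which logical event the single timer carries */
};

static void retransmit_event(struct conn *c) { (void)c; printf("retransmit\n"); }
static void probe0_event(struct conn *c)     { (void)c; printf("zero-window probe\n"); }

/* One callback for several events, as tcp_write_timer() now is for both
 * TCP_TIME_RETRANS and TCP_TIME_PROBE0: read the event, clear it, dispatch. */
static void write_timer(struct conn *c)
{
	int event = c->pending;

	c->pending = EV_NONE;
	switch (event) {
	case EV_RETRANS:
		retransmit_event(c);
		break;
	case EV_PROBE0:
		probe0_event(c);
		break;
	}
}

int main(void)
{
	struct conn c = { EV_PROBE0 };

	write_timer(&c);        /* prints "zero-window probe" */
	return 0;
}
```

One timer and an event flag replace three timers here, which is why the old `tcp_reset_xmit_timer()` switch and the separate probe timer disappear from this file.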
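The comment in the new `tcp_orphan_retries()` claims that 8 retries correspond to more than 100 seconds at a minimal RTO of 200 ms. That is consistent with the exponential RTO backoff in `tcp_retransmit_timer()`: the n-th expiry waits 0.2 × 2ⁿ seconds, so the initial timeout plus 8 backed-off retries sum to 0.2 × (2⁹ − 1) = 102.2 s. A quick standalone check (the cap below is a stand-in for `TCP_RTO_MAX`, which this sequence never reaches):

```c
#include <stdio.h>

int main(void)
{
	double rto = 0.2;       /* minimal RTO, seconds */
	double rto_max = 120.0; /* stand-in for TCP_RTO_MAX */
	double total = 0.0;
	int n;

	for (n = 0; n <= 8; n++) {      /* first timeout + 8 retries */
		total += rto;
		rto *= 2;               /* exponential backoff */
		if (rto > rto_max)
			rto = rto_max;
	}
	printf("%.1f s\n", total);      /* prints 102.2 */
	return 0;
}
```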
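The TIME_WAIT machinery removed from this file scheduled buckets on a short-time recycle calendar, picking a slot by ceiling-dividing the timeout by the slot width `1 << TCP_TW_RECYCLE_TICK`. A standalone illustration of that slot computation, with an arbitrary tick value (the kernel's `TCP_TW_RECYCLE_TICK` depends on HZ):

```c
#include <stdio.h>

#define TICK 5                          /* slot width: 2^5 = 32 jiffies; illustrative only */

/* Same expression as the removed tcp_tw_schedule():
 *     slot = (timeo + (1 << TICK) - 1) >> TICK;
 * i.e. ceil(timeo / 2^TICK) without a division instruction. */
static unsigned long tw_slot(unsigned long timeo)
{
	return (timeo + (1UL << TICK) - 1) >> TICK;
}

int main(void)
{
	/* 1 jiffy -> slot 1, 32 -> 1, 33 -> 2: rounding is always upward,
	 * so a bucket is never killed before its timeout has elapsed. */
	printf("%lu %lu %lu\n", tw_slot(1), tw_slot(32), tw_slot(33));
	return 0;
}
```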