/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.64 1999/05/27 00:37:31 davem Exp $
 *
 * Authors:	Ross Biro,
 *		Fred N. van Kempen,
 *		Mark Evans,
 *		Corey Minyard,
 *		Florian La Roche,
 *		Charles Hedrick,
 *		Linus Torvalds,
 *		Alan Cox,
 *		Matthew Dillon,
 *		Arnt Gulbrandsen,
 *		Jorge Cwik
 */

#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;

static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
static void tcp_keepalive(unsigned long data);
static void tcp_bucketgc(unsigned long);
static void tcp_twkill(unsigned long);

struct timer_list tcp_slow_timer = {
	NULL, NULL,
	0, 0,
	tcp_sltimer_handler,
};

struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
	{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},	/* SYNACK	*/
	{ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive},	/* KEEPALIVE	*/
	{ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill},		/* TWKILL	*/
	{ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc}		/* BUCKETGC	*/
};

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";

/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */

void tcp_init_xmit_timers(struct sock *sk)
{
	init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
	sk->tp_pinfo.af_tcp.retransmit_timer.function = &tcp_retransmit_timer;
	sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;

	init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
	sk->tp_pinfo.af_tcp.delack_timer.function = &tcp_delack_timer;
	sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;

	init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
	sk->tp_pinfo.af_tcp.probe_timer.function = &tcp_probe_timer;
	sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
}

/*
 * Reset the retransmission timer.
 */

void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	switch (what) {
	case TIME_RETRANS:
		/* When setting the transmit timer the probe timer
		 * should not be set.
		 * The delayed ack timer can be set if we are changing the
		 * retransmit timer when removing acked frames.
		 */
		if (tp->probe_timer.prev)
			del_timer(&tp->probe_timer);
		mod_timer(&tp->retransmit_timer, jiffies + when);
		break;

	case TIME_DACK:
		mod_timer(&tp->delack_timer, jiffies + when);
		break;

	case TIME_PROBE0:
		mod_timer(&tp->probe_timer, jiffies + when);
		break;

	case TIME_WRITE:
		printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
		break;

	default:
		printk(KERN_DEBUG "bug: unknown timer value\n");
	}
}

void tcp_clear_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if (tp->retransmit_timer.prev)
		del_timer(&tp->retransmit_timer);
	if (tp->delack_timer.prev)
		del_timer(&tp->delack_timer);
	if (tp->probe_timer.prev)
		del_timer(&tp->probe_timer);
}

static int tcp_write_err(struct sock *sk, int force)
{
	sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
	sk->error_report(sk);

	tcp_clear_xmit_timers(sk);

	/* Time wait the socket. */
	if (!force && ((1 << sk->state) &
		       (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING))) {
		tcp_time_wait(sk);
	} else {
		/* Clean up time. */
		tcp_set_state(sk, TCP_CLOSE);
		return 0;
	}
	return 1;
}
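/* The test above relies on each TCP state number doubling as a bit position:
 * "(1 << state) & mask" checks membership of the state in a set of states
 * with a single AND.  A minimal userland sketch of the same idiom is kept
 * below under "#if 0"; the enum values and masks are illustrative only and
 * do not mirror the real TCP_* numbering.
 */
#if 0
#include <stdio.h>

enum demo_state {
	DEMO_ESTABLISHED = 1,
	DEMO_FIN_WAIT1,
	DEMO_FIN_WAIT2,
	DEMO_CLOSING
};

#define DEMOF_FIN_WAIT1	(1 << DEMO_FIN_WAIT1)
#define DEMOF_FIN_WAIT2	(1 << DEMO_FIN_WAIT2)
#define DEMOF_CLOSING	(1 << DEMO_CLOSING)

static int demo_wants_time_wait(int state)
{
	/* One AND replaces a chain of "state == X || state == Y" tests. */
	return (1 << state) & (DEMOF_FIN_WAIT1 | DEMOF_FIN_WAIT2 | DEMOF_CLOSING);
}

int main(void)
{
	printf("FIN_WAIT1 -> %d, ESTABLISHED -> %d\n",
	       demo_wants_time_wait(DEMO_FIN_WAIT1) != 0,
	       demo_wants_time_wait(DEMO_ESTABLISHED) != 0);
	return 0;	/* prints "FIN_WAIT1 -> 1, ESTABLISHED -> 0" */
}
#endif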
/* A write timeout has occurred.  Process the after-effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Look for a 'soft' timeout. */
	if ((sk->state == TCP_ESTABLISHED &&
	     tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
	    (sk->state != TCP_ESTABLISHED &&
	     tp->retransmits > sysctl_tcp_retries1)) {
		dst_negative_advice(&sk->dst_cache);
	}

	/* Have we tried to SYN too many times? (repent repent 8)) */
	if (tp->retransmits > sysctl_tcp_syn_retries && sk->state == TCP_SYN_SENT) {
		tcp_write_err(sk, 1);
		/* Don't FIN, we got nothing back. */
		return 0;
	}

	/* Has it gone just too far? */
	if (tp->retransmits > sysctl_tcp_retries2)
		return tcp_write_err(sk, 0);

	return 1;
}

void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;

	bh_lock_sock(sk);
	if (!sk->zapped &&
	    sk->tp_pinfo.af_tcp.delayed_acks &&
	    sk->state != TCP_CLOSE) {
		if (!sk->lock.users)
			tcp_send_ack(sk);
		else
			tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ / 10);
	}
	bh_unlock_sock(sk);
}

void tcp_probe_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if (sk->zapped)
		return;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ / 5);
		bh_unlock_sock(sk);
		return;
	}

	/* *WARNING* RFC 1122 forbids this.
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 * FIXME: We ought not to do this; Solaris 2.5 actually lists fixing
	 * this behaviour as a bug fix. [AC]
	 */
	if (tp->probes_out > sysctl_tcp_retries2) {
		if (sk->err_soft)
			sk->err = sk->err_soft;
		else
			sk->err = ETIMEDOUT;
		sk->error_report(sk);

		if ((1 << sk->state) &
		    (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING)) {
			/* Time wait the socket. */
			tcp_time_wait(sk);
		} else {
			/* Clean up time. */
			tcp_set_state(sk, TCP_CLOSE);
		}
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
	bh_unlock_sock(sk);
}

static __inline__ int tcp_keepopen_proc(struct sock *sk)
{
	int res = 0;

	if ((1 << sk->state) &
	    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT2)) {
		struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
		__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;

		if (elapsed >= sysctl_tcp_keepalive_time) {
			if (tp->probes_out > sysctl_tcp_keepalive_probes) {
				if (sk->err_soft)
					sk->err = sk->err_soft;
				else
					sk->err = ETIMEDOUT;

				tcp_set_state(sk, TCP_CLOSE);
				sk->shutdown = SHUTDOWN_MASK;
				if (!sk->dead)
					sk->state_change(sk);
			} else {
				tp->probes_out++;
				tp->pending = TIME_KEEPOPEN;
				tcp_write_wakeup(sk);
				res = 1;
			}
		}
	}
	return res;
}

/* Garbage collect TCP bind buckets. */
static void tcp_bucketgc(unsigned long data)
{
	int i, reaped = 0;

	SOCKHASH_LOCK_WRITE_BH();
	for (i = 0; i < tcp_bhash_size; i++) {
		struct tcp_bind_bucket *tb = tcp_bhash[i];

		while (tb) {
			struct tcp_bind_bucket *next = tb->next;

			if ((tb->owners == NULL) &&
			    !(tb->flags & TCPB_FLAG_LOCKED)) {
				reaped++;

				/* Unlink bucket. */
				if (tb->next)
					tb->next->pprev = tb->pprev;
				*tb->pprev = tb->next;

				/* Finally, free it up. */
				kmem_cache_free(tcp_bucket_cachep, tb);
			}
			tb = next;
		}
	}
	SOCKHASH_UNLOCK_WRITE_BH();

	if (reaped != 0) {
		struct tcp_sl_timer *slt = (struct tcp_sl_timer *) data;

		/* Eat timer references. */
		atomic_sub(reaped, &slt->count);
	}
}
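/* tcp_bucketgc() (and the TIME_WAIT death row below) unlink nodes with the
 * "pprev" idiom: each node stores the address of the pointer that points at
 * it, so removal is O(1) and needs no walk back to the list head.  The
 * userland sketch under "#if 0" shows the same pattern with an illustrative
 * node type; it is not taken from the kernel headers.
 */
#if 0
#include <stdio.h>

struct demo_node {
	int			val;
	struct demo_node	*next;
	struct demo_node	**pprev;	/* address of the pointer pointing at us */
};

static void demo_link(struct demo_node **head, struct demo_node *n)
{
	if ((n->next = *head) != NULL)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

static void demo_unlink(struct demo_node *n)
{
	if (n->next)
		n->next->pprev = n->pprev;
	*n->pprev = n->next;
	n->pprev = NULL;
}

int main(void)
{
	struct demo_node *head = NULL, *p;
	struct demo_node a = { 1 }, b = { 2 }, c = { 3 };

	demo_link(&head, &a);
	demo_link(&head, &b);
	demo_link(&head, &c);
	demo_unlink(&b);		/* no list walk required */
	for (p = head; p; p = p->next)
		printf("%d ", p->val);	/* prints "3 1 " */
	printf("\n");
	return 0;
}
#endif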
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
int tcp_tw_death_row_slot = 0;
static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] = {
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};

extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);

static void tcp_twkill(unsigned long data)
{
	struct tcp_tw_bucket *tw;
	int killed = 0;

	/* The death-row tw chains are only ever touched
	 * in BH context so no locking is needed.
	 */
	tw = tcp_tw_death_row[tcp_tw_death_row_slot];
	tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
	tcp_tw_death_row_slot =
		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));

	while (tw != NULL) {
		struct tcp_tw_bucket *next = tw->next_death;

		tcp_timewait_kill(tw);
		killed++;
		tw = next;
	}
	if (killed != 0) {
		struct tcp_sl_timer *slt = (struct tcp_sl_timer *) data;

		atomic_sub(killed, &slt->count);
	}
}

/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */
void tcp_tw_schedule(struct tcp_tw_bucket *tw)
{
	int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
	struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];

	SOCKHASH_LOCK_WRITE_BH();
	if ((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	tw->death_slot = slot;
	SOCKHASH_UNLOCK_WRITE_BH();

	tcp_inc_slow_timer(TCP_SLT_TWKILL);
}

/* Happens rarely if at all, no care about scalability here. */
void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
{
	struct tcp_tw_bucket **tpp;
	int slot;

	SOCKHASH_LOCK_WRITE_BH();
	if (tw->next_death)
		tw->next_death->pprev_death = tw->pprev_death;
	*tw->pprev_death = tw->next_death;
	tw->pprev_death = NULL;

	slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
	tpp = &tcp_tw_death_row[slot];
	if ((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	tw->death_slot = slot;
	SOCKHASH_UNLOCK_WRITE_BH();

	/* Timer was incremented when we first entered the table. */
}

/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
	SOCKHASH_LOCK_WRITE_BH();
	if (tw->next_death)
		tw->next_death->pprev_death = tw->pprev_death;
	*tw->pprev_death = tw->next_death;
	tw->pprev_death = NULL;
	SOCKHASH_UNLOCK_WRITE_BH();

	tcp_dec_slow_timer(TCP_SLT_TWKILL);
}
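/* The death row above is a small timer wheel: tcp_twkill() empties one slot
 * per TWKILL period and advances, while tcp_tw_schedule() inserts into the
 * slot just behind the reaper, (slot - 1) & (TCP_TWKILL_SLOTS - 1), so a new
 * entry waits almost a full revolution of the wheel before it is reaped.
 * A minimal userland sketch of that slot arithmetic follows under "#if 0";
 * the slot count and tick driver are illustrative, not the kernel's.
 */
#if 0
#include <stdio.h>

#define DEMO_SLOTS	8	/* must be a power of two, like TCP_TWKILL_SLOTS */

static int demo_reap_slot;	/* slot the reaper will empty next */

static int demo_schedule_slot(void)
{
	/* Insert just behind the reaper so we survive ~DEMO_SLOTS ticks. */
	return (demo_reap_slot - 1) & (DEMO_SLOTS - 1);
}

static void demo_tick(void)
{
	/* A real reaper would empty demo_reap_slot here, then advance. */
	demo_reap_slot = (demo_reap_slot + 1) & (DEMO_SLOTS - 1);
}

int main(void)
{
	int entry = demo_schedule_slot();	/* scheduled into slot 7 */
	int ticks = 0;

	while (demo_reap_slot != entry) {
		demo_tick();
		ticks++;
	}
	printf("entry reaped after %d ticks\n", ticks);	/* 7 of 8 slots */
	return 0;
}
#endif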
/*
 *	Check all sockets for the keepalive timer.
 *	Called every 75 seconds.
 *	This timer is started by the af_inet init routine and is constantly
 *	running.
 *
 *	It might be better to maintain a count of sockets that need it using
 *	setsockopt/tcp_destroy_sk and only set the timer when needed.
 */

/*
 *	Don't send over 5 keepopens at a time to avoid burstiness
 *	on big servers [AC]
 */
#define MAX_KA_PROBES	5

int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;

/* Keepopens are only valid for "established" TCPs; nicely, our listener
 * hash gets rid of most of the useless testing, so we run through a couple
 * of the established hash chains each clock tick. -DaveM
 *
 * And now, even more magic... TIME_WAIT TCPs cannot have keepalive probes
 * going off for them, so we only need to check the first half of the
 * established hash table, even less testing under heavy load.
 *
 * I _really_ would rather do this by adding a new timer_struct to struct
 * sock; that way only those who set the keepalive option would get the
 * overhead.  The idea is you set it for 2 hours when the sock is first
 * connected; when it does fire off (if at all; most sockets die earlier)
 * you check for the keepalive option and whether the sock has been idle
 * long enough to start probing.
 */
static void tcp_keepalive(unsigned long data)
{
	static int chain_start = 0;
	int count = 0;
	int i;

	SOCKHASH_LOCK_READ_BH();
	for (i = chain_start; i < (chain_start + ((tcp_ehash_size >> 1) >> 2)); i++) {
		struct sock *sk;

		sk = tcp_ehash[i];
		while (sk) {
			struct sock *next = sk->next;

			bh_lock_sock(sk);
			if (sk->keepopen && !sk->lock.users) {
				SOCKHASH_UNLOCK_READ_BH();
				count += tcp_keepopen_proc(sk);
				SOCKHASH_LOCK_READ_BH();
			}
			bh_unlock_sock(sk);
			if (count == sysctl_tcp_max_ka_probes)
				goto out;
			sk = next;
		}
	}
out:
	SOCKHASH_UNLOCK_READ_BH();
	chain_start = ((chain_start + ((tcp_ehash_size >> 1) >> 2)) &
		       ((tcp_ehash_size >> 1) - 1));
}
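/* tcp_keepalive() spreads its work over successive periods: each invocation
 * scans ((tcp_ehash_size >> 1) >> 2) chains starting at chain_start, then
 * advances chain_start modulo half the table, so the keepalive-relevant half
 * of the established hash is covered in four passes.  A userland sketch of
 * that incremental scan is kept under "#if 0"; the table size and stride are
 * illustrative, not the kernel's.
 */
#if 0
#include <stdio.h>

#define DEMO_TABLE_SIZE	32			/* stands in for tcp_ehash_size >> 1 */
#define DEMO_STRIDE	(DEMO_TABLE_SIZE >> 2)	/* buckets scanned per tick */

static int demo_start;				/* persists across ticks, like chain_start */

static void demo_tick(void)
{
	int i;

	for (i = demo_start; i < demo_start + DEMO_STRIDE; i++)
		printf("%d ", i);		/* "scan" one bucket */
	printf("\n");

	demo_start = (demo_start + DEMO_STRIDE) & (DEMO_TABLE_SIZE - 1);
}

int main(void)
{
	int tick;

	/* Four ticks cover all 32 buckets exactly once, then wrap around. */
	for (tick = 0; tick < 4; tick++)
		demo_tick();
	return 0;
}
#endif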
/*
 *	The TCP retransmit timer.  This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC 1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *		[Unless someone has broken it, it does, except for one broken
 *		2.0 case of a send when the route/device is directly
 *		unreachable, where we error but should retry! - FIXME] [AC]
 */
void tcp_retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	/* We are reset.  We will send no more retransmits. */
	if (sk->zapped) {
		tcp_clear_xmit_timer(sk, TIME_RETRANS);
		return;
	}

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ / 20);
		bh_unlock_sock(sk);
		return;
	}

	/* Clear the delayed ack timer. */
	tcp_clear_xmit_timer(sk, TIME_DACK);

	/* Per RFC 2018, clear all 'sacked' flags in the retransmission queue:
	 * the sender may have dropped out-of-order frames and we must send
	 * them out should this timer fire on us.
	 */
	if (tp->sack_ok) {
		struct sk_buff *skb = skb_peek(&sk->write_queue);

		while ((skb != NULL) &&
		       (skb != tp->send_head) &&
		       (skb != (struct sk_buff *) &sk->write_queue)) {
			TCP_SKB_CB(skb)->sacked &=
				~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
			skb = skb->next;
		}
	}

	/* Retransmission. */
	tp->retrans_head = NULL;
	tp->rexmt_done = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;
	if (tp->retransmits == 0) {
		/* Remember window where we lost:
		 * "one half of the current window but at least 2 segments"
		 *
		 * Here "current window" means the effective one, which
		 * means it must be an accurate representation of our current
		 * sending rate _and_ the snd_wnd.
		 */
		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd = 1;
	}

	tp->retransmits++;

	tp->dup_acks = 0;
	tp->high_seq = tp->snd_nxt;
	tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely.  We will have to fix
	 * the 120 second clamps though!
	 */
	tp->backoff++;
	tp->rto = min(tp->rto << 1, 120 * HZ);
	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);

	tcp_write_timeout(sk);

	bh_unlock_sock(sk);
}

/*
 *	Slow timer for SYN-RECV sockets.
 */

static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now)
{
	struct open_request *prev, *req;

	prev = (struct open_request *) &tp->syn_wait_queue;
	for (req = tp->syn_wait_queue; req; ) {
		struct open_request *next = req->dl_next;

		if (!req->sk) {
			tcp_synq_unlink(tp, req, prev);
			if (req->retrans >= sysctl_tcp_retries1) {
				(*req->class->destructor)(req);
				tcp_dec_slow_timer(TCP_SLT_SYNACK);
				tp->syn_backlog--;
				tcp_openreq_free(req);
				if (!tp->syn_wait_queue)
					break;
			} else {
				unsigned long timeo;
				struct open_request *rp;

				(*req->class->rtx_syn_ack)(sk, req);
				req->retrans++;
				timeo = min((TCP_TIMEOUT_INIT << req->retrans),
					    (120 * HZ));
				req->expires = now + timeo;
				rp = prev->dl_next;
				tcp_synq_queue(tp, req);
				if (rp != prev->dl_next)
					prev = prev->dl_next;
			}
		} else
			prev = req;
		req = next;
	}
}

/* This now scales very nicely. -DaveM */
static void tcp_syn_recv_timer(unsigned long data)
{
	struct sock *sk;
	unsigned long now = jiffies;
	int i;

	SOCKHASH_LOCK_READ_BH();
	for (i = 0; i < TCP_LHTABLE_SIZE; i++) {
		sk = tcp_listening_hash[i];
		while (sk) {
			struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

			/* TCP_LISTEN is implied. */
			bh_lock_sock(sk);
			if (!sk->lock.users && tp->syn_wait_queue)
				tcp_do_syn_queue(sk, tp, now);
			bh_unlock_sock(sk);
			sk = sk->next;
		}
	}
	SOCKHASH_UNLOCK_READ_BH();
}

void tcp_sltimer_handler(unsigned long data)
{
	struct tcp_sl_timer *slt = tcp_slt_array;
	unsigned long next = ~0UL;
	unsigned long now = jiffies;
	int i;

	for (i = 0; i < TCP_SLT_MAX; i++, slt++) {
		if (atomic_read(&slt->count)) {
			long trigger;

			trigger = slt->period - ((long)(now - slt->last));

			if (trigger <= 0) {
				(*slt->handler)((unsigned long) slt);
				slt->last = now;
				trigger = slt->period;
			}

			/* Only reschedule if some events remain. */
			if (atomic_read(&slt->count))
				next = min(next, trigger);
		}
	}
	if (next != ~0UL)
		mod_timer(&tcp_slow_timer, (now + next));
}

void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
{
	unsigned long now = jiffies;
	unsigned long when;

	slt->last = now;

	when = now + slt->period;

	if (tcp_slow_timer.prev) {
		if ((long)(tcp_slow_timer.expires - when) >= 0)
			mod_timer(&tcp_slow_timer, when);
	} else {
		tcp_slow_timer.expires = when;
		add_timer(&tcp_slow_timer);
	}
}
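/* Both tcp_sltimer_handler() and __tcp_inc_slow_timer() compare jiffies
 * values by casting their difference to a signed long, e.g.
 * "(long)(tcp_slow_timer.expires - when) >= 0".  Because jiffies wraps
 * around, a plain "a >= b" gives the wrong answer near the wrap point,
 * while the signed difference stays correct as long as the two times are
 * less than half the counter range apart.  A userland sketch of the idiom
 * is kept below under "#if 0"; the counter values are made up for
 * illustration.
 */
#if 0
#include <stdio.h>

/* Returns nonzero if time a is the same as or later than time b. */
static int time_after_eq_demo(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;
}

int main(void)
{
	unsigned long before_wrap = ~0UL - 5;	/* just before the counter wraps */
	unsigned long after_wrap  = 10;		/* just after the wrap */

	/* The naive comparison is fooled by the wrap; the signed diff is not. */
	printf("naive:  %d\n", after_wrap >= before_wrap);			/* 0 */
	printf("signed: %d\n", time_after_eq_demo(after_wrap, before_wrap));	/* 1 */
	return 0;
}
#endif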