diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 21 | ||||
-rw-r--r-- | net/ipv4/fib_rules.c | 4 | ||||
-rw-r--r-- | net/ipv4/ip_fragment.c | 9 | ||||
-rw-r--r-- | net/ipv4/ip_fw.c | 10 | ||||
-rw-r--r-- | net/ipv4/ip_masq.c | 4 | ||||
-rw-r--r-- | net/ipv4/ip_masq_autofw.c | 10 | ||||
-rw-r--r-- | net/ipv4/ip_masq_mod.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 289 | ||||
-rw-r--r-- | net/ipv4/proc.c | 6 | ||||
-rw-r--r-- | net/ipv4/route.c | 8 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 127 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 508 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 626 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 1228 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 82 | ||||
-rw-r--r-- | net/ipv4/timer.c | 4 | ||||
-rw-r--r-- | net/ipv4/udp.c | 20 |
18 files changed, 1446 insertions, 1516 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ef1c44620..6667b8d72 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -274,7 +274,7 @@ static int inet_autobind(struct sock *sk) sk->num = sk->prot->good_socknum(); if (sk->num == 0) return(-EAGAIN); - sk->dummy_th.source = htons(sk->num); + sk->sport = htons(sk->num); sk->prot->hash(sk); add_to_prot_sklist(sk); } @@ -304,6 +304,7 @@ int inet_listen(struct socket *sock, int backlog) if (sk->state != TCP_LISTEN) { sk->ack_backlog = 0; sk->state = TCP_LISTEN; + dst_release(xchg(&sk->dst_cache, NULL)); sk->prot->rehash(sk); add_to_prot_sklist(sk); } @@ -348,7 +349,6 @@ static int inet_create(struct socket *sock, int protocol) switch (sock->type) { case SOCK_STREAM: - /* Note for tcp that also wiped the dummy_th block for us. */ if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; @@ -412,17 +412,13 @@ static int inet_create(struct socket *sock, int protocol) sk->ip_mc_index=0; sk->ip_mc_list=NULL; - /* Speed up by setting some standard state for the dummy_th - * if TCP uses it (maybe move to tcp_init later) - */ - if (sk->num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ - sk->dummy_th.source = htons(sk->num); + sk->sport = htons(sk->num); /* Add to protocol hash chains. 
*/ sk->prot->hash(sk); @@ -552,9 +548,9 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EADDRINUSE; sk->num = snum; - sk->dummy_th.source = htons(snum); + sk->sport = htons(snum); sk->daddr = 0; - sk->dummy_th.dest = 0; + sk->dport = 0; sk->prot->rehash(sk); add_to_prot_sklist(sk); dst_release(sk->dst_cache); @@ -753,13 +749,13 @@ static int inet_getname(struct socket *sock, struct sockaddr *uaddr, if (peer) { if (!tcp_connected(sk->state)) return(-ENOTCONN); - sin->sin_port = sk->dummy_th.dest; + sin->sin_port = sk->dport; sin->sin_addr.s_addr = sk->daddr; } else { __u32 addr = sk->rcv_saddr; if (!addr) addr = sk->saddr; - sin->sin_port = sk->dummy_th.source; + sin->sin_port = sk->sport; sin->sin_addr.s_addr = addr; } *uaddr_len = sizeof(*sin); @@ -798,7 +794,8 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size, struct sock *sk = sock->sk; if (sk->shutdown & SEND_SHUTDOWN) { - send_sig(SIGPIPE, current, 1); + if (!(msg->msg_flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 1); return(-EPIPE); } if (sk->prot->sendmsg == NULL) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7ec60a5be..cd9b5ba21 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. 
* - * Version: $Id: fib_rules.c,v 1.3 1998/03/08 05:56:17 davem Exp $ + * Version: $Id: fib_rules.c,v 1.4 1998/03/21 07:27:58 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -86,7 +86,7 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rtm->rtm_dst_len == r->r_dst_len && (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) && rtm->rtm_tos == r->r_tos && - rtm->rtm_type == r->r_action && + (!rtm->rtm_type || rtm->rtm_type == r->r_action) && (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && (!rta[RTA_IFNAME-1] || strcmp(RTA_DATA(rta[RTA_IFNAME-1]), r->r_ifname) == 0) && (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e6831adb8..21205362f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.32 1998/03/08 05:56:21 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.33 1998/03/19 08:34:08 davem Exp $ * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> @@ -430,11 +430,8 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) qp->ihlen = ihl; memcpy(qp->iph, iph, ihl+8); } - del_timer(&qp->timer); - qp->timer.expires = jiffies + sysctl_ipfrag_time; /* about 30 seconds */ - qp->timer.data = (unsigned long) qp; /* pointer to queue */ - qp->timer.function = ip_expire; /* expire function */ - add_timer(&qp->timer); + /* about 30 seconds */ + mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); } else { /* If we failed to create it, then discard the frame. 
*/ if ((qp = ip_create(skb, iph)) == NULL) { diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index 4eb41c325..b364f66de 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -683,11 +683,6 @@ static int insert_in_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl, if ((ftmp->fw_vianame)[0]) { if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) ftmp->fw_viadev = (struct device *) -1; - } else if (ftmp->fw_via.s_addr) { - if (!(ftmp->fw_viadev = ip_dev_find(ftmp->fw_via.s_addr))) - ftmp->fw_viadev = (struct device *) -1; - else - memcpy(ftmp->fw_vianame, ftmp->fw_viadev->name, IFNAMSIZ); } else ftmp->fw_viadev = NULL; @@ -732,11 +727,6 @@ static int append_to_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl, if ((ftmp->fw_vianame)[0]) { if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) ftmp->fw_viadev = (struct device *) -1; - } else if (ftmp->fw_via.s_addr) { - if (!(ftmp->fw_viadev = ip_dev_find(ftmp->fw_via.s_addr))) - ftmp->fw_viadev = (struct device *) -1; - else - memcpy(ftmp->fw_vianame, ftmp->fw_viadev->name, IFNAMSIZ); } else ftmp->fw_viadev = NULL; diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c index dc367a289..cf92b1638 100644 --- a/net/ipv4/ip_masq.c +++ b/net/ipv4/ip_masq.c @@ -1819,13 +1819,9 @@ int ip_masq_ctl(int optname, void *arg, int arglen) struct ip_fw_masqctl *mctl = arg; int ret = EINVAL; - ip_masq_lockz(&__ip_masq_lock, &masq_wait, 0); - if (1) /* (mctl->mctl_action == IP_MASQ_MOD_CTL) */ ret = ip_masq_mod_ctl(optname, mctl, arglen); - ip_masq_unlockz(&__ip_masq_lock, &masq_wait, 0); - return ret; } diff --git a/net/ipv4/ip_masq_autofw.c b/net/ipv4/ip_masq_autofw.c index 30493d4cd..27b98bb03 100644 --- a/net/ipv4/ip_masq_autofw.c +++ b/net/ipv4/ip_masq_autofw.c @@ -119,10 +119,8 @@ static __inline__ void ip_autofw_update_out (__u32 who, __u32 where, __u16 port, { if (af->flags & IP_AUTOFW_USETIME) { - if (af->timer.expires) - del_timer(&af->timer); - af->timer.expires=jiffies+IP_AUTOFW_EXPIRE; - 
add_timer(&af->timer); + mod_timer(&af->timer, + jiffies+IP_AUTOFW_EXPIRE); } af->flags|=IP_AUTOFW_ACTIVE; af->lastcontact=where; @@ -139,9 +137,7 @@ static __inline__ void ip_autofw_update_in (__u32 where, __u16 port, __u16 proto af=ip_autofw_check_range(where, port,protocol); if (af) { - del_timer(&af->timer); - af->timer.expires=jiffies+IP_AUTOFW_EXPIRE; - add_timer(&af->timer); + mod_timer(&af->timer, jiffies+IP_AUTOFW_EXPIRE); } } #endif diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c index 2265161f3..f6a50dfc6 100644 --- a/net/ipv4/ip_masq_mod.c +++ b/net/ipv4/ip_masq_mod.c @@ -275,7 +275,7 @@ struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name) IP_MASQ_DEBUG(1, "searching mmod_name \"%s\"\n", mmod_name); - for (mmod=ip_masq_mod_reg_base; mmod ; mmod=mmod->next) { + for (mmod=ip_masq_mod_reg_base; mmod ; mmod=mmod->next_reg) { if (mmod->mmod_ctl && *(mmod_name) && (strcmp(mmod_name, mmod->mmod_name)==0)) { /* HIT */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 63fbbfe1e..69179738e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -81,46 +81,24 @@ int sysctl_ip_dynaddr = 0; int ip_id_count = 0; -int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, - struct ip_options *opt) +/* Generate a checksum for an outgoing IP datagram. 
*/ +__inline__ void ip_send_check(struct iphdr *iph) { - struct rtable *rt; - u32 final_daddr = daddr; + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); +} + +void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + u32 saddr, u32 daddr, struct ip_options *opt) +{ + struct rtable *rt = (struct rtable *)skb->dst; struct iphdr *iph; - int err; - if (opt && opt->srr) - daddr = opt->faddr; - - err = ip_route_output(&rt, daddr, saddr, RT_TOS(sk->ip_tos) | - RTO_CONN | sk->localroute, sk->bound_dev_if); - if (err) - { - ip_statistics.IpOutNoRoutes++; - return err; - } - - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - ip_rt_put(rt); - ip_statistics.IpOutNoRoutes++; - return -ENETUNREACH; - } - - skb->dst = dst_clone(&rt->u.dst); - skb_reserve(skb, (rt->u.dst.dev->hard_header_len+15)&~15); - - /* - * Now build the IP header. - */ - - /* - * Build the IP addresses - */ - + /* Build the IP header. */ if (opt) - iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr) + opt->optlen); + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); else - iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr)); + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); iph->version = 4; iph->ihl = 5; @@ -133,92 +111,19 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->protocol; + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); skb->nh.iph = iph; - skb->h.raw = (unsigned char*)(iph+1); - if (opt && opt->optlen) - { + if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; - skb->h.raw += opt->optlen; - ip_options_build(skb, opt, final_daddr, rt, 0); + ip_options_build(skb, opt, daddr, rt, 0); } - - ip_rt_put(rt); - return 0; -} -/* - * This routine builds the appropriate hardware/IP headers for - * the routine. 
- */ -int ip_build_header(struct sk_buff *skb, struct sock *sk) -{ - struct rtable *rt; - struct ip_options *opt = sk->opt; - u32 daddr = sk->daddr; - u32 final_daddr = daddr; - struct iphdr *iph; - int err; - - if (opt && opt->srr) - daddr = opt->faddr; - - rt = (struct rtable*)sk->dst_cache; - - if (!rt || rt->u.dst.obsolete) { - sk->dst_cache = NULL; - ip_rt_put(rt); - err = ip_route_output(&rt, daddr, sk->saddr, RT_TOS(sk->ip_tos) | - RTO_CONN | sk->localroute, sk->bound_dev_if); - if (err) - return err; - sk->dst_cache = &rt->u.dst; - } - - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - sk->dst_cache = NULL; - ip_rt_put(rt); - ip_statistics.IpOutNoRoutes++; - return -ENETUNREACH; - } - - skb->dst = dst_clone(sk->dst_cache); - skb_reserve(skb, MAX_HEADER); - - /* - * Now build the IP header. - */ - - /* - * Build the IP addresses - */ - - if (opt) - iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr) + opt->optlen); - else - iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr)); - - iph->version = 4; - iph->ihl = 5; - iph->tos = sk->ip_tos; - iph->frag_off = 0; - if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->u.dst.mxlock&(1<<RTAX_MTU))) - iph->frag_off |= htons(IP_DF); - iph->ttl = sk->ip_ttl; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->protocol = sk->protocol; - skb->nh.iph = iph; - skb->h.raw = (unsigned char*)(iph+1); - - if (!opt || !opt->optlen) - return 0; - iph->ihl += opt->optlen>>2; - skb->h.raw += opt->optlen; - ip_options_build(skb, opt, final_daddr, rt, 0); + ip_send_check(iph); - return 0; + /* Send it out. */ + skb->dst->output(skb); } int __ip_finish_output(struct sk_buff *skb) @@ -322,78 +227,101 @@ int ip_acct_output(struct sk_buff *skb) } #endif -/* - * Generate a checksum for an outgoing IP datagram. 
- */ - -void ip_send_check(struct iphdr *iph) -{ - iph->check = 0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); -} - - - -/* - * Queues a packet to be sent, and starts the transmitter if necessary. +/* Queues a packet to be sent, and starts the transmitter if necessary. * This routine also needs to put in the total length and compute the - * checksum + * checksum. We use to do this in two stages, ip_build_header() then + * this, but that scheme created a mess when routes disappeared etc. + * So we do it all here, and the TCP send engine has been changed to + * match. (No more unroutable FIN disasters, etc. wheee...) This will + * most likely make other reliable transport layers above IP easier + * to implement under Linux. */ - void ip_queue_xmit(struct sk_buff *skb) { struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable*)skb->dst; + struct ip_options *opt = sk->opt; + struct rtable *rt; struct device *dev; + struct iphdr *iph; unsigned int tot_len; - struct iphdr *iph = skb->nh.iph; - tot_len = skb->len; - iph->tot_len = htons(tot_len); - iph->id = htons(ip_id_count++); + /* Make sure we can route this packet. */ + rt = (struct rtable *) sk->dst_cache; + if(rt == NULL || rt->u.dst.obsolete) { + u32 daddr; - if (rt->u.dst.obsolete) { - /* Ugly... ugly... but what can I do? - Essentially it is "ip_reroute_output" function. --ANK - */ - struct rtable *nrt; - if (ip_route_output(&nrt, rt->key.dst, rt->key.src, - rt->key.tos | RTO_CONN, - sk?sk->bound_dev_if:0)) - goto drop; - skb->dst = &nrt->u.dst; + sk->dst_cache = NULL; ip_rt_put(rt); - rt = nrt; + + /* Use correct destination address if we have options. */ + daddr = sk->daddr; + if(opt && opt->srr) + daddr = opt->faddr; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times itself + * out. 
+ */ + if(ip_route_output(&rt, daddr, sk->saddr, + RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if)) + goto drop; + sk->dst_cache = &rt->u.dst; + } + if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto no_route; + + /* We have a route, so grab a reference. */ + skb->dst = dst_clone(sk->dst_cache); + + /* OK, we know where to send it, allocate and build IP header. */ + iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + iph->version = 4; + iph->ihl = 5; + iph->tos = sk->ip_tos; + iph->frag_off = 0; + if(sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU))) + iph->frag_off |= __constant_htons(IP_DF); + iph->ttl = sk->ip_ttl; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->protocol = sk->protocol; + skb->nh.iph = iph; + /* Transport layer set skb->h.foo itself. */ + + if(opt && opt->optlen) { + iph->ihl += opt->optlen >> 2; + ip_options_build(skb, opt, sk->daddr, rt, 0); } + tot_len = skb->len; + iph->tot_len = htons(tot_len); + iph->id = htons(ip_id_count++); + dev = rt->u.dst.dev; - if (call_out_firewall(PF_INET, dev, iph, NULL,&skb) < FW_ACCEPT) + if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT) goto drop; #ifdef CONFIG_NET_SECURITY - /* - * Add an IP checksum (must do this before SECurity because - * of possible tunneling) + /* Add an IP checksum (must do this before SECurity because + * of possible tunneling). */ - ip_send_check(iph); - - if (call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 4, &skb)<FW_ACCEPT) + if (call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 4, &skb) < FW_ACCEPT) goto drop; - iph = skb->nh.iph; - /* don't update tot_len, as the dev->mtu is already decreased */ + /* Don't update tot_len, as the dev->mtu is already decreased. 
*/ #endif - + /* This can happen when the transport layer has segments queued + * with a cached route, and by the time we get here things are + * re-routed to a device with a different MTU than the original + * device. Sick, but we must cover it. + */ if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) { struct sk_buff *skb2; - /* ANK: It is almost impossible, but - * if you loaded module device with hh_len > MAX_HEADER, - * and if a route changed to this device, - * and if (uh...) TCP had segments queued on this route... - */ - skb2 = skb_realloc_headroom(skb, (dev->hard_header_len+15)&~15); + + skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15); kfree_skb(skb); if (skb2 == NULL) return; @@ -401,40 +329,35 @@ void ip_queue_xmit(struct sk_buff *skb) iph = skb->nh.iph; } - /* - * Do we need to fragment. Again this is inefficient. - * We need to somehow lock the original buffer and use - * bits of it. + /* Do we need to fragment. Again this is inefficient. We + * need to somehow lock the original buffer and use bits of it. */ - if (tot_len > rt->u.dst.pmtu) goto fragment; #ifndef CONFIG_NET_SECURITY - /* - * Add an IP checksum - */ - + /* Add an IP checksum. */ ip_send_check(iph); #endif - - if (sk) - skb->priority = sk->priority; + skb->priority = sk->priority; skb->dst->output(skb); return; fragment: - if ((iph->frag_off & htons(IP_DF))) - { + if ((iph->frag_off & htons(IP_DF)) != 0) { printk(KERN_DEBUG "sending pkt_too_big to self\n"); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(rt->u.dst.pmtu)); goto drop; } - ip_fragment(skb, skb->dst->output); return; +no_route: + sk->dst_cache = NULL; + ip_rt_put(rt); + ip_statistics.IpOutNoRoutes++; + /* Fall through... */ drop: kfree_skb(skb); } @@ -948,14 +871,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) reply->dst = &rt->u.dst; skb_reserve(reply, (rt->u.dst.dev->hard_header_len+15)&~15); - /* - * Now build the IP header. 
- */ - - /* - * Build the IP addresses - */ - + /* Now build the IP header. */ reply->nh.iph = iph = (struct iphdr *)skb_put(reply, iphlen); iph->version = 4; @@ -966,6 +882,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = skb->nh.iph->protocol; + iph->id = htons(ip_id_count++); ip_options_build(reply, &replyopts.opt, daddr, rt, 0); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 221207205..0ea231adf 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,7 +59,7 @@ static inline void get__openreq(struct sock *sk, struct open_request *req, " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu", i, (long unsigned int)req->af.v4_req.loc_addr, - ntohs(sk->dummy_th.source), + ntohs(sk->sport), (long unsigned int)req->af.v4_req.rmt_addr, req->rmt_port, TCP_SYN_RECV, @@ -83,8 +83,8 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) dest = sp->daddr; src = sp->rcv_saddr; - destp = sp->dummy_th.dest; - srcp = sp->dummy_th.source; + destp = sp->dport; + srcp = sp->sport; /* FIXME: The fact that retransmit_timer occurs as a field * in two different parts of the socket structure is, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8ce4a95f4..464090776 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -104,6 +104,7 @@ int ip_rt_redirect_load = HZ/50; int ip_rt_redirect_silence = ((HZ/50) << (9+1)); int ip_rt_error_cost = HZ; int ip_rt_error_burst = 5*HZ; +int ip_rt_gc_elasticity = 8; static unsigned long rt_deadline = 0; @@ -398,10 +399,10 @@ static int rt_garbage_collect(void) last_gc = now; if (atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) - expire = ip_rt_gc_timeout; + expire = ip_rt_gc_timeout>>1; out: - expire >>= 1; + expire -= expire>>ip_rt_gc_elasticity; end_bh_atomic(); return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size); } @@ -1740,6 +1741,9 @@ ctl_table ipv4_route_table[] = { {NET_IPV4_ROUTE_ERROR_BURST, "error_burst", 
&ip_rt_error_burst, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity", + &ip_rt_gc_elasticity, sizeof(int), 0644, NULL, + &proc_dointvec}, {0} }; #endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 767c5d00b..da64fc186 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,7 @@ extern int sysctl_tcp_cong_avoidance; extern int sysctl_tcp_hoe_retransmits; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; extern int sysctl_tcp_keepalive_time; extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_max_ka_probes; @@ -104,6 +105,9 @@ ctl_table ipv4_table[] = { {NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling", &sysctl_tcp_window_scaling, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_TCP_SACK, "tcp_sack", + &sysctl_tcp_sack, sizeof(int), 0644, NULL, + &proc_dointvec}, {NET_IPV4_TCP_VEGAS_CONG_AVOID, "tcp_vegas_cong_avoid", &sysctl_tcp_cong_avoidance, sizeof(int), 0644, NULL, &tcp_sysctl_congavoid }, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b20df83d2..d57b7e3ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.96 1998/03/16 02:25:55 davem Exp $ + * Version: $Id: tcp.c,v 1.104 1998/03/22 22:10:30 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -668,7 +668,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags) return sock_error(sk); if((1 << sk->state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if(sk->keepopen) + if(sk->keepopen && !(flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, tsk, 0); return -EPIPE; } @@ -733,15 +733,25 @@ static void wait_for_tcp_memory(struct sock * sk) int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now = sk->mss; int err = 0; int copied = 0; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); /* Wait for a connection to finish. */ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) if((err = wait_for_tcp_connect(sk, flags)) != 0) return err; + /* The socket is locked, nothing can change the state of pending + * SACKs or IP options. + */ + if(tp->sack_ok && tp->num_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt && sk->opt->optlen) + mss_now -= (sk->opt->optlen); + /* Ok commence sending. */ while(--iovlen >= 0) { int seglen=iov->iov_len; @@ -769,22 +779,19 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) */ if (tp->send_head && !(flags & MSG_OOB)) { skb = sk->write_queue.prev; - copy = skb->tail - - ((unsigned char *)(skb->h.th) + - tp->tcp_header_len); - /* This window_seq test is somewhat dangerous - * If the remote does SWS avoidance we should + copy = skb->len; + /* If the remote does SWS avoidance we should * queue the best we can if not we should in * fact send multiple packets... - * a method for detecting this would be most - * welcome + * A method for detecting this would be most + * welcome. 
*/ if (skb_tailroom(skb) > 0 && - (sk->mss - copy) > 0 && + (mss_now - copy) > 0 && tp->snd_nxt < skb->end_seq) { - int last_byte_was_odd = (copy & 1); + int last_byte_was_odd = (copy % 4); - copy = sk->mss - copy; + copy = mss_now - copy; if(copy > skb_tailroom(skb)) copy = skb_tailroom(skb); if(copy > seglen) @@ -793,12 +800,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) if(copy_from_user(skb_put(skb, copy), from, copy)) err = -EFAULT; - skb->csum = csum_partial( - (((unsigned char *)skb->h.th) + - tp->tcp_header_len), - (skb->tail - - (((unsigned char *)skb->h.th) + - tp->tcp_header_len)), 0); + skb->csum = csum_partial(skb->data, + skb->len, 0); } else { skb->csum = csum_and_copy_from_user( @@ -810,6 +813,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) from += copy; copied += copy; seglen -= copy; + if(!seglen && !iovlen) + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; continue; } } @@ -828,18 +833,17 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) */ copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); if(copy >= (tp->max_window >> 1)) - copy = min(copy, sk->mss); + copy = min(copy, mss_now); else - copy = sk->mss; + copy = mss_now; if(copy > seglen) copy = seglen; - tmp = MAX_HEADER + sk->prot->max_header + - sizeof(struct sk_buff) + 15; + tmp = MAX_HEADER + sk->prot->max_header + 15; queue_it = 0; - if (copy < min(sk->mss, tp->max_window >> 1) && + if (copy < min(mss_now, tp->max_window >> 1) && !(flags & MSG_OOB)) { - tmp += min(sk->mss, tp->max_window); + tmp += min(mss_now, tp->max_window); /* What is happening here is that we want to * tack on later members of the users iovec @@ -869,35 +873,34 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) continue; } - /* FIXME: we need to optimize this. - * Perhaps some hints here would be good. 
- */ - tmp = tp->af_specific->build_net_header(sk, skb); - if (tmp < 0) { - kfree_skb(skb); - err = tmp; - goto do_interrupted; - } - - skb->h.th =(struct tcphdr *) - skb_put(skb,tp->tcp_header_len); - seglen -= copy; - tcp_build_header_data(skb->h.th, sk, seglen || iovlen); + /* Prepare control bits for TCP header creation engine. */ + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | + ((!seglen && !iovlen) ? + TCPCB_FLAG_PSH : 0)); + TCP_SKB_CB(skb)->sacked = 0; if (flags & MSG_OOB) { - skb->h.th->urg = 1; - skb->h.th->urg_ptr = ntohs(copy); - } - + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG; + TCP_SKB_CB(skb)->urg_ptr = copy; + } else + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* TCP data bytes are SKB_PUT() on top, later + * TCP+IP+DEV headers are SKB_PUSH()'d beneath. + * Reserve header space and checksum the data. + */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); skb->csum = csum_and_copy_from_user(from, skb_put(skb, copy), copy, 0, &err); from += copy; copied += copy; - tp->write_seq += copy; + skb->seq = tp->write_seq; + skb->end_seq = skb->seq + copy; + /* This advances tp->write_seq for us. */ tcp_send_skb(sk, skb, queue_it); } } @@ -913,7 +916,8 @@ do_sock_err: do_shutdown: if(copied) return copied; - send_sig(SIGPIPE, current, 0); + if (!(flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); return -EPIPE; do_interrupted: if(copied) @@ -1044,9 +1048,20 @@ static void cleanup_rbuf(struct sock *sk, int copied) /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". */ - if((copied > 0) && - (copied >= tcp_receive_window(&sk->tp_pinfo.af_tcp))) - tcp_read_wakeup(sk); + if(copied > 0) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 rcv_window_now = tcp_receive_window(tp); + + /* We won't be raising the window any further than + * the window-clamp allows. Our window selection + * also keeps things a nice multiple of MSS. 
These + * checks are necessary to prevent spurious ACKs + * which don't advertize a larger window. + */ + if((copied >= rcv_window_now) && + ((rcv_window_now + sk->mss) <= tp->window_clamp)) + tcp_read_wakeup(sk); + } } @@ -1319,12 +1334,8 @@ static int tcp_close_state(struct sock *sk, int dead) * that we won't make the old 4*rto = almost no time - whoops * reset mistake. */ - if(dead && ns==TCP_FIN_WAIT2) { - if(sk->timer.prev && del_timer(&sk->timer)) - add_timer(&sk->timer); - else - tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); - } + if(dead && ns == TCP_FIN_WAIT2 && !sk->timer.prev) + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); return send_fin; } @@ -1448,12 +1459,8 @@ void tcp_close(struct sock *sk, unsigned long timeout) /* Now that the socket is dead, if we are in the FIN_WAIT2 state * we may need to set up a timer. */ - if (sk->state==TCP_FIN_WAIT2) { - if(sk->timer.prev && del_timer(&sk->timer)) - add_timer(&sk->timer); - else - tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); - } + if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev) + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); sk->dead = 1; release_sock(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4b7dcc9e9..1c34e6693 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.84 1998/03/15 03:23:20 davem Exp $ + * Version: $Id: tcp_input.c,v 1.98 1998/03/23 22:54:48 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -72,9 +72,10 @@ extern int sysctl_tcp_fin_timeout; */ int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; +int sysctl_tcp_sack = 1; +int sysctl_tcp_hoe_retransmits = 1; int sysctl_tcp_cong_avoidance; -int sysctl_tcp_hoe_retransmits; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; @@ -177,7 +178,6 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp) * some modification to the RTO calculation that takes delayed * ack bais into account? This needs serious thought. -- erics */ - static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { if (tp->rto > 120*HZ) @@ -187,7 +187,6 @@ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) } /* WARNING: this must not be called if tp->saw_timestamp was false. */ - extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq) { /* From draft-ietf-tcplw-high-performance: the correct @@ -226,10 +225,7 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) return 0; } -/* - * This functions checks to see if the tcp header is actually acceptable. - */ - +/* This functions checks to see if the tcp header is actually acceptable. */ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { if (seq == tp->rcv_nxt) @@ -238,11 +234,7 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) return __tcp_sequence(tp, seq, end_seq); } -/* - * When we get a reset we do this. This probably is a tcp_output routine - * really. - */ - +/* When we get a reset we do this. */ static void tcp_reset(struct sock *sk, struct sk_buff *skb) { sk->zapped = 1; @@ -264,14 +256,36 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb) sk->state_change(sk); } -/* - * Look for tcp options. Normally only called on SYN and SYNACK packets. - * But, this can also be called on packets in the established flow when - * the fast version below fails. 
- * FIXME: surely this can be more efficient. -- erics +/* This tags the retransmission queue when SACKs arrive. */ +static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int i = nsacks; + + while(i--) { + struct sk_buff *skb = skb_peek(&sk->write_queue); + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + + while((skb != NULL) && + (skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)) { + /* We play conservative, we don't allow SACKS to partially + * tag a sequence space. + */ + if(!after(start_seq, skb->seq) && !before(end_seq, skb->end_seq)) + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + skb = skb->next; + } + sp++; /* Move on to the next SACK block. */ + } +} + +/* Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. */ - -void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) +void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); @@ -281,49 +295,68 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) while(length>0) { int opcode=*ptr++; - int opsize=*ptr++; - if (length - opsize < 0) /* Don't parse partial options */ - break; - switch(opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - ptr--; /* the opsize=*ptr++ above was a mistake */ - continue; - - default: - if(opsize<=2) /* Avoid silly options looping forever */ - return; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial 
options */ switch(opcode) { - case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn) { - tp->in_mss = ntohs(*(__u16 *)ptr); - if (tp->in_mss == 0) - tp->in_mss = 536; + case TCPOPT_MSS: + if(opsize==TCPOLEN_MSS && th->syn) { + tp->in_mss = ntohs(*(__u16 *)ptr); + if (tp->in_mss == 0) + tp->in_mss = 536; + } + break; + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn) + if (!no_fancy && sysctl_tcp_window_scaling) { + tp->wscale_ok = 1; + tp->snd_wscale = *(__u8 *)ptr; } - break; - case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn) - if (!no_fancy && sysctl_tcp_window_scaling) { - tp->wscale_ok = 1; - tp->snd_wscale = *(__u8 *)ptr; - } - break; - case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { - /* Cheaper to set again then to - * test syn. Optimize this? - */ - if (sysctl_tcp_timestamps && !no_fancy) { - tp->tstamp_ok = 1; - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*(__u32 *)ptr); - tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); - } + break; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + if (sysctl_tcp_timestamps && !no_fancy) { + tp->tstamp_ok = 1; + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } + } + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn) { + if (sysctl_tcp_sack && !no_fancy) { + tp->sack_ok = 1; + tp->num_sacks = 0; + } + } + break; + + case TCPOPT_SACK: + if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + sysctl_tcp_sack && (sk != NULL) && !th->syn) { + int sack_bytes = opsize - TCPOLEN_SACK_BASE; + + if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) { + int num_sacks = sack_bytes >> 3; + struct tcp_sack_block *sackp; + + sackp = (struct tcp_sack_block *)ptr; + tcp_sacktag_write_queue(sk, sackp, num_sacks); } - break; - } + } + }; ptr+=opsize-2; length-=opsize; }; @@ -331,13 +364,11 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) } /* Fast parse options. This hopes to only see timestamps. 
- * If it is wrong it falls back on tcp_parse_option(). - * This should probably get extended for timestamps as well. - * Assembly code anyone? -- erics + * If it is wrong it falls back on tcp_parse_options(). */ -static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp) +static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp) { - /* If we didn't send out any options ignore them all */ + /* If we didn't send out any options ignore them all. */ if (tp->tcp_header_len == sizeof(struct tcphdr)) return 0; if (th->doff == sizeof(struct tcphdr)>>2) { @@ -353,13 +384,14 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt * return 1; } } - tcp_parse_options(th,tp,0); + tcp_parse_options(sk, th, tp, 0); return 1; } -#define FLAG_DATA 0x01 -#define FLAG_WIN_UPDATE 0x02 -#define FLAG_DATA_ACKED 0x04 +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ static __inline__ void clear_fast_retransmit(struct sock *sk) { @@ -372,11 +404,9 @@ static __inline__ void clear_fast_retransmit(struct sock *sk) tp->dup_acks = 0; } -/* - * NOTE: This code assumes that tp->dup_acks gets cleared when a +/* NOTE: This code assumes that tp->dup_acks gets cleared when a * retransmit timer fires. 
*/ - static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) { struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); @@ -407,7 +437,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); tp->snd_cwnd = tp->snd_ssthresh + 3; tp->high_seq = tp->snd_nxt; - tcp_do_retransmit(sk, 0); + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } @@ -425,7 +455,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * block on duplicate fast retransmits, and if requested * we do Hoe style secondary fast retransmits. */ - if (!before(ack,tp->high_seq) || (not_dup&FLAG_DATA) != 0) { + if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) { /* Once we have acked all the packets up to high_seq * we are done this fast retransmit phase. * Alternatively data arrived. In this case we @@ -438,7 +468,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) /* After we have cleared up to high_seq we can * clear the Floyd style block. */ - if (after(ack,tp->high_seq)) + if (after(ack, tp->high_seq)) tp->high_seq = 0; } else if (tp->dup_acks >= 3) { if (sysctl_tcp_hoe_retransmits) { @@ -455,10 +485,9 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * the only way to get here without advancing * from snd_una is if this was a window update. */ - if (ack != tp->snd_una && before(ack,tp->high_seq)) { - tcp_do_retransmit(sk, 0); - tcp_reset_xmit_timer(sk, TIME_RETRANS, - tp->rto); + if (ack != tp->snd_una && before(ack, tp->high_seq)) { + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } else { /* Reno style. 
We didn't ack the whole @@ -589,9 +618,9 @@ static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) } } - -static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, - __u32 *seq_rtt) +/* Remove acknowledged frames from the retransmission queue. */ +static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, + __u32 *seq, __u32 *seq_rtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; @@ -600,8 +629,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { /* If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived the - * other end. + * discard it as it's confirmed to have arrived at + * the other end. */ if (after(skb->end_seq, ack)) break; @@ -613,26 +642,22 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if(!skb->h.th->syn) - acked = FLAG_DATA_ACKED; - - /* FIXME: packet counting may break if we have to - * do packet "repackaging" for stacks that don't - * like overlapping packets. - */ + if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { + acked |= FLAG_DATA_ACKED; + if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + acked |= FLAG_RETRANS_DATA_ACKED; + } else { + tp->retrans_head = NULL; + } tp->packets_out--; - *seq = skb->seq; *seq_rtt = now - skb->when; - skb_unlink(skb); - kfree_skb(skb); } if (acked) tp->retrans_head = NULL; - return acked; } @@ -686,41 +711,23 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { - struct sk_buff *skb; - long when; - - skb = skb_peek(&sk->write_queue); - when = tp->rto - (jiffies - skb->when); - - /* FIXME: This assumes that when we are retransmitting - * we should only ever respond with one packet. 
- * This means congestion windows should not grow - * during recovery. In 2.0.X we allow the congestion - * window to grow. It is not clear to me which - * decision is correct. The RFCs should be double - * checked as should the behavior of other stacks. - * Also note that if we do want to allow the - * congestion window to grow during retransmits - * we have to fix the call to congestion window - * updates so that it works during retransmission. + struct sk_buff *skb = skb_peek(&sk->write_queue); + long when = tp->rto - (jiffies - skb->when); + + /* Some data was ACK'd, if still retransmitting (due to a + * timeout), resend more of the retransmit queue. The + * congestion window is handled properly by that code. */ if (tp->retransmits) { tp->retrans_head = NULL; - - /* This is tricky. We are retransmiting a - * segment of a window when congestion occured. - */ - tcp_do_retransmit(sk, 0); + tcp_xmit_retransmit_queue(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } else { tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } } -/* - * This routine deals with incoming acks, but not outgoing ones. - */ - +/* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack_seq, u32 ack, int len) { @@ -805,7 +812,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, * where the network delay has increased suddenly. * I.e. Karn's algorithm. (SIGCOMM '87, p5.) */ - if (flag & FLAG_DATA_ACKED) { + if ((flag & FLAG_DATA_ACKED) && + !(flag & FLAG_RETRANS_DATA_ACKED)) { tp->backoff = 0; tcp_rtt_estimator(tp, seq_rtt); tcp_set_rto(tp); @@ -923,9 +931,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, } else { if(th->ack) { /* In this case we must reset the TIMEWAIT timer. */ - del_timer(&tw->timer); - tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN; - add_timer(&tw->timer); + mod_timer(&tw->timer, jiffies + TCP_TIMEWAIT_LEN); } } return 0; /* Discard the frame. 
*/ @@ -981,9 +987,10 @@ void tcp_time_wait(struct sock *sk) tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; + tw->sport = sk->sport; + tw->dport = sk->dport; tw->family = sk->family; - tw->source = sk->dummy_th.source; - tw->dest = sk->dummy_th.dest; + tw->reuse = sk->reuse; tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; @@ -1098,6 +1105,175 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) }; } +/* These routines update the SACK block as out-of-order packets arrive or + * in-order packets close up the sequence space. + */ +static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp) +{ + int this_sack, num_sacks = tp->num_sacks; + struct tcp_sack_block *swalk = &tp->selective_acks[0]; + + /* If more than one SACK block, see if the recent change to SP eats into + * or hits the sequence space of other SACK blocks, if so coalesce. + */ + if(num_sacks != 1) { + for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) { + if(swalk == sp) + continue; + + /* First case, bottom of SP moves into top of the + * sequence space of SWALK. + */ + if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) { + sp->start_seq = swalk->start_seq; + goto coalesce; + } + /* Second case, top of SP moves into bottom of the + * sequence space of SWALK. + */ + if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) { + sp->end_seq = swalk->end_seq; + goto coalesce; + } + } + } + /* SP is the only SACK, or no coalescing cases found. */ + return; + +coalesce: + /* Zap SWALK, by moving every further SACK up by one slot. + * Decrease num_sacks. 
+ */ + for(this_sack += 1; this_sack < num_sacks; this_sack++, swalk++) { + struct tcp_sack_block *next = (swalk + 1); + swalk->start_seq = next->start_seq; + swalk->end_seq = next->end_seq; + } + tp->num_sacks--; +} + +static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +{ + __u32 tmp; + + tmp = sack1->start_seq; + sack1->start_seq = sack2->start_seq; + sack2->start_seq = tmp; + + tmp = sack1->end_seq; + sack1->end_seq = sack2->end_seq; + sack2->end_seq = tmp; +} + +static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_sack_block *sp = &tp->selective_acks[0]; + + /* Optimize for the common case, new ofo frames arrive + * "in order". ;-) This also satisfies the requirements + * of RFC2018 about ordering of SACKs. + */ + if(sp->end_seq == skb->seq) { + sp->end_seq = skb->end_seq; + tcp_sack_maybe_coalesce(tp, sp); + } else if(sp->start_seq == skb->end_seq) { + /* Re-ordered arrival, in this case, can be optimized + * as well. + */ + sp->start_seq = skb->seq; + tcp_sack_maybe_coalesce(tp, sp); + } else { + int cur_sacks = tp->num_sacks; + int max_sacks = (tp->tstamp_ok ? 3 : 4); + + /* Oh well, we have to move things around. + * Try to find a SACK we can tack this onto. + */ + if(cur_sacks > 1) { + struct tcp_sack_block *swap = sp + 1; + int this_sack; + + for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { + if((swap->end_seq == skb->seq) || + (swap->start_seq == skb->end_seq)) { + if(swap->end_seq == skb->seq) + swap->end_seq = skb->end_seq; + else + swap->start_seq = skb->seq; + tcp_sack_swap(sp, swap); + tcp_sack_maybe_coalesce(tp, sp); + return; + } + } + } + + /* Could not find an adjacent existing SACK, build a new one, + * put it at the front, and shift everyone else down. We + * always know there is at least one SACK present already here. 
+ */ + while(cur_sacks >= 1) { + struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; + struct tcp_sack_block *prev = (this - 1); + this->start_seq = prev->start_seq; + this->end_seq = prev->end_seq; + cur_sacks--; + } + + /* Build head SACK, and we're done. */ + sp->start_seq = skb->seq; + sp->end_seq = skb->end_seq; + if(tp->num_sacks < max_sacks) + tp->num_sacks++; + } +} + +static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->num_sacks; + int this_sack; + + /* We know this removed SKB will eat from the front of a SACK. */ + for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { + if(sp->start_seq == skb->seq) + break; + } + + /* This should only happen if so many SACKs get built that some get + * pushed out before we get here, or we eat some in sequence packets + * which are before the first SACK block. + */ + if(this_sack >= num_sacks) + return; + + sp->start_seq = skb->end_seq; + if(!before(sp->start_seq, sp->end_seq)) { + /* Zap this SACK, by moving forward any other SACKS. */ + for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { + struct tcp_sack_block *next = (sp + 1); + sp->start_seq = next->start_seq; + sp->end_seq = next->end_seq; + } + tp->num_sacks--; + } +} + +static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->num_sacks; + int this_sack; + + for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { /* sp steps through the SACK blocks */ + if(sp->end_seq == old_skb->end_seq) + break; + } + if(this_sack >= num_sacks) + return; + sp->end_seq = new_skb->end_seq; +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. 
*/ @@ -1119,6 +1295,8 @@ static void tcp_ofo_queue(struct sock *sk) SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); + if(tp->sack_ok) + tcp_sack_remove_skb(tp, skb); skb_unlink(skb); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; @@ -1142,13 +1320,23 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) dst_confirm(sk->dst_cache); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; - if(skb->h.th->fin) + if(skb->h.th->fin) { tcp_fin(skb, sk, skb->h.th); - else + } else { tp->delayed_acks++; + + /* Tiny-grams with PSH set make us ACK quickly. */ + if(skb->h.th->psh && (skb->len < (sk->mss >> 1))) + tp->ato = HZ/50; + } + /* This may have eaten into a SACK block. */ + if(tp->sack_ok && tp->num_sacks) + tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); + tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | + (0x10 << 16) | + tp->snd_wnd); return; } @@ -1180,25 +1368,44 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) tp->rcv_nxt, skb->seq, skb->end_seq); if (skb_peek(&tp->out_of_order_queue) == NULL) { + /* Initial out of order segment, build 1 SACK. */ + if(tp->sack_ok) { + tp->num_sacks = 1; + tp->selective_acks[0].start_seq = skb->seq; + tp->selective_acks[0].end_seq = skb->end_seq; + } skb_queue_head(&tp->out_of_order_queue,skb); } else { for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { /* Already there. */ - if (skb->seq == skb1->seq && skb->len >= skb1->len) { - skb_append(skb1, skb); - skb_unlink(skb1); - kfree_skb(skb1); + if (skb->seq == skb1->seq) { + if (skb->len >= skb1->len) { + if(tp->sack_ok) + tcp_sack_extend(tp, skb1, skb); + skb_append(skb1, skb); + skb_unlink(skb1); + kfree_skb(skb1); + } else { + /* A duplicate, smaller than what is in the + * out-of-order queue right now, toss it. 
+ */ + kfree_skb(skb); + } break; } if (after(skb->seq, skb1->seq)) { skb_append(skb1,skb); + if(tp->sack_ok) + tcp_sack_new_ofo_skb(sk, skb); break; } /* See if we've hit the start. If so insert. */ if (skb1 == skb_peek(&tp->out_of_order_queue)) { skb_queue_head(&tp->out_of_order_queue,skb); + if(tp->sack_ok) + tcp_sack_new_ofo_skb(sk, skb); break; } } @@ -1244,8 +1451,8 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) static void tcp_data_snd_check(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); if ((skb = tp->send_head)) { if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && @@ -1273,6 +1480,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) * - delay time <= 0.5 HZ * - we don't have a window update to send * - must send at least every 2 full sized packets + * - must send an ACK if we have any SACKs * * With an extra heuristic to handle loss of packet * situations and also helping the sender leave slow @@ -1283,8 +1491,10 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) || /* We will update the window "significantly" or... */ tcp_raise_window(sk) || - /* We entered "quick ACK" mode */ - tcp_in_quickack_mode(tp)) { + /* We entered "quick ACK" mode or... */ + tcp_in_quickack_mode(tp) || + /* We have pending SACKs */ + (tp->sack_ok && tp->num_sacks)) { /* Then ack it now */ tcp_send_ack(sk); } else { @@ -1446,7 +1656,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* * RFC1323: H1. Apply PAWS check first. 
*/ - if (tcp_fast_parse_options(th,tp)) { + if (tcp_fast_parse_options(sk, th, tp)) { if (tp->saw_tstamp) { if (tcp_paws_discard(tp)) { if (!th->rst) { @@ -1460,10 +1670,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, flg = *(((u32 *)th) + 3); - /* - * pred_flags is 0x5?10 << 16 + snd_wnd + /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_predition is to be made - * ? will be 0 else it will be !0 + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 else it will be !0 * (when there are holes in the receive * space for instance) */ @@ -1498,6 +1708,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ sk->data_ready(sk, 0); tcp_delack_estimator(tp); + + /* Tiny-grams with PSH set make us ACK quickly. */ + if(th->psh && (skb->len < (sk->mss >> 1))) + tp->ato = HZ/50; + tp->delayed_acks++; __tcp_ack_snd_check(sk); return 0; @@ -1703,7 +1918,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->fin_seq = skb->seq; tcp_set_state(sk, TCP_ESTABLISHED); - tcp_parse_options(th,tp,0); + tcp_parse_options(sk, th, tp, 0); if (tp->wscale_ok == 0) { tp->snd_wscale = tp->rcv_wscale = 0; @@ -1712,7 +1927,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (tp->tstamp_ok) { tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - sk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2); } else tp->tcp_header_len = sizeof(struct tcphdr); if (tp->saw_tstamp) { @@ -1745,7 +1959,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, sk->mss = min(sk->mss, real_mss); } - sk->dummy_th.dest = th->source; + sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; if(!sk->dead) { @@ -1763,7 +1977,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * tcp_connect. 
*/ tcp_set_state(sk, TCP_SYN_RECV); - tcp_parse_options(th,tp,0); + tcp_parse_options(sk, th, tp, 0); if (tp->saw_tstamp) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = jiffies; @@ -1788,7 +2002,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Note that this really has to be here and not later for PAWS * (RFC1323) to work. */ - if (tcp_fast_parse_options(th,tp)) { + if (tcp_fast_parse_options(sk, th, tp)) { /* NOTE: assumes saw_tstamp is never set if we didn't * negotiate the option. tcp_fast_parse_options() must * guarantee this. @@ -1849,7 +2063,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_SYN_RECV: if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); - sk->dummy_th.dest=th->source; + sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; if(!sk->dead) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 91f21ff75..ee53f47d6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.109 1998/03/15 07:24:15 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.119 1998/03/22 19:14:47 davem Exp $ * * IPv4 specific functions * @@ -62,16 +62,12 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; extern int sysctl_tcp_syncookies; extern int sysctl_ip_dynaddr; /* Check TCP sequence numbers in ICMP packets. 
*/ -#define ICMP_PARANOIA 1 -#ifndef ICMP_PARANOIA -#define ICMP_MIN_LENGTH 4 -#else #define ICMP_MIN_LENGTH 8 -#endif static void tcp_v4_send_reset(struct sk_buff *skb); @@ -120,7 +116,7 @@ static __inline__ int tcp_sk_hashfn(struct sock *sk) __u32 laddr = sk->rcv_saddr; __u16 lport = sk->num; __u32 faddr = sk->daddr; - __u16 fport = sk->dummy_th.dest; + __u16 fport = sk->dport; return tcp_hashfn(laddr, lport, faddr, fport); } @@ -365,7 +361,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, sk = TCP_RHASH(sport); if(sk && sk->daddr == saddr && /* remote address */ - sk->dummy_th.dest == sport && /* remote port */ + sk->dport == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ (!sk->bound_dev_if || sk->bound_dev_if == dif)) @@ -377,7 +373,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, hash = tcp_hashfn(daddr, hnum, saddr, sport); for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { if(sk->daddr == saddr && /* remote address */ - sk->dummy_th.dest == sport && /* remote port */ + sk->dport == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ (!sk->bound_dev_if || sk->bound_dev_if == dif)) { @@ -389,7 +385,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) { if(sk->daddr == saddr && /* remote address */ - sk->dummy_th.dest == sport && /* remote port */ + sk->dport == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ (!sk->bound_dev_if || sk->bound_dev_if == dif)) @@ -456,8 +452,8 @@ pass2: continue; score++; } - if(s->dummy_th.dest) { - if(s->dummy_th.dest != rnum) + if(s->dport) { + if(s->dport != rnum) continue; score++; } @@ -496,12 +492,7 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } -/* - * From tcp.c - */ - -/* - * Check that a TCP address is unique, don't allow multiple +/* Check that a TCP address is unique, don't allow multiple * connects to/from the same address. Actually we can optimize * quite a bit, since the socket about to connect is still * in TCP_CLOSE, a tcp_bind_bucket for the local port he will @@ -509,8 +500,7 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) * The good_socknum and verify_bind scheme we use makes this * work. */ - -static int tcp_unique_address(struct sock *sk) +static int tcp_v4_unique_address(struct sock *sk) { struct tcp_bind_bucket *tb; unsigned short snum = sk->num; @@ -524,7 +514,7 @@ static int tcp_unique_address(struct sock *sk) /* Almost certainly the re-use port case, search the real hashes * so it actually scales. */ - sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dummy_th.dest, + sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport, sk->rcv_saddr, snum, sk->bound_dev_if); if((sk != NULL) && (sk->state != TCP_LISTEN)) retval = 0; @@ -535,19 +525,15 @@ static int tcp_unique_address(struct sock *sk) return retval; } - -/* - * This will initiate an outgoing connection. - */ - +/* This will initiate an outgoing connection. 
*/ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sk_buff *buff; - int tmp; - struct tcphdr *th; - struct rtable *rt; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct sk_buff *buff; + struct rtable *rt; + int tmp; + int mss; if (sk->state != TCP_CLOSE) return(-EISCONN); @@ -567,8 +553,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm); } - dst_release(xchg(&sk->dst_cache, NULL)); - tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if); if (tmp < 0) @@ -579,143 +563,52 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -ENETUNREACH; } - if (!tcp_unique_address(sk)) { - ip_rt_put(rt); - return -EADDRNOTAVAIL; - } - - lock_sock(sk); + dst_release(xchg(&sk->dst_cache, rt)); - /* Do this early, so there is less state to unwind on failure. */ - buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)), + buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), 0, GFP_KERNEL); - if (buff == NULL) { - release_sock(sk); - ip_rt_put(rt); - return(-ENOBUFS); - } - sk->dst_cache = &rt->u.dst; + if (buff == NULL) + return -ENOBUFS; + + /* Socket has no identity, so lock_sock() is useless. Also + * since state==TCP_CLOSE (checked above) the socket cannot + * possibly be in the hashes. TCP hash locking is only + * needed while checking quickly for a unique address. + * However, the socket does need to be (and is) locked + * in tcp_connect(). + * Perhaps this addresses all of ANK's concerns. 
8-) -DaveM + */ + sk->dport = usin->sin_port; sk->daddr = rt->rt_dst; if (!sk->saddr) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; - if (sk->priority == 0) - sk->priority = rt->u.dst.priority; - - sk->dummy_th.dest = usin->sin_port; - - tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, - sk->dummy_th.source, - usin->sin_port); - tp->snd_wnd = 0; - tp->snd_wl1 = 0; - tp->snd_wl2 = tp->write_seq; - tp->snd_una = tp->write_seq; - tp->rcv_nxt = 0; - - sk->err = 0; - - /* Put in the IP header and routing stuff. */ - tmp = ip_build_header(buff, sk); - if (tmp < 0) { - /* Caller has done ip_rt_put(rt) and set sk->dst_cache - * to NULL. We must unwind the half built TCP socket - * state so that this failure does not create a "stillborn" - * sock (ie. future re-tries of connect() would fail). - */ - sk->daddr = 0; - sk->saddr = sk->rcv_saddr = 0; + if (!tcp_v4_unique_address(sk)) { kfree_skb(buff); - release_sock(sk); - return(-ENETUNREACH); + return -EADDRNOTAVAIL; } - /* No failure conditions can result past this point. */ - - /* We'll fix this up when we get a response from the other end. - * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. - */ - tp->tcp_header_len = sizeof(struct tcphdr) + - (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); - - th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); - buff->h.th = th; - - memcpy(th,(void *)&(sk->dummy_th), sizeof(*th)); - /* th->doff gets fixed up below if we tack on options. 
*/ - - buff->seq = tp->write_seq++; - th->seq = htonl(buff->seq); - tp->snd_nxt = tp->write_seq; - buff->end_seq = tp->write_seq; - th->ack = 0; - th->syn = 1; - sk->mtu = rt->u.dst.pmtu; if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT || (sk->ip_pmtudisc == IP_PMTUDISC_WANT && (rt->u.dst.mxlock&(1<<RTAX_MTU)))) && - rt->u.dst.pmtu > 576) + rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) sk->mtu = 576; - if(sk->mtu < 64) + if (sk->mtu < 64) sk->mtu = 64; /* Sanity limit */ - sk->mss = (sk->mtu - sizeof(struct iphdr) - tp->tcp_header_len); - if(sk->user_mss) - sk->mss = min(sk->mss, sk->user_mss); - - if (sk->mss < 1) { - printk(KERN_DEBUG "intial sk->mss below 1\n"); - sk->mss = 1; /* Sanity limit */ - } - - tp->window_clamp = rt->u.dst.window; - tcp_select_initial_window(sock_rspace(sk)/2,sk->mss, - &tp->rcv_wnd, - &tp->window_clamp, - sysctl_tcp_window_scaling, - &tp->rcv_wscale); - th->window = htons(tp->rcv_wnd); - - tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps, - sysctl_tcp_window_scaling, tp->rcv_wscale); - buff->csum = 0; - th->doff = (sizeof(*th)+ tmp)>>2; - - tcp_v4_send_check(sk, th, sizeof(struct tcphdr) + tmp, buff); - - tcp_set_state(sk,TCP_SYN_SENT); + mss = sk->mtu - sizeof(struct iphdr); + if (sk->opt) + mss -= sk->opt->optlen; - /* Socket identity change complete, no longer - * in TCP_CLOSE, so enter ourselves into the - * hash tables. - */ - tcp_v4_hash(sk); - - tp->rto = rt->u.dst.rtt; - - tcp_init_xmit_timers(sk); - - /* Now works the right way instead of a hacked initial setting. */ - tp->retransmits = 0; - - skb_queue_tail(&sk->write_queue, buff); - - tp->packets_out++; - buff->when = jiffies; - - ip_queue_xmit(skb_clone(buff, GFP_KERNEL)); + tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, + sk->sport, usin->sin_port); - /* Timer for repeating the SYN until an answer. 
*/ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - tcp_statistics.TcpActiveOpens++; - tcp_statistics.TcpOutSegs++; - - release_sock(sk); - return(0); + tcp_connect(sk, buff, mss); + return 0; } static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) @@ -724,7 +617,7 @@ static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) int retval = -EINVAL; /* Do sanity checking for sendmsg/sendto/send. */ - if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT)) + if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) goto out; if (msg->msg_name) { struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name; @@ -737,7 +630,7 @@ static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) if(sk->state == TCP_CLOSE) goto out; retval = -EISCONN; - if (addr->sin_port != sk->dummy_th.dest) + if (addr->sin_port != sk->dport) goto out; if (addr->sin_addr.s_addr != sk->daddr) goto out; @@ -851,9 +744,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) int code = skb->h.icmph->code; struct sock *sk; int opening; -#ifdef ICMP_PARANOIA __u32 seq; -#endif if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) { icmp_statistics.IcmpInErrors++; @@ -869,7 +760,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) } tp = &sk->tp_pinfo.af_tcp; -#ifdef ICMP_PARANOIA seq = ntohl(th->seq); if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, max(tp->snd_una+32768,tp->snd_nxt))) { @@ -879,7 +769,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) (int)sk->state, seq, tp->snd_una, tp->snd_nxt); return; } -#endif switch (type) { case ICMP_SOURCE_QUENCH: @@ -927,7 +816,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) req = tcp_v4_search_req(tp, iph, th, &prev); if (!req) return; -#ifdef ICMP_PARANOIA if (seq != req->snt_isn) { if (net_ratelimit()) printk(KERN_DEBUG "icmp packet for openreq " @@ -935,7 +823,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char 
*dp, int len) seq, req->snt_isn); return; } -#endif if (req->sk) { /* not yet accept()ed */ sk = req->sk; /* report error in accept */ } else { @@ -987,44 +874,50 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, static void tcp_v4_send_reset(struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; - struct sk_buff *skb1; - struct tcphdr *th1; + struct tcphdr *th = skb->h.th; - if (th->rst) - return; + /* Never send a reset in response to a reset. */ + if (th->rst == 0) { + struct tcphdr *th = skb->h.th; + struct sk_buff *skb1 = ip_reply(skb, sizeof(struct tcphdr)); + struct tcphdr *th1; - skb1 = ip_reply(skb, sizeof(struct tcphdr)); - if (skb1 == NULL) - return; + if (skb1 == NULL) + return; - skb1->h.th = th1 = (struct tcphdr *)skb_put(skb1, sizeof(struct tcphdr)); - memset(th1, 0, sizeof(*th1)); - - /* Swap the send and the receive. */ - th1->dest = th->source; - th1->source = th->dest; - th1->doff = sizeof(*th1)/4; - th1->rst = 1; - - if (th->ack) - th1->seq = th->ack_seq; - else { - th1->ack = 1; - if (!th->syn) - th1->ack_seq = th->seq; - else - th1->ack_seq = htonl(ntohl(th->seq)+1); - } + skb1->h.th = th1 = (struct tcphdr *) + skb_put(skb1, sizeof(struct tcphdr)); + + /* Swap the send and the receive. */ + memset(th1, 0, sizeof(*th1)); + th1->dest = th->source; + th1->source = th->dest; + th1->doff = sizeof(*th1)/4; + th1->rst = 1; + + if (th->ack) { + th1->seq = th->ack_seq; + } else { + th1->ack = 1; + if (!th->syn) + th1->ack_seq = th->seq; + else + th1->ack_seq = htonl(ntohl(th->seq)+1); + } + skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0); + th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr, + skb1->nh.iph->daddr, skb1->csum); - skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0); - th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr, - skb1->nh.iph->daddr, skb1->csum); + /* Finish up some IP bits. 
*/ + skb1->nh.iph->tot_len = htons(skb1->len); + ip_send_check(skb1->nh.iph); - /* Do not place TCP options in a reset. */ - ip_queue_xmit(skb1); - tcp_statistics.TcpOutSegs++; - tcp_statistics.TcpOutRsts++; + /* All the other work was done by ip_reply(). */ + skb1->dst->output(skb1); + + tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; + } } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -1055,82 +948,48 @@ int tcp_chkaddr(struct sk_buff *skb) static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) { + struct rtable *rt; + struct ip_options *opt; struct sk_buff * skb; - struct tcphdr *th; - int tmp; int mss; - skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC); - if (skb == NULL) + /* First, grab a route. */ + opt = req->af.v4_req.opt; + if(ip_route_output(&rt, ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + req->af.v4_req.loc_addr, + RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if)) { + ip_statistics.IpOutNoRoutes++; return; - - if(ip_build_pkt(skb, sk, req->af.v4_req.loc_addr, - req->af.v4_req.rmt_addr, req->af.v4_req.opt) < 0) { - kfree_skb(skb); + } + if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + ip_statistics.IpOutNoRoutes++; return; } - - mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); - if (sk->user_mss) - mss = min(mss, sk->user_mss); - if(req->tstamp_ok) - mss -= TCPOLEN_TSTAMP_ALIGNED; - else - req->mss += TCPOLEN_TSTAMP_ALIGNED; - /* tcp_syn_build_options will do an skb_put() to obtain the TCP - * options bytes below. - */ - skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr)); + mss = (rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); + if (opt) + mss -= opt->optlen; - /* Don't offer more than they did. - * This way we don't have to memorize who said what. - * FIXME: maybe this should be changed for better performance - * with syncookies. 
- */ - req->mss = min(mss, req->mss); + skb = tcp_make_synack(sk, &rt->u.dst, req, mss); + if (skb) { + struct tcphdr *th = skb->h.th; - if (req->mss < 1) { - printk(KERN_DEBUG "initial req->mss below 1\n"); - req->mss = 1; - } - - /* Yuck, make this header setup more efficient... -DaveM */ - memset(th, 0, sizeof(struct tcphdr)); - th->syn = 1; - th->ack = 1; #ifdef CONFIG_IP_TRANSPARENT_PROXY - th->source = req->lcl_port; /* LVE */ -#else - th->source = sk->dummy_th.source; + th->source = req->lcl_port; /* LVE */ #endif - th->dest = req->rmt_port; - skb->seq = req->snt_isn; - skb->end_seq = skb->seq + 1; - th->seq = htonl(skb->seq); - th->ack_seq = htonl(req->rcv_isn + 1); - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = skb->dst->window; - tcp_select_initial_window(sock_rspace(sk)/2,req->mss, - &req->rcv_wnd, - &req->window_clamp, - req->wscale_ok, - &rcv_wscale); - req->rcv_wscale = rcv_wscale; + + th->check = tcp_v4_check(th, skb->len, + req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, + csum_partial((char *)th, skb->len, skb->csum)); + + ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, req->af.v4_req.opt); } - th->window = htons(req->rcv_wnd); - tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok, - req->wscale_ok,req->rcv_wscale); - skb->csum = 0; - th->doff = (sizeof(*th) + tmp)>>2; - th->check = tcp_v4_check(th, sizeof(*th) + tmp, - req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, - csum_partial((char *)th, sizeof(*th)+tmp, skb->csum)); - - ip_queue_xmit(skb); - tcp_statistics.TcpOutSegs++; + ip_rt_put(rt); } static void tcp_v4_or_free(struct open_request *req) @@ -1240,15 +1099,16 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rcv_wnd = 0; /* So that tcp_send_synack() knows! 
*/ req->rcv_isn = skb->seq; - tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0; + tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; tp.in_mss = 536; - tcp_parse_options(th,&tp,want_cookie); + tcp_parse_options(NULL, th, &tp, want_cookie); req->mss = tp.in_mss; if (tp.saw_tstamp) { req->mss -= TCPOLEN_TSTAMP_ALIGNED; req->ts_recent = tp.rcv_tsval; } req->tstamp_ok = tp.tstamp_ok; + req->sack_ok = tp.sack_ok; req->snd_wscale = tp.snd_wscale; req->wscale_ok = tp.wscale_ok; req->rmt_port = th->source; @@ -1300,8 +1160,11 @@ error: /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM + * + * This function wants to be moved to a common for IPv[46] file. --ANK */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb, + int snd_mss) { struct sock *newsk = sk_alloc(AF_INET, GFP_ATOMIC, 0); @@ -1310,27 +1173,16 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, memcpy(newsk, sk, sizeof(*newsk)); newsk->sklist_next = NULL; - newsk->daddr = req->af.v4_req.rmt_addr; - newsk->rcv_saddr = req->af.v4_req.loc_addr; -#ifdef CONFIG_IP_TRANSPARENT_PROXY - newsk->num = ntohs(skb->h.th->dest); -#endif newsk->state = TCP_SYN_RECV; /* Clone the TCP header template */ -#ifdef CONFIG_IP_TRANSPARENT_PROXY - newsk->dummy_th.source = req->lcl_port; -#endif - newsk->dummy_th.dest = req->rmt_port; - newsk->dummy_th.ack = 1; - newsk->dummy_th.doff = sizeof(struct tcphdr)>>2; + newsk->dport = req->rmt_port; newsk->sock_readers = 0; atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); atomic_set(&newsk->wmem_alloc, 0); skb_queue_head_init(&newsk->write_queue); - newsk->saddr = req->af.v4_req.loc_addr; newsk->done = 0; newsk->proc = 0; @@ -1395,12 +1247,40 @@ struct sock 
*tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->priority = 1; /* IP layer stuff */ - newsk->opt = req->af.v4_req.opt; newsk->timeout = 0; init_timer(&newsk->timer); newsk->timer.function = &net_timer; newsk->timer.data = (unsigned long) newsk; newsk->socket = NULL; + + newtp->tstamp_ok = req->tstamp_ok; + if((newtp->sack_ok = req->sack_ok) != 0) + newtp->num_sacks = 0; + newtp->window_clamp = req->window_clamp; + newtp->rcv_wnd = req->rcv_wnd; + newtp->wscale_ok = req->wscale_ok; + if (newtp->wscale_ok) { + newtp->snd_wscale = req->snd_wscale; + newtp->rcv_wscale = req->rcv_wscale; + } else { + newtp->snd_wscale = newtp->rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp,65535); + } + if (newtp->tstamp_ok) { + newtp->ts_recent = req->ts_recent; + newtp->ts_recent_stamp = jiffies; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->tcp_header_len = sizeof(struct tcphdr); + } + + snd_mss -= newtp->tcp_header_len; + + if (sk->user_mss) + snd_mss = min(snd_mss, sk->user_mss); + + newsk->mss = min(req->mss, snd_mss); + } return newsk; } @@ -1409,77 +1289,58 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) { + struct ip_options *opt = req->af.v4_req.opt; struct tcp_opt *newtp; struct sock *newsk; int snd_mss; + int mtu; #ifdef NEW_LISTEN if (sk->ack_backlog > sk->max_ack_backlog) goto exit; /* head drop */ #endif - newsk = tcp_create_openreq_child(sk, req, skb); - if (!newsk) - goto exit; -#ifdef NEW_LISTEN - sk->ack_backlog++; -#endif - - newtp = &(newsk->tp_pinfo.af_tcp); - - /* options / mss / route_cache */ if (dst == NULL) { struct rtable *rt; if (ip_route_output(&rt, - newsk->opt && newsk->opt->srr ? - newsk->opt->faddr : newsk->daddr, - newsk->saddr, newsk->ip_tos|RTO_CONN, 0)) { - sk_free(newsk); + opt && opt->srr ? 
opt->faddr : req->af.v4_req.rmt_addr, + req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0)) return NULL; - } dst = &rt->u.dst; - } - newsk->dst_cache = dst; - - snd_mss = dst->pmtu; - - /* FIXME: is mtu really the same as snd_mss? */ - newsk->mtu = snd_mss; - /* FIXME: where does mtu get used after this? */ - /* sanity check */ - if (newsk->mtu < 64) - newsk->mtu = 64; - - newtp->tstamp_ok = req->tstamp_ok; - newtp->window_clamp = req->window_clamp; - newtp->rcv_wnd = req->rcv_wnd; - newtp->wscale_ok = req->wscale_ok; - if (newtp->wscale_ok) { - newtp->snd_wscale = req->snd_wscale; - newtp->rcv_wscale = req->rcv_wscale; - } else { - newtp->snd_wscale = newtp->rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp,65535); - } - if (newtp->tstamp_ok) { - newtp->ts_recent = req->ts_recent; - newtp->ts_recent_stamp = jiffies; - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2); - } else { - newtp->tcp_header_len = sizeof(struct tcphdr); } - snd_mss -= sizeof(struct iphdr) + sizeof(struct tcphdr); - if (sk->user_mss) - snd_mss = min(snd_mss, sk->user_mss); +#ifdef NEW_LISTEN + sk->ack_backlog++; +#endif + + mtu = dst->pmtu; + if (mtu < 68) + mtu = 68; + snd_mss = mtu - sizeof(struct iphdr); + if (opt) + snd_mss -= opt->optlen; - /* Make sure our mtu is adjusted for headers. */ - newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len; + newsk = tcp_create_openreq_child(sk, req, skb, snd_mss); + if (!newsk) + goto exit; + + newsk->dst_cache = dst; + + newtp = &(newsk->tp_pinfo.af_tcp); + newsk->daddr = req->af.v4_req.rmt_addr; + newsk->saddr = req->af.v4_req.loc_addr; + newsk->rcv_saddr = req->af.v4_req.loc_addr; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + newsk->num = ntohs(skb->h.th->dest); + newsk->sport = req->lcl_port; +#endif + newsk->opt = req->af.v4_req.opt; + newsk->mtu = mtu; /* Must use the af_specific ops here for the case of IPv6 mapped. 
*/ newsk->prot->hash(newsk); add_to_prot_sklist(newsk); + return newsk; exit: @@ -1677,106 +1538,82 @@ do_time_wait: goto discard_it; } -int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb) -{ - return ip_build_header(skb, sk); -} - -int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) +int tcp_v4_rebuild_header(struct sock *sk) { - struct rtable *rt; - struct iphdr *iph; - struct tcphdr *th; - int size; + struct rtable *rt = (struct rtable *)sk->dst_cache; + __u32 new_saddr; int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT; - /* Check route */ - - rt = (struct rtable*)skb->dst; + if(rt == NULL) + return 0; - /* Force route checking if want_rewrite */ - /* The idea is good, the implementation is disguisting. - Well, if I made bind on this socket, you cannot randomly ovewrite - its source address. --ANK + /* Force route checking if want_rewrite. + * The idea is good, the implementation is disguisting. + * Well, if I made bind on this socket, you cannot randomly ovewrite + * its source address. 
--ANK */ if (want_rewrite) { int tmp; + struct rtable *new_rt; __u32 old_saddr = rt->rt_src; - /* Query new route */ - tmp = ip_route_connect(&rt, rt->rt_dst, 0, + /* Query new route using another rt buffer */ + tmp = ip_route_connect(&new_rt, rt->rt_dst, 0, RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if); /* Only useful if different source addrs */ - if (tmp == 0 || rt->rt_src != old_saddr ) { - dst_release(skb->dst); - skb->dst = &rt->u.dst; - } else { - want_rewrite = 0; - dst_release(&rt->u.dst); + if (tmp == 0) { + /* + * Only useful if different source addrs + */ + if (new_rt->rt_src != old_saddr ) { + dst_release(sk->dst_cache); + sk->dst_cache = &new_rt->u.dst; + rt = new_rt; + goto do_rewrite; + } + dst_release(&new_rt->u.dst); } - } else + } if (rt->u.dst.obsolete) { int err; err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif); if (err) { sk->err_soft=-err; - sk->error_report(skb->sk); + sk->error_report(sk); return -1; } - dst_release(skb->dst); - skb->dst = &rt->u.dst; + dst_release(xchg(&sk->dst_cache, &rt->u.dst)); } - iph = skb->nh.iph; - th = skb->h.th; - size = skb->tail - skb->h.raw; + return 0; - if (want_rewrite) { - __u32 new_saddr = rt->rt_src; +do_rewrite: + new_saddr = rt->rt_src; - /* - * Ouch!, this should not happen. - */ - if (!sk->saddr || !sk->rcv_saddr) { - printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: saddr=%08lX rcv_saddr=%08lX\n", - ntohl(sk->saddr), - ntohl(sk->rcv_saddr)); - return 0; - } - - /* - * Maybe whe are in a skb chain loop and socket address has - * yet been 'damaged'. - */ - - if (new_saddr != sk->saddr) { - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - NIPQUAD(sk->saddr), - NIPQUAD(new_saddr)); - } + /* Ouch!, this should not happen. 
*/ + if (!sk->saddr || !sk->rcv_saddr) { + printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: " + "saddr=%08lX rcv_saddr=%08lX\n", + ntohl(sk->saddr), + ntohl(sk->rcv_saddr)); + return 0; + } - sk->saddr = new_saddr; - sk->rcv_saddr = new_saddr; - /* sk->prot->rehash(sk); */ - tcp_v4_rehash(sk); - } - - if (new_saddr != iph->saddr) { - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "tcp_v4_rebuild_header(): shifting iph->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - NIPQUAD(iph->saddr), - NIPQUAD(new_saddr)); - } + if (new_saddr != sk->saddr) { + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr " + "from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(sk->saddr), + NIPQUAD(new_saddr)); + } - iph->saddr = new_saddr; - ip_send_check(iph); - } + sk->saddr = new_saddr; + sk->rcv_saddr = new_saddr; + tcp_v4_rehash(sk); + } - } - return 0; } @@ -1792,11 +1629,10 @@ static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin->sin_family = AF_INET; sin->sin_addr.s_addr = sk->daddr; - sin->sin_port = sk->dummy_th.dest; + sin->sin_port = sk->dport; } struct tcp_func ipv4_specific = { - tcp_v4_build_header, ip_queue_xmit, tcp_v4_send_check, tcp_v4_rebuild_header, @@ -1835,10 +1671,6 @@ static int tcp_v4_init_sock(struct sock *sk) sk->mtu = 576; sk->mss = 536; - /* Speed up by setting some standard state for the dummy_th. */ - sk->dummy_th.ack=1; - sk->dummy_th.doff=sizeof(struct tcphdr)>>2; - /* Init SYN queue. */ tcp_synq_init(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d8c3c6480..465ee3fdc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.65 1998/03/15 12:07:03 davem Exp $ + * Version: $Id: tcp_output.c,v 1.76 1998/03/22 22:10:24 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -29,6 +29,7 @@ * Linus Torvalds : send_delayed_ack * David S. Miller : Charge memory using the right skb * during syn/ack processing. + * David S. Miller : Output engine completely rewritten. * */ @@ -57,278 +58,227 @@ static __inline__ void update_send_head(struct sock *sk) tp->send_head = NULL; } -/* - * This is the main buffer sending routine. We queue the buffer - * having checked it is sane seeming. +/* This routine actually transmits TCP packets queued in by + * tcp_do_sendmsg(). This is used by both the initial + * transmission and possible later retransmissions. + * All SKB's seen here are completely headerless. It is our + * job to build the TCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + * + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. */ - -void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) +void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int size; + if(skb != NULL) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + int tcp_header_size = tp->tcp_header_len; + struct tcphdr *th; - /* Length of packet (not counting length of pre-tcp headers). */ - size = skb->len - ((unsigned char *) th - skb->data); + if(tcb->flags & TCPCB_FLAG_SYN) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + if(sysctl_tcp_window_scaling) + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + if(sysctl_tcp_sack && !sysctl_tcp_timestamps) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; + } else if(tp->sack_ok && tp->num_sacks) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. 
+ */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + } + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = sk->sport; + th->dest = sk->dport; + th->seq = htonl(skb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + th->doff = (tcp_header_size >> 2); + th->res1 = 0; + *(((__u8 *)th) + 13) = tcb->flags; + th->window = htons(tcp_select_window(sk)); + th->check = 0; + th->urg_ptr = ntohs(tcb->urg_ptr); + if(tcb->flags & TCPCB_FLAG_SYN) { + th->window = htons(tp->rcv_wnd); + tcp_syn_build_options((__u32 *)(th + 1), sk->mss, + sysctl_tcp_timestamps, + sysctl_tcp_sack, + sysctl_tcp_window_scaling, + tp->rcv_wscale, + skb->when); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, skb->when); + } + tp->af_specific->send_check(sk, th, skb->len, skb); - /* If there is a FIN or a SYN we add it onto the size. */ - if (th->fin || th->syn) { - if(th->syn) - size++; - if(th->fin) - size++; + clear_delayed_acks(sk); + tp->last_ack_sent = tp->rcv_nxt; + tcp_statistics.TcpOutSegs++; + tp->af_specific->queue_xmit(skb); } +} - /* Actual processing. */ - skb->seq = ntohl(th->seq); - skb->end_seq = skb->seq + size - 4*th->doff; +/* This is the main buffer sending routine. We queue the buffer + * and decide whether to queue or transmit now. + */ +void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + /* Advance write_seq and place onto the write_queue. */ + tp->write_seq += (skb->end_seq - skb->seq); skb_queue_tail(&sk->write_queue, skb); if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { - struct sk_buff * buff; - - /* This is going straight out. 
*/ - tp->last_ack_sent = tp->rcv_nxt; - th->ack_seq = htonl(tp->rcv_nxt); - th->window = htons(tcp_select_window(sk)); - tcp_update_options((__u32 *)(th + 1),tp); - - tp->af_specific->send_check(sk, th, size, skb); - - buff = skb_clone(skb, GFP_KERNEL); - if (buff == NULL) - goto queue; - - clear_delayed_acks(sk); - skb_set_owner_w(buff, sk); - + /* Send it out now. */ + skb->when = jiffies; tp->snd_nxt = skb->end_seq; tp->packets_out++; - - skb->when = jiffies; - - tcp_statistics.TcpOutSegs++; - tp->af_specific->queue_xmit(buff); - - if (!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); + if(!tcp_timer_is_set(sk, TIME_RETRANS)) tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - - return; - } - -queue: - /* Remember where we must start sending. */ - if (tp->send_head == NULL) - tp->send_head = skb; - if (!force_queue && tp->packets_out == 0 && !tp->pending) { - tp->pending = TIME_PROBE0; - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + } else { + /* Queue it, remembering where we must start sending. */ + if (tp->send_head == NULL) + tp->send_head = skb; + if (!force_queue && tp->packets_out == 0 && !tp->pending) { + tp->pending = TIME_PROBE0; + tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + } } } -/* - * Function to create two new tcp segments. - * Shrinks the given segment to the specified size and appends a new - * segment with the rest of the packet to the list. - * This won't be called frenquently, I hope... +/* Function to create two new tcp segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frenquently, I hope... + * Remember, these are still header-less SKB's at this point. 
*/ - static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *buff; - struct tcphdr *th, *nth; - int nsize; - int tmp; - - th = skb->h.th; - - /* Size of new segment. */ - nsize = skb->tail - ((unsigned char *)(th)+tp->tcp_header_len) - len; - if (nsize <= 0) { - printk(KERN_DEBUG "tcp_fragment: bug size <= 0\n"); - return -1; - } + int nsize = skb->len - len; + u16 flags; /* Get a new skb... force flag on. */ - buff = sock_wmalloc(sk, nsize + 128 + sk->prot->max_header + 15, 1, - GFP_ATOMIC); + buff = sock_wmalloc(sk, + (nsize + + MAX_HEADER + + sk->prot->max_header + 15), + 1, GFP_ATOMIC); if (buff == NULL) - return -1; + return -1; /* We'll just try again later. */ - /* Put headers on the new packet. */ - tmp = tp->af_specific->build_net_header(sk, buff); - if (tmp < 0) { - kfree_skb(buff); - return -1; - } + /* Reserve space for headers. */ + skb_reserve(buff, MAX_HEADER + sk->prot->max_header); - /* Move the TCP header over. */ - nth = (struct tcphdr *) skb_put(buff, tp->tcp_header_len); - buff->h.th = nth; - memcpy(nth, th, tp->tcp_header_len); - - /* Correct the new header. */ + /* Correct the sequence numbers. */ buff->seq = skb->seq + len; buff->end_seq = skb->end_seq; - nth->seq = htonl(buff->seq); - nth->check = 0; - nth->doff = th->doff; - /* urg data is always an headache */ - if (th->urg) { - if (th->urg_ptr > len) { - th->urg = 0; - nth->urg_ptr -= len; + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); + if(flags & TCPCB_FLAG_URG) { + u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr; + + /* Urgent data is always a pain in the ass. 
*/ + if(old_urg_ptr > len) { + TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG); + TCP_SKB_CB(skb)->urg_ptr = 0; + TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len; } else { - nth->urg = 0; + flags &= ~(TCPCB_FLAG_URG); } } + if(!(flags & TCPCB_FLAG_URG)) + TCP_SKB_CB(buff)->urg_ptr = 0; + TCP_SKB_CB(buff)->flags = flags; + TCP_SKB_CB(buff)->sacked = 0; - /* Copy data tail to our new buffer. */ - buff->csum = csum_partial_copy(((u8 *)(th)+tp->tcp_header_len) + len, - skb_put(buff, nsize), + /* Copy and checksum data tail into the new buffer. */ + buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize), nsize, 0); skb->end_seq -= nsize; skb_trim(skb, skb->len - nsize); - /* Remember to checksum this packet afterwards. */ - th->check = 0; - skb->csum = csum_partial((u8*)(th) + tp->tcp_header_len, skb->tail - ((u8 *) (th)+tp->tcp_header_len), - 0); + /* Rechecksum original buffer. */ + skb->csum = csum_partial(skb->data, skb->len, 0); + /* Link BUFF into the send queue. */ skb_append(skb, buff); return 0; } -static void tcp_wrxmit_prob(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* This is acked data. We can discard it. This cannot currently occur. */ - tp->retransmits = 0; - - printk(KERN_DEBUG "tcp_write_xmit: bug skb in write queue\n"); - - update_send_head(sk); - - skb_unlink(skb); - kfree_skb(skb); - - if (!sk->dead) - sk->write_space(sk); -} - -static int tcp_wrxmit_frag(struct sock *sk, struct sk_buff *skb, int size) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - SOCK_DEBUG(sk, "tcp_write_xmit: frag needed size=%d mss=%d\n", - size, sk->mss); - - if (tcp_fragment(sk, skb, sk->mss)) { - /* !tcp_frament Failed! */ - tp->send_head = skb; - tp->packets_out--; - return -1; - } - return 0; -} - -/* - * This routine writes packets to the network. - * It advances the send_head. - * This happens as incoming acks open up the remote window for us. +/* This routine writes packets to the network. 
It advances the + * send_head. This happens as incoming acks open up the remote + * window for us. */ - void tcp_write_xmit(struct sock *sk) { - struct sk_buff *skb; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - u16 rcv_wnd; - int sent_pkts = 0; + int mss_now = sk->mss; - /* The bytes will have to remain here. In time closedown will - * empty the write queue and all will be happy. + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. */ - if(sk->zapped) - return; - - /* Anything on the transmit queue that fits the window can - * be added providing we are: - * - * a) following SWS avoidance [and Nagle algorithm] - * b) not exceeding our congestion window. - * c) not retransmiting [Nagle] + if(tp->sack_ok && tp->num_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt && sk->opt->optlen) + mss_now -= sk->opt->optlen; + + /* If we are zapped, the bytes will have to remain here. + * In time closedown will empty the write queue and all + * will be happy. */ - rcv_wnd = htons(tcp_select_window(sk)); - while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { - struct tcphdr *th; - struct sk_buff *buff; - int size; + if(!sk->zapped) { + struct sk_buff *skb; + int sent_pkts = 0; - /* See if we really need to send the packet. (debugging code) */ - if (!after(skb->end_seq, tp->snd_una)) { - tcp_wrxmit_prob(sk, skb); - continue; - } - - /* Put in the ack seq and window at this point rather - * than earlier, in order to keep them monotonic. - * We really want to avoid taking back window allocations. - * That's legal, but RFC1122 says it's frowned on. - * Ack and window will in general have changed since - * this packet was put on the write queue. 
+ /* Anything on the transmit queue that fits the window can + * be added providing we are: + * + * a) following SWS avoidance [and Nagle algorithm] + * b) not exceeding our congestion window. + * c) not retransmiting [Nagle] */ - th = skb->h.th; - size = skb->len - (((unsigned char *) th) - skb->data); - if (size - (th->doff << 2) > sk->mss) { - if (tcp_wrxmit_frag(sk, skb, size)) - break; - size = skb->len - (((unsigned char*)th) - skb->data); - } - - tp->last_ack_sent = tp->rcv_nxt; - th->ack_seq = htonl(tp->rcv_nxt); - th->window = rcv_wnd; - tcp_update_options((__u32 *)(th + 1),tp); - - tp->af_specific->send_check(sk, th, size, skb); - -#ifdef TCP_DEBUG - if (before(skb->end_seq, tp->snd_nxt)) - printk(KERN_DEBUG "tcp_write_xmit:" - " sending already sent seq\n"); -#endif - - buff = skb_clone(skb, GFP_ATOMIC); - if (buff == NULL) - break; - - /* Advance the send_head. This one is going out. */ - update_send_head(sk); - clear_delayed_acks(sk); - - tp->packets_out++; - skb_set_owner_w(buff, sk); - - tp->snd_nxt = skb->end_seq; + while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) + break; + } - skb->when = jiffies; + /* Advance the send_head. This one is going out. */ + update_send_head(sk); + skb->when = jiffies; + tp->snd_nxt = skb->end_seq; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + sent_pkts = 1; + } - sent_pkts = 1; - tp->af_specific->queue_xmit(buff); + /* If we sent anything, make sure the retransmit + * timer is active. 
+ */ + if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } - - if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } - - /* This function returns the amount that we can raise the * usable window based on the following constraints * @@ -377,11 +327,7 @@ void tcp_write_xmit(struct sock *sk) * Below we obtain similar behavior by forcing the offered window to * a multiple of the mss when it is feasible to do so. * - * FIXME: In our current implementation the value returned by sock_rpsace(sk) - * is the total space we have allocated to the socket to store skbuf's. - * The current design assumes that up to half of that space will be - * taken by headers, and the remaining space will be available for TCP data. - * This should be accounted for correctly instead. + * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. */ u32 __tcp_select_window(struct sock *sk) { @@ -422,57 +368,72 @@ u32 __tcp_select_window(struct sock *sk) return window; } -static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb) +/* Attempt to collapse two adjacent SKB's during retransmission. */ +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct tcphdr *th1, *th2; - int size1, size2, avail; - struct sk_buff *buff = skb->next; - - th1 = skb->h.th; - - if (th1->urg) - return -1; + struct sk_buff *next_skb = skb->next; - avail = skb_tailroom(skb); + /* The first test we must make is that neither of these two + * SKB's are still referenced by someone else. + */ + if(!skb_cloned(skb) && !skb_cloned(next_skb)) { + int skb_size = skb->len, next_skb_size = next_skb->len; + u16 flags = TCP_SKB_CB(skb)->flags; - /* Size of TCP payload. */ - size1 = skb->tail - ((u8 *) (th1)+(th1->doff<<2)); + /* Punt if the first SKB has URG set. 
*/ + if(flags & TCPCB_FLAG_URG) + return; - th2 = buff->h.th; - size2 = buff->tail - ((u8 *) (th2)+(th2->doff<<2)); + /* Also punt if next skb has been SACK'd. */ + if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + return; - if (size2 > avail || size1 + size2 > sk->mss ) - return -1; + /* Punt if not enough space exists in the first SKB for + * the data in the second, or the total combined payload + * would exceed the MSS. + */ + if ((next_skb_size > skb_tailroom(skb)) || + ((skb_size + next_skb_size) > mss_now)) + return; - /* Ok. We will be able to collapse the packet. */ - skb_unlink(buff); - memcpy(skb_put(skb, size2), ((char *) th2) + (th2->doff << 2), size2); - - /* Update sizes on original skb, both TCP and IP. */ - skb->end_seq += buff->end_seq - buff->seq; - if (th2->urg) { - th1->urg = 1; - th1->urg_ptr = th2->urg_ptr + size1; - } - if (th2->fin) - th1->fin = 1; + /* Ok. We will be able to collapse the packet. */ + skb_unlink(next_skb); - /* ... and off you go. */ - kfree_skb(buff); - tp->packets_out--; + if(skb->len % 4) { + /* Must copy and rechecksum all data. */ + memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + skb->csum = csum_partial(skb->data, skb->len, 0); + } else { + /* Optimize, actually we could also combine next_skb->csum + * to skb->csum using a single add w/carry operation too. + */ + skb->csum = csum_partial_copy(next_skb->data, + skb_put(skb, next_skb_size), + next_skb_size, skb->csum); + } + + /* Update sequence range on original skb. */ + skb->end_seq += next_skb->end_seq - next_skb->seq; + + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + if(flags & TCPCB_FLAG_URG) { + u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr; + TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size; + } + TCP_SKB_CB(skb)->flags = flags; - /* Header checksum will be set by the retransmit procedure - * after calling rebuild header. 
- */ - th1->check = 0; - skb->csum = csum_partial((u8*)(th1)+(th1->doff<<2), size1 + size2, 0); - return 0; + /* All done, get rid of second SKB and account for it so + * packet counting does not break. + */ + kfree_skb(next_skb); + sk->tp_pinfo.af_tcp.packets_out--; + } } /* Do a simple retransmit without using the backoff mechanisms in * tcp_timer. This is used to speed up path mtu recovery. Note that - * these simple retransmit aren't counted in the usual tcp retransmit + * these simple retransmits aren't counted in the usual tcp retransmit * backoff counters. * The socket is already locked here. */ @@ -480,114 +441,114 @@ void tcp_simple_retransmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* Clear delay ack timer. */ - tcp_clear_xmit_timer(sk, TIME_DACK); - - tp->retrans_head = NULL; /* Don't muck with the congestion window here. */ tp->dup_acks = 0; tp->high_seq = tp->snd_nxt; + /* FIXME: make the current rtt sample invalid */ - tcp_do_retransmit(sk, 0); + tp->retrans_head = NULL; + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); } -/* - * A socket has timed out on its send queue and wants to do a - * little retransmitting. - * retrans_head can be different from the head of the write_queue - * if we are doing fast retransmit. - */ +static __inline__ void update_retrans_head(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + tp->retrans_head = tp->retrans_head->next; + if((tp->retrans_head == tp->send_head) || + (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) + tp->retrans_head = NULL; +} -void tcp_do_retransmit(struct sock *sk, int all) +/* This retransmits one SKB. Policy decisions and retransmit queue + * state updates are done by the caller. Returns non-zero if an + * error occured which prevented the send. 
+ */ +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { - struct sk_buff * skb; - int ct=0; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int current_mss = sk->mss; - if (tp->retrans_head == NULL) - tp->retrans_head = skb_peek(&sk->write_queue); - - if (tp->retrans_head == tp->send_head) - tp->retrans_head = NULL; - - while ((skb = tp->retrans_head) != NULL) { - struct sk_buff *buff; - struct tcphdr *th; - int tcp_size; - int size; - - /* In general it's OK just to use the old packet. However we - * need to use the current ack and window fields. Urg and - * urg_ptr could possibly stand to be updated as well, but we - * don't keep the necessary data. That shouldn't be a problem, - * if the other end is doing the right thing. Since we're - * changing the packet, we have to issue a new IP identifier. - */ + /* Account for outgoing SACKS and IP options, if any. */ + if(tp->sack_ok && tp->num_sacks) + current_mss -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt && sk->opt->optlen) + current_mss -= sk->opt->optlen; - th = skb->h.th; + if(skb->len > current_mss) { + if(tcp_fragment(sk, skb, current_mss)) + return 1; /* We'll try again later. */ - tcp_size = skb->tail - ((unsigned char *)(th)+tp->tcp_header_len); + /* New SKB created, account for it. */ + tp->packets_out++; + } - if (tcp_size > sk->mss) { - if (tcp_fragment(sk, skb, sk->mss)) { - printk(KERN_DEBUG "tcp_fragment failed\n"); - return; - } - tp->packets_out++; - } + /* Collapse two adjacent packets if worthwhile and we can. 
*/ + if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && + (skb->len < (current_mss >> 1)) && + (skb->next != tp->send_head) && + (skb->next != (struct sk_buff *)&sk->write_queue)) + tcp_retrans_try_collapse(sk, skb, current_mss); - if (!th->syn && - tcp_size < (sk->mss >> 1) && - skb->next != tp->send_head && - skb->next != (struct sk_buff *)&sk->write_queue) - tcp_retrans_try_collapse(sk, skb); - - if (tp->af_specific->rebuild_header(sk, skb)) { -#ifdef TCP_DEBUG - printk(KERN_DEBUG "tcp_do_rebuild_header failed\n"); -#endif - break; - } + if(tp->af_specific->rebuild_header(sk)) + return 1; /* Routing failure or similar. */ - SOCK_DEBUG(sk, "retransmit sending seq=%x\n", skb->seq); + /* Ok, we're gonna send it out, update state. */ + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS; - /* Update ack and window. */ - tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt); - th->window = ntohs(tcp_select_window(sk)); - tcp_update_options((__u32 *)(th+1),tp); + /* Make a copy, if the first transmission SKB clone we made + * is still in somebody's hands, else make a clone. + */ + skb->when = jiffies; + if(skb_cloned(skb)) + skb = skb_copy(skb, GFP_ATOMIC); + else + skb = skb_clone(skb, GFP_ATOMIC); + tcp_transmit_skb(sk, skb); - size = skb->tail - (unsigned char *) th; - tp->af_specific->send_check(sk, th, size, skb); + /* Update global TCP statistics and return success. */ + sk->prot->retransmits++; + tcp_statistics.TcpRetransSegs++; - skb->when = jiffies; + return 0; +} - buff = skb_clone(skb, GFP_ATOMIC); - if (buff == NULL) - break; +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. 
+ */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + int ct = 0; - skb_set_owner_w(buff, sk); + if (tp->retrans_head == NULL) + tp->retrans_head = skb_peek(&sk->write_queue); + if (tp->retrans_head == tp->send_head) + tp->retrans_head = NULL; - clear_delayed_acks(sk); - tp->af_specific->queue_xmit(buff); + while ((skb = tp->retrans_head) != NULL) { + /* If it has been ack'd by a SACK block, we don't + * retransmit it. + */ + if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + /* Send it out, punt if error occurred. */ + if(tcp_retransmit_skb(sk, skb)) + break; - /* Count retransmissions. */ - ct++; - sk->prot->retransmits++; - tcp_statistics.TcpRetransSegs++; - - /* Only one retransmit requested. */ - if (!all) - break; - - /* This should cut it off before we send too many packets. */ - if (ct >= tp->snd_cwnd) - break; - - /* Advance the pointer. */ - tp->retrans_head = skb->next; - if ((tp->retrans_head == tp->send_head) || - (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) - tp->retrans_head = NULL; + /* Count retransmissions locally. */ + ct++; + + /* Stop retransmitting if we've hit the congestion + * window limit. + */ + if (ct >= tp->snd_cwnd) + break; + } + update_retrans_head(sk); } } @@ -597,83 +558,44 @@ void tcp_do_retransmit(struct sock *sk, int all) void tcp_send_fin(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb = skb_peek_tail(&sk->write_queue); + int mss_now = sk->mss; /* Optimization, tack on the FIN if we have a queue of - * unsent frames. + * unsent frames. But be careful about outgoing SACKS + * and IP options. */ - if(tp->send_head != NULL) { - struct sk_buff *tail = skb_peek_tail(&sk->write_queue); - struct tcphdr *th = tail->h.th; - int data_len; - - /* Unfortunately tcp_write_xmit won't check for going over - * the MSS due to the FIN sequence number, so we have to - * watch out for it here. 
- */ - data_len = (tail->tail - (((unsigned char *)th)+tp->tcp_header_len)); - if(data_len >= sk->mss) - goto build_new_frame; /* ho hum... */ - - /* tcp_write_xmit() will checksum the header etc. for us. */ - th->fin = 1; - tail->end_seq++; + if(tp->sack_ok && tp->num_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt && sk->opt->optlen) + mss_now -= sk->opt->optlen; + if((tp->send_head != NULL) && (skb->len < mss_now)) { + /* tcp_write_xmit() takes care of the rest. */ + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + skb->end_seq++; + tp->write_seq++; } else { - struct sk_buff *buff; - struct tcphdr *th; - -build_new_frame: - buff = sock_wmalloc(sk, - (BASE_ACK_SIZE + tp->tcp_header_len + - sizeof(struct sk_buff)), - 1, GFP_KERNEL); - if (buff == NULL) { - /* We can only fail due to low memory situations, not - * due to going over our sndbuf limits (due to the - * force flag passed to sock_wmalloc). So just keep - * trying. We cannot allow this fail. The socket is - * still locked, so we need not check if the connection - * was reset in the meantime etc. - */ - goto build_new_frame; - } - - /* Administrivia. */ - buff->csum = 0; - - /* Put in the IP header and routing stuff. - * - * FIXME: - * We can fail if the interface for the route - * this socket takes goes down right before - * we get here. ANK is there a way to point - * this into a "black hole" route in such a - * case? Ideally, we should still be able to - * queue this and let the retransmit timer - * keep trying until the destination becomes - * reachable once more. 
-DaveM - */ - if(tp->af_specific->build_net_header(sk, buff) < 0) { - kfree_skb(buff); - goto update_write_seq; - } - th = (struct tcphdr *) skb_put(buff, tp->tcp_header_len); - buff->h.th = th; - - memcpy(th, (void *) &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(tp->write_seq); - th->fin = 1; - tcp_build_options((__u32 *)(th + 1), tp); - - /* This makes sure we do things like abide by the congestion - * window and other constraints which prevent us from sending. - */ - tcp_send_skb(sk, buff, 0); + /* Socket is locked, keep trying until memory is available. */ + do { + skb = sock_wmalloc(sk, + (MAX_HEADER + + sk->prot->max_header), + 1, GFP_KERNEL); + } while (skb == NULL); + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ + skb->seq = tp->write_seq; + skb->end_seq = skb->seq + 1; + tcp_send_skb(sk, skb, 0); } -update_write_seq: - /* So that we recognize the ACK coming back for - * this FIN as being legitimate. - */ - tp->write_seq++; } /* We get here when a process closes a file descriptor (either due to @@ -685,109 +607,218 @@ void tcp_send_active_reset(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - struct tcphdr *th; -again: /* NOTE: No TCP options attached and we never retransmit this. */ - skb = sock_wmalloc(sk, (BASE_ACK_SIZE + sizeof(*th)), 1, GFP_KERNEL); - if(skb == NULL) - goto again; + do { + skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL); + } while(skb == NULL); + + /* Reserve space for headers and prepare control bits. 
*/ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); skb->csum = 0; - if(tp->af_specific->build_net_header(sk, skb) < 0) { - kfree_skb(skb); - } else { - th = (struct tcphdr *) skb_put(skb, sizeof(*th)); - memcpy(th, &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(tp->write_seq); - th->rst = 1; - th->doff = sizeof(*th) / 4; - tp->last_ack_sent = tp->rcv_nxt; - th->ack_seq = htonl(tp->rcv_nxt); - th->window = htons(tcp_select_window(sk)); - tp->af_specific->send_check(sk, th, sizeof(*th), skb); - tp->af_specific->queue_xmit(skb); - tcp_statistics.TcpOutSegs++; - tcp_statistics.TcpOutRsts++; - } + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* Send it off. */ + skb->seq = tp->write_seq; + skb->end_seq = skb->seq; + skb->when = jiffies; + tcp_transmit_skb(sk, skb); } /* WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. - * - * XXX When you have time Dave, redo this to use tcp_send_skb() just - * XXX like tcp_send_fin() above now does.... -DaveM */ int tcp_send_synack(struct sock *sk) { - struct tcp_opt * tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff * skb; - struct sk_buff * buff; - struct tcphdr *th; - int tmp; + struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff* skb; - skb = sock_wmalloc(sk, MAX_SYN_SIZE + sizeof(struct sk_buff), 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), + 1, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; - tmp = tp->af_specific->build_net_header(sk, skb); - if (tmp < 0) { - kfree_skb(skb); - return tmp; + /* Reserve space for headers and prepare control bits. 
*/ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* SYN eats a sequence byte. */ + skb->seq = tp->snd_una; + skb->end_seq = skb->seq + 1; + skb_queue_tail(&sk->write_queue, skb); + skb->when = jiffies; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + return 0; +} + +struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct open_request *req, int mss) +{ + struct tcphdr *th; + int tcp_header_size; + struct sk_buff *skb; + + skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + + skb->dst = dst_clone(dst); + + if (sk->user_mss) + mss = min(mss, sk->user_mss); + if (req->tstamp_ok) + mss -= TCPOLEN_TSTAMP_ALIGNED; + else + req->mss += TCPOLEN_TSTAMP_ALIGNED; + + /* Don't offer more than they did. + * This way we don't have to memorize who said what. + * FIXME: maybe this should be changed for better performance + * with syncookies. + */ + req->mss = min(mss, req->mss); + if (req->mss < 1) { + printk(KERN_DEBUG "initial req->mss below 1\n"); + req->mss = 1; } - th =(struct tcphdr *) skb_put(skb, sizeof(struct tcphdr)); - skb->h.th = th; - memset(th, 0, sizeof(struct tcphdr)); + tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + + (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + + (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + + /* SACK_PERM is in the place of NOP NOP of TS */ + ((req->sack_ok && !req->tstamp_ok) ? 
TCPOLEN_SACKPERM_ALIGNED : 0)); + skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); + memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; + th->source = sk->sport; + th->dest = req->rmt_port; + skb->seq = req->snt_isn; + skb->end_seq = skb->seq + 1; + th->seq = htonl(skb->seq); + th->ack_seq = htonl(req->rcv_isn + 1); + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = skb->dst->window; + tcp_select_initial_window(sock_rspace(sk)/2,req->mss, + &req->rcv_wnd, + &req->window_clamp, + req->wscale_ok, + &rcv_wscale); + req->rcv_wscale = rcv_wscale; + } + th->window = htons(req->rcv_wnd); - th->source = sk->dummy_th.source; - th->dest = sk->dummy_th.dest; - - skb->seq = tp->snd_una; - skb->end_seq = skb->seq + 1 /* th->syn */ ; - th->seq = ntohl(skb->seq); + skb->when = jiffies; + tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok, + req->sack_ok, req->wscale_ok, req->rcv_wscale, + skb->when); - /* This is a resend of a previous SYN, now with an ACK. - * we must reuse the previously offered window. - */ - th->window = htons(tp->rcv_wnd); + skb->csum = 0; + th->doff = (tcp_header_size >> 2); + tcp_statistics.TcpOutSegs++; + return skb; +} - tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt); +void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) +{ + struct dst_entry *dst = sk->dst_cache; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - tmp = tcp_syn_build_options(skb, sk->mss, - tp->tstamp_ok, tp->wscale_ok, tp->rcv_wscale); - skb->csum = 0; - th->doff = (sizeof(*th) + tmp)>>2; + /* Reserve space for headers. 
*/ + skb_reserve(buff, MAX_HEADER + sk->prot->max_header); - tp->af_specific->send_check(sk, th, sizeof(*th)+tmp, skb); + if (sk->priority == 0) + sk->priority = dst->priority; - skb_queue_tail(&sk->write_queue, skb); + tp->snd_wnd = 0; + tp->snd_wl1 = 0; + tp->snd_wl2 = tp->write_seq; + tp->snd_una = tp->write_seq; + tp->rcv_nxt = 0; + + sk->err = 0; - buff = skb_clone(skb, GFP_ATOMIC); - if (buff) { - skb_set_owner_w(buff, sk); + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. + */ + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); - tp->packets_out++; - skb->when = jiffies; + mss -= tp->tcp_header_len; - tp->af_specific->queue_xmit(buff); - tcp_statistics.TcpOutSegs++; + if (sk->user_mss) + mss = min(mss, sk->user_mss); - tcp_reset_xmit_timer(sk, TIME_RETRANS, TCP_TIMEOUT_INIT); + if (mss < 1) { + printk(KERN_DEBUG "intial sk->mss below 1\n"); + mss = 1; /* Sanity limit */ } - return 0; + + sk->mss = mss; + + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; + TCP_SKB_CB(buff)->sacked = 0; + TCP_SKB_CB(buff)->urg_ptr = 0; + buff->csum = 0; + buff->seq = tp->write_seq++; + buff->end_seq = tp->write_seq; + tp->snd_nxt = buff->end_seq; + + tp->window_clamp = dst->window; + tcp_select_initial_window(sock_rspace(sk)/2,sk->mss, + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_window_scaling, + &tp->rcv_wscale); + + /* Ok, now lock the socket before we make it visible to + * the incoming packet engine. + */ + lock_sock(sk); + + /* Socket identity change complete, no longer + * in TCP_CLOSE, so enter ourselves into the + * hash tables. + */ + tcp_set_state(sk,TCP_SYN_SENT); + sk->prot->hash(sk); + + tp->rto = dst->rtt; + tcp_init_xmit_timers(sk); + tp->retransmits = 0; + + /* Send it off. 
*/ + skb_queue_tail(&sk->write_queue, buff); + buff->when = jiffies; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + tcp_statistics.TcpActiveOpens++; + + /* Timer for repeating the SYN until an answer. */ + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + + /* Now, it is safe to release the socket. */ + release_sock(sk); } -/* - * Send out a delayed ack, the caller does the policy checking +/* Send out a delayed ack, the caller does the policy checking * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. */ - void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout) { unsigned long timeout; @@ -799,169 +830,120 @@ void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout) timeout += jiffies; /* Use new timeout only if there wasn't a older one earlier. */ - if ((!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) || - (timeout < tp->delack_timer.expires)) + if (!tp->delack_timer.prev) { tp->delack_timer.expires = timeout; - - add_timer(&tp->delack_timer); + add_timer(&tp->delack_timer); + } else { + if (timeout < tp->delack_timer.expires) + mod_timer(&tp->delack_timer, timeout); + } } - - -/* - * This routine sends an ack and also updates the window. - */ - +/* This routine sends an ack and also updates the window. */ void tcp_send_ack(struct sock *sk) { - struct sk_buff *buff; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); - struct tcphdr *th; - int tmp; - - if(sk->zapped) - return; /* We have been reset, we may not send again. */ + /* If we have been reset, we may not send again. */ + if(!sk->zapped) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *buff; - /* We need to grab some memory, and put together an ack, - * and then put it into the queue to be sent. - */ - buff = sock_wmalloc(sk, BASE_ACK_SIZE + tp->tcp_header_len, 1, GFP_ATOMIC); - if (buff == NULL) { - /* Force it to send an ack. 
We don't have to do this - * (ACK is unreliable) but it's much better use of - * bandwidth on slow links to send a spare ack than - * resend packets. + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. */ - tcp_send_delayed_ack(tp, HZ/2); - return; - } - - clear_delayed_acks(sk); - - /* Assemble a suitable TCP frame. */ - buff->csum = 0; + buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC); + if (buff == NULL) { + /* Force it to send an ack. We don't have to do this + * (ACK is unreliable) but it's much better use of + * bandwidth on slow links to send a spare ack than + * resend packets. + */ + tcp_send_delayed_ack(tp, HZ/2); + return; + } - /* Put in the IP header and routing stuff. */ - tmp = tp->af_specific->build_net_header(sk, buff); - if (tmp < 0) { - kfree_skb(buff); - return; + /* Reserve space for headers and prepare control bits. */ + skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + buff->csum = 0; + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(buff)->sacked = 0; + TCP_SKB_CB(buff)->urg_ptr = 0; + + /* Send it off, this clears delayed acks for us. */ + buff->seq = buff->end_seq = tp->snd_nxt; + buff->when = jiffies; + tcp_transmit_skb(sk, buff); } - - th = (struct tcphdr *)skb_put(buff,tp->tcp_header_len); - memcpy(th, &sk->dummy_th, sizeof(struct tcphdr)); - - /* Swap the send and the receive. */ - th->window = ntohs(tcp_select_window(sk)); - th->seq = ntohl(tp->snd_nxt); - tp->last_ack_sent = tp->rcv_nxt; - th->ack_seq = htonl(tp->rcv_nxt); - tcp_build_and_update_options((__u32 *)(th + 1), tp); - - /* Fill in the packet and send it. */ - tp->af_specific->send_check(sk, th, tp->tcp_header_len, buff); - tp->af_specific->queue_xmit(buff); - tcp_statistics.TcpOutSegs++; } -/* - * This routine sends a packet with an out of date sequence - * number. It assumes the other end will try to ack it. +/* This routine sends a packet with an out of date sequence + * number. 
It assumes the other end will try to ack it. */ - void tcp_write_wakeup(struct sock *sk) { - struct sk_buff *buff, *skb; - struct tcphdr *t1; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int tmp; - - if (sk->zapped) - return; /* After a valid reset we can send no more. */ - - /* Write data can still be transmitted/retransmitted in the - * following states. If any other state is encountered, return. - * [listen/close will never occur here anyway] - */ - if ((1 << sk->state) & - ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|TCPF_LAST_ACK|TCPF_CLOSING)) - return; - - if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && (skb=tp->send_head)) { - struct tcphdr *th; - unsigned long win_size; - - /* We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS avoidance ( sender ) - */ - win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - if (win_size < skb->end_seq - skb->seq) { - if (tcp_fragment(sk, skb, win_size)) { - printk(KERN_DEBUG "tcp_write_wakeup: " - "fragment failed\n"); - return; - } - } - - th = skb->h.th; - tcp_update_options((__u32 *)(th + 1), tp); - tp->af_specific->send_check(sk, th, th->doff * 4 + win_size, skb); - buff = skb_clone(skb, GFP_ATOMIC); - if (buff == NULL) + /* After a valid reset we can send no more. */ + if (!sk->zapped) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + /* Write data can still be transmitted/retransmitted in the + * following states. If any other state is encountered, return. 
+ * [listen/close will never occur here anyway] + */ + if ((1 << sk->state) & + ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| + TCPF_LAST_ACK|TCPF_CLOSING)) return; - skb_set_owner_w(buff, sk); - tp->packets_out++; - - clear_delayed_acks(sk); + if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && + ((skb = tp->send_head) != NULL)) { + unsigned long win_size; - if (!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - - skb->when = jiffies; - update_send_head(sk); - tp->snd_nxt = skb->end_seq; - } else { - buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC); - if (buff == NULL) - return; + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); + if (win_size < skb->end_seq - skb->seq) { + if (tcp_fragment(sk, skb, win_size)) + return; /* Let a retransmit get it. */ + } + update_send_head(sk); + skb->when = jiffies; + tp->snd_nxt = skb->end_seq; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if (!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } else { + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_HEADER + sk->prot->max_header, + GFP_ATOMIC); + if (skb == NULL) + return; - buff->csum = 0; + /* Reserve space for headers and set control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; - /* Put in the IP header and routing stuff. */ - tmp = tp->af_specific->build_net_header(sk, buff); - if (tmp < 0) { - kfree_skb(buff); - return; + /* Use a previous sequence. This should cause the other + * end to send an ack. Don't queue or clone SKB, just + * send it. 
+ */ + skb->seq = tp->snd_nxt - 1; + skb->end_seq = skb->seq; + skb->when = jiffies; + tcp_transmit_skb(sk, skb); } - - t1 = (struct tcphdr *) skb_put(buff, tp->tcp_header_len); - memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1)); - - /* Use a previous sequence. - * This should cause the other end to send an ack. - */ - - t1->seq = htonl(tp->snd_nxt-1); - t1->ack_seq = htonl(tp->rcv_nxt); - t1->window = htons(tcp_select_window(sk)); - tcp_build_and_update_options((__u32 *)(t1 + 1), tp); - - tp->af_specific->send_check(sk, t1, tp->tcp_header_len, buff); } - - /* Send it. */ - tp->af_specific->queue_xmit(buff); - tcp_statistics.TcpOutSegs++; } -/* - * A window probe timeout has occurred. - * If window is not closed send a partial packet - * else a zero probe. +/* A window probe timeout has occurred. If window is not closed send + * a partial packet else a zero probe. */ - void tcp_send_probe0(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index fdf8f50ec..54380b07d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -77,11 +77,6 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - if((long)when <= 0) { - printk(KERN_DEBUG "xmit_timer <= 0 - timer:%d when:%lx\n", what, when); - when=HZ/50; - } - switch (what) { case TIME_RETRANS: /* When seting the transmit timer the probe timer @@ -91,24 +86,15 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) */ if(tp->probe_timer.prev) del_timer(&tp->probe_timer); - if(tp->retransmit_timer.prev) - del_timer(&tp->retransmit_timer); - tp->retransmit_timer.expires=jiffies+when; - add_timer(&tp->retransmit_timer); + mod_timer(&tp->retransmit_timer, jiffies+when); break; case TIME_DACK: - if(tp->delack_timer.prev) - del_timer(&tp->delack_timer); - tp->delack_timer.expires=jiffies+when; - add_timer(&tp->delack_timer); + mod_timer(&tp->delack_timer, 
jiffies+when); break; case TIME_PROBE0: - if(tp->probe_timer.prev) - del_timer(&tp->probe_timer); - tp->probe_timer.expires=jiffies+when; - add_timer(&tp->probe_timer); + mod_timer(&tp->probe_timer, jiffies+when); break; case TIME_WRITE: @@ -150,17 +136,12 @@ static int tcp_write_err(struct sock *sk, int force) return 1; } -/* - * A write timeout has occurred. Process the after effects. BROKEN (badly) - */ - +/* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * Look for a 'soft' timeout. - */ + /* Look for a 'soft' timeout. */ if ((sk->state == TCP_ESTABLISHED && tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) || (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) { @@ -206,11 +187,10 @@ void tcp_probe_timer(unsigned long data) return; } - /* - * *WARNING* RFC 1122 forbids this - * It doesn't AFAIK, because we kill the retransmit timer -AK - * FIXME: We ought not to do it, Solaris 2.5 actually has fixing - * this behaviour in Solaris down as a bug fix. [AC] + /* *WARNING* RFC 1122 forbids this + * It doesn't AFAIK, because we kill the retransmit timer -AK + * FIXME: We ought not to do it, Solaris 2.5 actually has fixing + * this behaviour in Solaris down as a bug fix. [AC] */ if (tp->probes_out > sysctl_tcp_retries2) { if(sk->err_soft) @@ -226,9 +206,10 @@ void tcp_probe_timer(unsigned long data) /* Clean up time. */ tcp_set_state(sk, TCP_CLOSE); } + } else { + /* Only send another probe if we didn't close things up. */ + tcp_send_probe0(sk); } - - tcp_send_probe0(sk); } static __inline__ int tcp_keepopen_proc(struct sock *sk) @@ -375,6 +356,21 @@ void tcp_retransmit_timer(unsigned long data) /* Clear delay ack timer. 
*/ tcp_clear_xmit_timer(sk, TIME_DACK); + /* RFC 2018, clear all 'sacked' flags in retransmission queue, + * the sender may have dropped out of order frames and we must + * send them out should this timer fire on us. + */ + if(tp->sack_ok) { + struct sk_buff *skb = skb_peek(&sk->write_queue); + + while((skb != NULL) && + (skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)) { + TCP_SKB_CB(skb)->sacked = 0; + skb = skb->next; + } + } + /* Retransmission. */ tp->retrans_head = NULL; if (tp->retransmits == 0) { @@ -390,7 +386,7 @@ void tcp_retransmit_timer(unsigned long data) tp->dup_acks = 0; tp->high_seq = tp->snd_nxt; - tcp_do_retransmit(sk, 0); + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized @@ -407,7 +403,7 @@ void tcp_retransmit_timer(unsigned long data) * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ - tp->backoff++; /* FIXME: always same as retransmits? 
-- erics */ + tp->backoff++; tp->rto = min(tp->rto << 1, 120*HZ); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); @@ -523,18 +519,18 @@ void tcp_sltimer_handler(unsigned long data) void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) { unsigned long now = jiffies; - unsigned long next = 0; unsigned long when; slt->last = now; - + when = now + slt->period; - if (del_timer(&tcp_slow_timer)) - next = tcp_slow_timer.expires; - - if (next && ((long)(next - when) < 0)) - when = next; - - tcp_slow_timer.expires = when; - add_timer(&tcp_slow_timer); + + if (tcp_slow_timer.prev) { + if ((long)(tcp_slow_timer.expires - when) >= 0) { + mod_timer(&tcp_slow_timer, when); + } + } else { + tcp_slow_timer.expires = when; + add_timer(&tcp_slow_timer); + } } diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index 79ae3309e..5c5e5eeb3 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -59,10 +59,8 @@ void net_delete_timer (struct sock *t) void net_reset_timer (struct sock *t, int timeout, unsigned long len) { - net_delete_timer (t); t->timeout = timeout; - t->timer.expires = jiffies+len; - add_timer (&t->timer); + mod_timer(&t->timer, jiffies+len); } /* Now we will only be called whenever we need to do diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6ba50b280..a580b0010 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -315,8 +315,8 @@ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, i continue; score++; } - if(sk->dummy_th.dest) { - if(sk->dummy_th.dest != sport) + if(sk->dport) { + if(sk->dport != sport) continue; score++; } @@ -412,8 +412,8 @@ static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, continue; score++; } - if(s->dummy_th.dest) { - if(s->dummy_th.dest != rnum) + if(s->dport) { + if(s->dport != rnum) continue; score++; } @@ -453,7 +453,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk, if ((s->num != hnum) || (s->dead && (s->state == TCP_CLOSE)) || (s->daddr && s->daddr!=raddr) || - 
(s->dummy_th.dest != rnum && s->dummy_th.dest != 0) || + (s->dport != rnum && s->dport != 0) || (s->rcv_saddr && s->rcv_saddr != laddr)) continue; break; @@ -644,12 +644,12 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) return -EOPNOTSUPP; #ifdef CONFIG_IP_TRANSPARENT_PROXY - if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_PROXY)) + if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_PROXY|MSG_NOSIGNAL)) return -EINVAL; if ((msg->msg_flags&MSG_PROXY) && !suser() ) return -EPERM; #else - if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT)) + if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) return -EINVAL; #endif @@ -686,7 +686,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) if (sk->state != TCP_ESTABLISHED) return -EINVAL; ufh.daddr = sk->daddr; - ufh.uh.dest = sk->dummy_th.dest; + ufh.uh.dest = sk->dport; /* BUGGG Khm... And who will validate it? Fixing it fastly... @@ -712,7 +712,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) #endif { ipc.addr = sk->saddr; - ufh.uh.source = sk->dummy_th.source; + ufh.uh.source = sk->sport; } ipc.opt = NULL; @@ -971,7 +971,7 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if(!sk->rcv_saddr) sk->rcv_saddr = rt->rt_src; sk->daddr = rt->rt_dst; - sk->dummy_th.dest = usin->sin_port; + sk->dport = usin->sin_port; sk->state = TCP_ESTABLISHED; if(uh_cache_sk == sk) |