diff options
Diffstat (limited to 'net')
168 files changed, 8549 insertions, 6406 deletions
diff --git a/net/.cvsignore b/net/.cvsignore index b9c8aa2e0..f7cf9ab27 100644 --- a/net/.cvsignore +++ b/net/.cvsignore @@ -7,3 +7,4 @@ tags TAGS *.a *.olb *.o *.obj *.so *.exe *.Z *.elc *.ln .depend +.*.flags diff --git a/net/802/.cvsignore b/net/802/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/802/.cvsignore +++ b/net/802/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/802/Makefile b/net/802/Makefile index cea2410d8..12db50d50 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -17,7 +17,8 @@ endif ifeq ($(CONFIG_LLC),y) SUB_DIRS += transit O_OBJS += llc_sendpdu.o llc_utility.o cl2llc.o -OX_OBJS += llc_macinit.o p8022.o psnap.o p8022tr.o +OX_OBJS += llc_macinit.o +SNAP = y endif ifdef CONFIG_TR @@ -29,15 +30,18 @@ O_OBJS += fddi.o endif ifdef CONFIG_IPX -OX_OBJS += p8022.o psnap.o p8022tr.o + SNAP=y endif ifdef CONFIG_ATALK -ifndef CONFIG_IPX -OX_OBJS += p8022.o psnap.o p8022tr.o + SNAP=y endif + +ifeq ($(SNAP),y) +OX_OBJS += p8022.o psnap.o p8022tr.o endif + include $(TOPDIR)/Rules.make cl2llc.c: cl2llc.pre diff --git a/net/802/fddi.c b/net/802/fddi.c index a282cc386..1c9f7e765 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -97,18 +97,6 @@ int fddi_header(struct sk_buff *skb, struct device *dev, unsigned short type, int fddi_rebuild_header(struct sk_buff *skb) { struct fddihdr *fddi = (struct fddihdr *)skb->data; -#if 0 - struct neighbour *neigh = NULL; - - if (skb->dst) - neigh = skb->dst->neighbour; - - if (neigh) - return neigh->ops->resolve(fddi->daddr, skb); -#endif - /* - * Only ARP/IP is currently supported - */ if (fddi->hdr.llc_snap.ethertype != __constant_htons(ETH_P_IP)) { diff --git a/net/802/llc_macinit.c b/net/802/llc_macinit.c index 198230c36..a51a868f2 100644 --- a/net/802/llc_macinit.c +++ b/net/802/llc_macinit.c @@ -19,7 +19,6 @@ * Started restructuring handlers */ -#include <linux/config.h> #include <linux/module.h> #include <linux/version.h> #include <linux/kernel.h> @@ -136,7 +135,7 @@ int 
llc_mac_data_indicate(llcptr lp, struct sk_buff *skb) * No auto free for I pdus */ skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } if(lp->llc_callbacks) diff --git a/net/802/llc_sendpdu.c b/net/802/llc_sendpdu.c index 5aeaecfbe..f0c6d116e 100644 --- a/net/802/llc_sendpdu.c +++ b/net/802/llc_sendpdu.c @@ -161,7 +161,6 @@ void llc_sendpdu(llcptr lp, char type, char pf, int data_len, char *pdu_data) } lp->dev->hard_header(skb, lp->dev, ETH_P_802_3, lp->remote_mac, NULL, fl); - skb->arp = 1; skb->dev=lp->dev; dev_queue_xmit(skb); } @@ -213,7 +212,6 @@ void llc_sendipdu(llcptr lp, char type, char pf, struct sk_buff *skb) lp->vs = 0; lp->dev->hard_header(skb, lp->dev, ETH_P_802_3, lp->remote_mac, NULL, skb->len); - skb->arp = 1; ADD_TO_RTQ(skb); /* add skb to the retransmit queue */ tmp=skb_clone(skb, GFP_ATOMIC); if(tmp!=NULL) @@ -284,7 +282,6 @@ int llc_resend_ipdu(llcptr lp, unsigned char ack_nr, unsigned char type, char p) tmp=skb_clone(skb, GFP_ATOMIC); if(tmp!=NULL) { - tmp->arp = 1; tmp->dev = lp->dev; dev_queue_xmit(skb); } @@ -347,7 +344,7 @@ int llc_free_acknowledged_skbs(llcptr lp, unsigned char pdu_ack) fr = (frameptr) (pp->data + lp->dev->hard_header_len); ns_save = fr->i_hdr.ns; - kfree_skb(pp, FREE_WRITE); + kfree_skb(pp); ack_count++; if (ns_save == ack) diff --git a/net/802/p8022.c b/net/802/p8022.c index 1a12f4d60..b4a9b43f9 100644 --- a/net/802/p8022.c +++ b/net/802/p8022.c @@ -16,7 +16,6 @@ * 4 entries at most). The current demux assumes this. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> @@ -62,7 +61,7 @@ int p8022_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) } skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } diff --git a/net/802/p8022tr.c b/net/802/p8022tr.c index ef6a4976a..b895c9343 100644 --- a/net/802/p8022tr.c +++ b/net/802/p8022tr.c @@ -8,7 +8,6 @@ * 2 of the License, or (at your option) any later version. 
*/ -#include <linux/config.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> @@ -57,7 +56,7 @@ int p8022tr_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) } skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } diff --git a/net/802/psnap.c b/net/802/psnap.c index 6ce58da35..24e7f2bd0 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -10,7 +10,6 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> @@ -67,7 +66,7 @@ int snap_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) return proto->rcvfunc(skb, dev, &psnap_packet_type); } skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } diff --git a/net/802/tr.c b/net/802/tr.c index 07d0e0399..bf6cd83d7 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -50,7 +50,7 @@ struct rif_cache_s { unsigned char addr[TR_ALEN]; unsigned char iface[5]; __u16 rcf; - __u8 rseg[8]; + __u16 rseg[8]; rif_cache next; unsigned long last_used; unsigned char local_ring; @@ -441,7 +441,7 @@ int rif_get_info(char *buffer,char **start, off_t offset, int length, int dummy) int len=0; off_t begin=0; off_t pos=0; - int size,i,j,rcf_len; + int size,i,j,rcf_len,segment,brdgnmb; unsigned long now=jiffies; rif_cache entry; @@ -466,10 +466,18 @@ int rif_get_info(char *buffer,char **start, off_t offset, int length, int dummy) rcf_len = ((ntohs(entry->rcf) & TR_RCF_LEN_MASK)>>8)-2; if (rcf_len) rcf_len >>= 1; - for(j = 0; j < rcf_len; j++) { - len+=size; - pos=begin+len; - size=sprintf(buffer+len," %04X",ntohs(entry->rseg[j])); + for(j = 1; j < rcf_len; j++) { + if(j==1) { + segment=ntohs(entry->rseg[j-1])>>4; + len+=size; + pos=begin+len; + size=sprintf(buffer+len," %03X",segment); + }; + segment=ntohs(entry->rseg[j])>>4; + brdgnmb=ntohs(entry->rseg[j-1])&0x00f; + len+=size; + pos=begin+len; + 
size=sprintf(buffer+len,"-%01X-%03X",brdgnmb,segment); } len+=size; pos=begin+len; diff --git a/net/802/transit/pdutr.h b/net/802/transit/pdutr.h index 55a65001d..900dc74b8 100644 --- a/net/802/transit/pdutr.h +++ b/net/802/transit/pdutr.h @@ -1,5 +1,5 @@ -/* this file was generated on Thu Dec 5 13:58:11 GMT 1996 */ +/* this file was generated on Thu Jan 8 00:21:19 GMT 1998 */ /* index name #defines: */ diff --git a/net/802/transit/timertr.h b/net/802/transit/timertr.h index 9b9403b5a..43237f180 100644 --- a/net/802/transit/timertr.h +++ b/net/802/transit/timertr.h @@ -1,5 +1,5 @@ -/* this file was generated on Mon Mar 10 22:45:36 GMT 1997 */ +/* this file was generated on Thu Jan 8 00:21:21 GMT 1998 */ /* size of transition table is 898 bytes */ diff --git a/net/Config.in b/net/Config.in index 5a5964e34..b4547e569 100644 --- a/net/Config.in +++ b/net/Config.in @@ -10,12 +10,8 @@ if [ "$CONFIG_NETLINK" = "y" ]; then tristate 'Netlink device emulation' CONFIG_NETLINK_DEV fi bool 'Network firewalls' CONFIG_FIREWALL -if [ "$CONFIG_FIREWALL" = "y" ]; then - if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - bool 'Socket Security API Support (EXPERIMENTAL)' CONFIG_NET_SECURITY - fi -fi bool 'Network aliasing' CONFIG_NET_ALIAS +bool 'Socket Filtering' CONFIG_FILTER tristate 'Unix domain sockets' CONFIG_UNIX bool 'TCP/IP networking' CONFIG_INET if [ "$CONFIG_INET" = "y" ]; then @@ -31,7 +27,7 @@ fi comment ' ' tristate 'The IPX protocol' CONFIG_IPX if [ "$CONFIG_IPX" != "n" ]; then - bool 'Full internal IPX network' CONFIG_IPX_INTERN + source net/ipx/Config.in fi tristate 'Appletalk DDP' CONFIG_ATALK if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then @@ -47,19 +43,15 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then # bool 'Netbeui (EXPERIMENTAL)' CONFIG_NETBEUI # fi tristate 'WAN router' CONFIG_WAN_ROUTER + bool 'Fast switching (read help!)' CONFIG_NET_FASTROUTE + bool 'Forwarding between high speed interfaces' CONFIG_NET_HW_FLOWCONTROL bool 'CPU is too slow to handle full bandwidth' 
CONFIG_CPU_IS_SLOW if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then bool 'QoS and/or fair queueing' CONFIG_NET_SCHED if [ "$CONFIG_NET_SCHED" = "y" ]; then - tristate 'CBQ packet scheduler' CONFIG_NET_SCH_CBQ - tristate 'CSZ packet scheduler' CONFIG_NET_SCH_CSZ - tristate 'HFQ packet scheduler' CONFIG_NET_SCH_HFQ - tristate 'RED queueing discipline' CONFIG_NET_SCH_RED - tristate 'SFQ queueing discipline' CONFIG_NET_SCH_SFQ - tristate 'auxiliary TBF queue' CONFIG_NET_SCH_TBF - tristate 'auxiliary FIFO queue' CONFIG_NET_SCH_PFIFO - tristate 'auxiliary PRIO queue' CONFIG_NET_SCH_PRIO + source net/sched/Config.in fi + bool 'Network code profiler' CONFIG_NET_PROFILE fi fi endmenu diff --git a/net/appletalk/.cvsignore b/net/appletalk/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/appletalk/.cvsignore +++ b/net/appletalk/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index c02f0d5cb..511c65970 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -96,7 +96,7 @@ static void aarp_expire(struct aarp_entry *a) struct sk_buff *skb; while((skb=skb_dequeue(&a->packet_queue))!=NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); kfree_s(a,sizeof(*a)); } @@ -122,7 +122,6 @@ static void aarp_send_query(struct aarp_entry *a) skb_reserve(skb,dev->hard_header_len+aarp_dl->header_length); eah = (struct elapaarp *)skb_put(skb,sizeof(struct elapaarp)); - skb->arp = 1; skb->dev = dev; /* @@ -181,7 +180,6 @@ static void aarp_send_reply(struct device *dev, struct at_addr *us, struct at_ad skb_reserve(skb,dev->hard_header_len+aarp_dl->header_length); eah = (struct elapaarp *)skb_put(skb,sizeof(struct elapaarp)); - skb->arp = 1; skb->dev = dev; /* @@ -243,7 +241,6 @@ void aarp_send_probe(struct device *dev, struct at_addr *us) skb_reserve(skb,dev->hard_header_len+aarp_dl->header_length); eah = (struct elapaarp *)skb_put(skb,sizeof(struct elapaarp)); - skb->arp = 1; skb->dev = dev; /* @@ -666,7 +663,7 @@ static int 
aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if(dev->type!=ARPHRD_ETHER) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -676,7 +673,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if(!skb_pull(skb,sizeof(*ea))) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -689,7 +686,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if(ea->function<AARP_REQUEST || ea->function > AARP_PROBE || ea->hw_len != ETH_ALEN || ea->pa_len != AARP_PA_ALEN || ea->pa_src_zero != 0 || ea->pa_dst_zero != 0) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -720,7 +717,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if(ifa==NULL) { restore_flags(flags); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 1; } if(ifa->status&ATIF_PROBE) @@ -733,7 +730,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type ifa->status|=ATIF_PROBE_FAIL; restore_flags(flags); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 1; } } @@ -792,7 +789,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type break; } restore_flags(flags); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 1; } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 335f17e16..8b724361d 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1258,7 +1258,7 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type /* Size check */ if(skb->len < sizeof(*ddp)) { - kfree_skb(skb,FREE_READ); + kfree_skb(skb); return (0); } @@ -1289,7 +1289,7 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type */ if(skb->len < sizeof(*ddp)) { - kfree_skb(skb,FREE_READ); + kfree_skb(skb); return (0); } @@ -1300,13 +1300,13 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if(ddp->deh_sum && atalk_checksum(ddp, 
ddp->deh_len) != ddp->deh_sum) { /* Not a valid appletalk frame - dustbin time */ - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return (0); } if(call_in_firewall(AF_APPLETALK, skb->dev, ddp, NULL,&skb)!=FW_ACCEPT) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return (0); } @@ -1331,7 +1331,7 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type */ if (skb->pkt_type != PACKET_HOST || ddp->deh_dnet == 0) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return (0); } @@ -1340,7 +1340,7 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type */ if(call_fw_firewall(AF_APPLETALK, skb->dev, ddp, NULL, &skb) != FW_ACCEPT) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return (0); } @@ -1351,7 +1351,7 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type rt = atrtr_find(&ta); if(rt == NULL || ddp->deh_hops == DDP_MAXHOPS) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return (0); } ddp->deh_hops++; @@ -1389,7 +1389,7 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type /* 22 bytes - 12 ether, 2 len, 3 802.2 5 snap */ skb = skb_realloc_headroom(skb, 32); else - skb = skb_unshare(skb, GFP_ATOMIC, FREE_READ); + skb = skb_unshare(skb, GFP_ATOMIC); /* * If the buffer didn't vanish into the lack of @@ -1397,9 +1397,8 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type */ if(skb) { - skb->arp = 1; /* Resolved */ if(aarp_send_ddp(rt->dev, skb, &ta, NULL) == -1) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } return (0); @@ -1417,7 +1416,7 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if(sock == NULL) /* But not one of our sockets */ { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return (0); } @@ -1462,7 +1461,7 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if(sock_queue_rcv_skb(sock, skb) < 0) { skb->sk = NULL; - kfree_skb(skb, FREE_WRITE); + 
kfree_skb(skb); } return (0); @@ -1490,7 +1489,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type ap = atalk_find_dev_addr(dev); if(ap == NULL || skb->len < sizeof(struct ddpshdr)) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return (0); } @@ -1621,7 +1620,6 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, int len, return (err); skb->sk = sk; - skb->arp = 1; skb_reserve(skb, ddp_dl->header_length); skb_reserve(skb, dev->hard_header_len); @@ -1652,7 +1650,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, int len, err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); if(err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return (-EFAULT); } @@ -1663,7 +1661,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, int len, if(call_out_firewall(AF_APPLETALK, skb->dev, ddp, NULL, &skb) != FW_ACCEPT) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return (-EPERM); } @@ -1681,7 +1679,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, int len, loopback = 1; SOCK_DEBUG(sk, "SK %p: send out(copy).\n", sk); if(aarp_send_ddp(dev, skb2, &usat->sat_addr, NULL) == -1) - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); /* else queued/sent above in the aarp queue */ } } @@ -1709,7 +1707,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, int len, } if(aarp_send_ddp(dev, skb, &usat->sat_addr, NULL) == -1) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); /* else queued/sent above in the aarp queue */ } SOCK_DEBUG(sk, "SK %p: Done write (%d).\n", sk, len); diff --git a/net/ax25/.cvsignore b/net/ax25/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/ax25/.cvsignore +++ b/net/ax25/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/ax25/Config.in b/net/ax25/Config.in new file mode 100644 index 000000000..6bf4a9ead --- /dev/null +++ b/net/ax25/Config.in @@ -0,0 +1,32 @@ +# +# Amateur Radio protocols and AX.25 device configuration +# 
+# 19971130 Now in an own category to make correct compilation of the +# AX.25 stuff easier... +# Joerg Reuter DL1BKE <jreuter@poboxes.com> +# 19980129 Moved to net/ax25/Config.in, sourcing device drivers. + +mainmenu_option next_comment +comment 'Amateur Radio support' +bool 'Amateur Radio support' CONFIG_HAMRADIO + +if [ "$CONFIG_HAMRADIO" != "n" ] ; then + if [ "$CONFIG_NET" != "n" ] ; then + comment 'Packet Radio protocols' + tristate 'Amateur Radio AX.25 Level 2 protocol' CONFIG_AX25 + if [ "$CONFIG_AX25" != "n" ]; then + bool ' AX.25 DAMA Slave support' CONFIG_AX25_DAMA_SLAVE +# bool ' AX.25 DAMA Master support' CONFIG_AX25_DAMA_MASTER + dep_tristate ' Amateur Radio NET/ROM protocol' CONFIG_NETROM $CONFIG_AX25 + dep_tristate ' Amateur Radio X.25 PLP (Rose)' CONFIG_ROSE $CONFIG_AX25 + fi + + if [ "$CONFIG_AX25" != "n" ]; then + source drivers/net/hamradio/Config.in + fi + fi + + source drivers/char/hfmodem/Config.in +fi + +endmenu diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c12b9fd13..3a4196b3f 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -368,7 +368,7 @@ void ax25_send_to_raw(struct sock *sk, struct sk_buff *skb, int proto) return; if (sock_queue_rcv_skb(sk, copy) != 0) - kfree_skb(copy, FREE_READ); + kfree_skb(copy); } sk = sk->next; @@ -418,7 +418,7 @@ void ax25_destroy_socket(ax25_cb *ax25) /* Not static as it's used by the timer skb->sk->protinfo.ax25->state = AX25_STATE_0; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } } @@ -1241,7 +1241,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) /* Now attach up the new socket */ skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); sk->ack_backlog--; newsock->sk = newsk; @@ -1385,7 +1385,7 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct if (sk->type == SOCK_SEQPACKET) { /* Connected mode sockets go via the LAPB machine */ if (sk->state != TCP_ESTABLISHED) { - kfree_skb(skb, FREE_WRITE); + 
kfree_skb(skb); return -ENOTCONN; } diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c index 941a41f99..3844f3964 100644 --- a/net/ax25/ax25_ds_subr.c +++ b/net/ax25/ax25_ds_subr.c @@ -155,7 +155,6 @@ static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char p *p++ = cmd; *p++ = param; - skb->arp = 1; skb->dev = ax25_dev->dev; skb->protocol = htons(ETH_P_AX25); diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index a17109bff..79fef3dcb 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -85,7 +85,7 @@ static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb) if (ax25->fragno == 0) { if ((skbn = alloc_skb(AX25_MAX_HEADER_LEN + ax25->fraglen, GFP_ATOMIC)) == NULL) { while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) - kfree_skb(skbo, FREE_READ); + kfree_skb(skbo); return 1; } @@ -97,13 +97,13 @@ static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb) /* Copy data from the fragments */ while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) { memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); - kfree_skb(skbo, FREE_READ); + kfree_skb(skbo); } ax25->fraglen = 0; if (ax25_rx_iframe(ax25, skbn) == 0) - kfree_skb(skbn, FREE_READ); + kfree_skb(skbn); } return 1; @@ -113,7 +113,7 @@ static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb) /* First fragment received */ if (*skb->data & AX25_SEG_FIRST) { while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) - kfree_skb(skbo, FREE_READ); + kfree_skb(skbo); ax25->fragno = *skb->data & AX25_SEG_REM; skb_pull(skb, 1); /* skip fragno */ ax25->fraglen = skb->len; @@ -149,7 +149,7 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) */ struct sk_buff *skbn = skb_copy(skb, GFP_ATOMIC); if (skbn != NULL) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); skb = skbn; } @@ -234,12 +234,12 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a skb->h.raw = skb->data; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { - 
kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } if (call_in_firewall(PF_AX25, skb->dev, skb->h.raw, NULL, &skb) != FW_ACCEPT) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -248,7 +248,7 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a */ if (ax25_addr_parse(skb->data, skb->len, &src, &dest, &dp, &type, &dama) == NULL) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -279,7 +279,7 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a ax25_send_to_raw(raw, skb, skb->data[1]); if (!mine && ax25cmp(&dest, (ax25_address *)dev->broadcast) != 0) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -308,22 +308,22 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a /* Now find a suitable dgram socket */ if ((sk = ax25_find_socket(&dest, &src, SOCK_DGRAM)) != NULL) { if (atomic_read(&sk->rmem_alloc) >= sk->rcvbuf) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } else { /* * Remove the control and PID. */ skb_pull(skb, 2); if (sock_queue_rcv_skb(sk, skb) != 0) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } } else { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } break; default: - kfree_skb(skb, FREE_READ); /* Will scan SOCK_AX25 RAW sockets */ + kfree_skb(skb); /* Will scan SOCK_AX25 RAW sockets */ break; } @@ -336,7 +336,7 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a * silently ignore them. For now we stay quiet. 
*/ if (ax25_dev->values[AX25_VALUES_CONMODE] == 0) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -353,7 +353,7 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a * do no further work */ if (ax25_process_rx_frame(ax25, skb, type, dama) == 0) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -370,7 +370,7 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a if ((*skb->data & ~AX25_PF) != AX25_DM && mine) ax25_return_dm(dev, &src, &dest, &dp); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -384,7 +384,7 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a if (sk != NULL) { if (sk->ack_backlog == sk->max_ack_backlog || (make = ax25_make_new(sk, ax25_dev)) == NULL) { if (mine) ax25_return_dm(dev, &src, &dest, &dp); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -399,13 +399,13 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a sk->ack_backlog++; } else { if (!mine) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } if ((ax25 = ax25_create_cb()) == NULL) { ax25_return_dm(dev, &src, &dest, &dp); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -419,7 +419,7 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a * Sort out any digipeated paths. 
*/ if (dp.ndigi != 0 && ax25->digipeat == NULL && (ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); ax25_destroy_socket(ax25); return 0; } @@ -461,7 +461,7 @@ static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_a if (!sk->dead) sk->data_ready(sk, skb->len); } else { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } return 0; @@ -475,7 +475,7 @@ int ax25_kiss_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *p skb->sk = NULL; /* Initially we don't know who it's for */ if ((*skb->data & 0x0F) != 0) { - kfree_skb(skb, FREE_READ); /* Not a KISS data frame */ + kfree_skb(skb); /* Not a KISS data frame */ return 0; } diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index a50822b90..c285b4641 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -140,14 +140,14 @@ int ax25_rebuild_header(struct sk_buff *skb) * gets fixed. */ if ((ourskb = skb_copy(skb, GFP_ATOMIC)) == NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 1; } if (skb->sk != NULL) skb_set_owner_w(ourskb, skb->sk); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); skb_pull(ourskb, AX25_HEADER_LEN - 1); /* Keep PID */ @@ -169,7 +169,7 @@ int ax25_rebuild_header(struct sk_buff *skb) if (route->digipeat != NULL) { if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 1; } diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 787a645de..71eb5cfc3 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -194,7 +194,7 @@ void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb) skb_queue_tail(&ax25->write_queue, skbn); /* Throw it on the queue */ } - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } else { skb_queue_tail(&ax25->write_queue, skb); /* Throw it on the queue */ } @@ -347,14 +347,14 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) if (skb_headroom(skb) < 
headroom) { if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) { printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n"); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } if (skb->sk != NULL) skb_set_owner_w(skbn, skb->sk); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); skb = skbn; } @@ -376,13 +376,12 @@ void ax25_queue_xmit(struct sk_buff *skb) unsigned char *ptr; if (call_out_firewall(PF_AX25, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } skb->protocol = htons(ETH_P_AX25); skb->dev = ax25_fwd_dev(skb->dev); - skb->arp = 1; ptr = skb_push(skb, 1); *ptr = 0x00; /* KISS */ diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 911b54834..0dedcc88e 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -421,7 +421,7 @@ struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, ax25_ if (skb->sk != NULL) skb_set_owner_w(skbn, skb->sk); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); skb = skbn; } diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 98a977182..948ff4719 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -64,16 +64,16 @@ void ax25_clear_queues(ax25_cb *ax25) struct sk_buff *skb; while ((skb = skb_dequeue(&ax25->write_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&ax25->ack_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&ax25->reseq_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); while ((skb = skb_dequeue(&ax25->frag_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* @@ -91,7 +91,7 @@ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr) if (ax25->va != nr) { while (skb_peek(&ax25->ack_queue) != NULL && ax25->va != nr) { skb = skb_dequeue(&ax25->ack_queue); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); ax25->va = (ax25->va + 1) % ax25->modulus; } } diff --git 
a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index 000203aaf..2a88a9716 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -8,7 +8,6 @@ #include <linux/config.h> #include <linux/mm.h> #include <linux/sysctl.h> -#include <linux/config.h> #include <net/ax25.h> static int min_ipdefmode[] = {0}, max_ipdefmode[] = {1}; diff --git a/net/bridge/.cvsignore b/net/bridge/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/bridge/.cvsignore +++ b/net/bridge/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/bridge/br.c b/net/bridge/br.c index b68751dd8..2961ff3c6 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -1000,16 +1000,16 @@ static int hold_timer_expired(int port_no) static int send_config_bpdu(int port_no, Config_bpdu *config_bpdu) { -struct sk_buff *skb; -struct device *dev = port_info[port_no].dev; -int size; -unsigned long flags; -struct ethhdr *eth; + struct sk_buff *skb; + struct device *dev = port_info[port_no].dev; + int size; + struct ethhdr *eth; if (port_info[port_no].state == Disabled) { printk(KERN_DEBUG "send_config_bpdu: port %i not valid\n",port_no); return(-1); - } + } + if (br_stats.flags & BR_DEBUG) printk("send_config_bpdu: "); /* @@ -1017,10 +1017,11 @@ struct ethhdr *eth; */ size = dev->hard_header_len + sizeof(Config_bpdu); skb = alloc_skb(size, GFP_ATOMIC); - if (skb == NULL) { + if (skb == NULL) + { printk(KERN_DEBUG "send_config_bpdu: no skb available\n"); return(-1); - } + } skb->dev = dev; skb->mac.raw = skb->h.raw = skb_put(skb, size); eth = skb->mac.ethernet; @@ -1049,21 +1050,17 @@ struct ethhdr *eth; /* won't get bridged again... */ skb->pkt_bridged = IS_BRIDGED; - skb->arp = 1; /* do not resolve... 
*/ - save_flags(flags); - cli(); - skb_queue_tail(dev->buffs, skb); - restore_flags(flags); + skb->dev=dev; + dev_queue_xmit(skb); return(0); } static int send_tcn_bpdu(int port_no, Tcn_bpdu *bpdu) { -struct sk_buff *skb; -struct device *dev = port_info[port_no].dev; -int size; -unsigned long flags; -struct ethhdr *eth; + struct sk_buff *skb; + struct device *dev = port_info[port_no].dev; + int size; + struct ethhdr *eth; if (port_info[port_no].state == Disabled) { printk(KERN_DEBUG "send_tcn_bpdu: port %i not valid\n",port_no); @@ -1105,11 +1102,8 @@ struct ethhdr *eth; /* mark that we've been here... */ skb->pkt_bridged = IS_BRIDGED; - skb->arp = 1; /* do not resolve... */ - save_flags(flags); - cli(); - skb_queue_tail(dev->buffs, skb); - restore_flags(flags); + skb->dev=dev; + dev_queue_xmit(skb); return(0); } @@ -1199,7 +1193,6 @@ int br_receive_frame(struct sk_buff *skb) /* 3.5 */ port = find_port(skb->dev); - skb->arp = 1; /* Received frame so it is resolved */ skb->h.raw = skb->mac.raw; eth = skb->mac.ethernet; if (br_stats.flags & BR_DEBUG) @@ -1393,7 +1386,7 @@ static int br_learn(struct sk_buff *skb, int port) /* 3.8 */ static int br_drop(struct sk_buff *skb) { - kfree_skb(skb, 0); + kfree_skb(skb); return(1); } @@ -1403,7 +1396,7 @@ static int br_drop(struct sk_buff *skb) static int br_dev_drop(struct sk_buff *skb) { - dev_kfree_skb(skb, 0); + dev_kfree_skb(skb); return(1); } @@ -1519,7 +1512,6 @@ static int br_flood(struct sk_buff *skb, int port) nskb->dev= port_info[i].dev; /* To get here we must have done ARP already, or have a received valid MAC header */ - nskb->arp = 1; /* printk("Flood to port %d\n",i);*/ nskb->h.raw = nskb->data + ETH_HLEN; diff --git a/net/core/.cvsignore b/net/core/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/core/.cvsignore +++ b/net/core/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/core/Makefile b/net/core/Makefile index 2ae776157..fc9dc31c4 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ 
-10,12 +10,16 @@ O_TARGET := core.o O_OBJS := sock.o skbuff.o iovec.o datagram.o dst.o scm.o \ - neighbour.o rtnetlink.o + neighbour.o rtnetlink.o utils.o ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_core.o endif +ifdef CONFIG_FILTER +O_OBJS += filter.o +endif + ifdef CONFIG_NET O_OBJS += dev.o dev_mcast.o @@ -26,6 +30,10 @@ endif endif +ifdef CONFIG_NET_PROFILE +OX_OBJS += profile.o +endif + include $(TOPDIR)/Rules.make tar: diff --git a/net/core/datagram.c b/net/core/datagram.c index cd6e95000..cdab70aba 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -153,7 +153,7 @@ no_packet: void skb_free_datagram(struct sock * sk, struct sk_buff *skb) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); release_sock(sk); } @@ -195,12 +195,12 @@ int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to, * is only ever holding data ready to receive. */ -unsigned int datagram_poll(struct socket *sock, poll_table *wait) +unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; unsigned int mask; - poll_wait(sk->sleep, wait); + poll_wait(file, sk->sleep, wait); mask = 0; /* exceptional events? */ diff --git a/net/core/dev.c b/net/core/dev.c index 8d94f6817..b06d0053e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -48,6 +48,8 @@ * 1 device. * Thomas Bogendoerfer : Return ENODEV for dev_open, if there * is no device open function. 
+ * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF * */ @@ -75,11 +77,11 @@ #include <linux/proc_fs.h> #include <linux/stat.h> #include <net/br.h> +#include <net/dst.h> #include <net/pkt_sched.h> +#include <net/profile.h> #include <linux/init.h> -#ifdef CONFIG_KERNELD #include <linux/kerneld.h> -#endif #ifdef CONFIG_NET_RADIO #include <linux/wireless.h> #endif /* CONFIG_NET_RADIO */ @@ -87,6 +89,10 @@ extern int plip_init(void); #endif +NET_PROFILE_DEFINE(dev_queue_xmit) +NET_PROFILE_DEFINE(net_bh) +NET_PROFILE_DEFINE(net_bh_skb) + const char *if_port_text[] = { "unknown", @@ -141,6 +147,13 @@ static struct notifier_block *netdev_chain=NULL; static struct sk_buff_head backlog; +#ifdef CONFIG_NET_FASTROUTE +int netdev_fastroute; +int netdev_fastroute_obstacles; +struct net_fastroute_stats dev_fastroute_stat; +#endif + + /****************************************************************************************** Protocol management and registration routines @@ -162,6 +175,13 @@ int netdev_nit=0; void dev_add_pack(struct packet_type *pt) { int hash; +#ifdef CONFIG_NET_FASTROUTE + /* Hack to detect packet socket */ + if (pt->data) { + netdev_fastroute_obstacles++; + dev_clear_fastroute(pt->dev); + } +#endif if(pt->type==htons(ETH_P_ALL)) { netdev_nit++; @@ -196,6 +216,10 @@ void dev_remove_pack(struct packet_type *pt) if(pt==(*pt1)) { *pt1=pt->next; +#ifdef CONFIG_NET_FASTROUTE + if (pt->data) + netdev_fastroute_obstacles--; +#endif return; } } @@ -296,17 +320,20 @@ struct device *dev_alloc(const char *name, int *err) void dev_load(const char *name) { - if(!dev_get(name)) + if(!dev_get(name) && suser()) request_module(name); } +#else + +extern inline void dev_load(const char *unused){;} + #endif -static int -default_rebuild_header(struct sk_buff *skb) +static int default_rebuild_header(struct sk_buff *skb) { - printk(KERN_DEBUG "%s: !skb->arp & !rebuild_header -- BUG!\n", skb->dev->name); - kfree_skb(skb, 
FREE_WRITE); + printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", skb->dev ? skb->dev->name : "NULL!!!"); + kfree_skb(skb); return 1; } @@ -370,6 +397,24 @@ int dev_open(struct device *dev) return(ret); } +#ifdef CONFIG_NET_FASTROUTE +void dev_clear_fastroute(struct device *dev) +{ + int i; + + if (dev) { + for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) + dst_release(xchg(dev->fastpath+i, NULL)); + } else { + for (dev = dev_base; dev; dev = dev->next) { + if (dev->accept_fastpath) { + for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) + dst_release(xchg(dev->fastpath+i, NULL)); + } + } + } +} +#endif /* * Completely shutdown an interface. @@ -400,6 +445,9 @@ int dev_close(struct device *dev) */ dev->flags&=~(IFF_UP|IFF_RUNNING); +#ifdef CONFIG_NET_FASTROUTE + dev_clear_fastroute(dev); +#endif /* * Tell people we are going down @@ -488,7 +536,9 @@ void dev_loopback_xmit(struct sk_buff *skb) if (newskb==NULL) return; + newskb->mac.raw = newskb->data; skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; if (newskb->dst==NULL) printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); @@ -500,24 +550,23 @@ int dev_queue_xmit(struct sk_buff *skb) struct device *dev = skb->dev; struct Qdisc *q; - /* - * If the address has not been resolved. Call the device header rebuilder. - * This can cover all protocols and technically not just ARP either. - * - * This call must be moved to protocol layer. - * Now it works only for IPv6 and for IPv4 in - * some unusual curcumstances (eql device). 
--ANK - */ - - if (!skb->arp && dev->rebuild_header(skb)) - return 0; +#ifdef CONFIG_NET_PROFILE + start_bh_atomic(); + NET_PROFILE_ENTER(dev_queue_xmit); +#endif + start_bh_atomic(); q = dev->qdisc; if (q->enqueue) { - start_bh_atomic(); q->enqueue(skb, q); qdisc_wakeup(dev); end_bh_atomic(); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + return 0; } @@ -530,18 +579,30 @@ int dev_queue_xmit(struct sk_buff *skb) made by us here. */ if (dev->flags&IFF_UP) { - start_bh_atomic(); if (netdev_nit) dev_queue_xmit_nit(skb,dev); if (dev->hard_start_xmit(skb, dev) == 0) { end_bh_atomic(); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + return 0; } if (net_ratelimit()) printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); - end_bh_atomic(); } - kfree_skb(skb, FREE_WRITE); + end_bh_atomic(); + + kfree_skb(skb); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + return 0; } @@ -551,7 +612,74 @@ int dev_queue_xmit(struct sk_buff *skb) =======================================================================*/ int netdev_dropping = 0; +int netdev_max_backlog = 300; atomic_t netdev_rx_dropped; +#ifdef CONFIG_CPU_IS_SLOW +int net_cpu_congestion; +#endif + +#ifdef CONFIG_NET_HW_FLOWCONTROL +int netdev_throttle_events; +static unsigned long netdev_fc_mask = 1; +unsigned long netdev_fc_xoff = 0; + +static struct +{ + void (*stimul)(struct device *); + struct device *dev; +} netdev_fc_slots[32]; + +int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) +{ + int bit = 0; + unsigned long flags; + + save_flags(flags); + cli(); + if (netdev_fc_mask != ~0UL) { + bit = ffz(netdev_fc_mask); + netdev_fc_slots[bit].stimul = stimul; + netdev_fc_slots[bit].dev = dev; + set_bit(bit, &netdev_fc_mask); + clear_bit(bit, &netdev_fc_xoff); + } + sti(); + return bit; +} + +void netdev_unregister_fc(int bit) +{ + unsigned 
long flags; + + save_flags(flags); + cli(); + if (bit > 0) { + netdev_fc_slots[bit].stimul = NULL; + netdev_fc_slots[bit].dev = NULL; + clear_bit(bit, &netdev_fc_mask); + clear_bit(bit, &netdev_fc_xoff); + } + sti(); +} + +static void netdev_wakeup(void) +{ + unsigned long xoff; + + cli(); + xoff = netdev_fc_xoff; + netdev_fc_xoff = 0; + netdev_dropping = 0; + netdev_throttle_events++; + while (xoff) { + int i = ffz(~xoff); + xoff &= ~(1<<i); + netdev_fc_slots[i].stimul(netdev_fc_slots[i].dev); + } + sti(); +} +#endif + /* * Receive a packet from a device driver and queue it for the upper @@ -560,42 +688,45 @@ atomic_t netdev_rx_dropped; void netif_rx(struct sk_buff *skb) { +#ifndef CONFIG_CPU_IS_SLOW if(skb->stamp.tv_sec==0) get_fast_time(&skb->stamp); +#else + skb->stamp = xtime; +#endif - /* - * Check that we aren't overdoing things. + /* The code is rearranged so that the path is the most + short when CPU is congested, but is still operating. */ - if (!backlog.qlen) - netdev_dropping = 0; - else if (backlog.qlen > 300) - netdev_dropping = 1; - - if (netdev_dropping) - { - atomic_inc(&netdev_rx_dropped); - kfree_skb(skb, FREE_READ); + if (backlog.qlen <= netdev_max_backlog) { + if (backlog.qlen) { + if (netdev_dropping == 0) { + skb_queue_tail(&backlog,skb); + mark_bh(NET_BH); + return; + } + atomic_inc(&netdev_rx_dropped); + kfree_skb(skb); + return; + } +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; +#endif + skb_queue_tail(&backlog,skb); + mark_bh(NET_BH); return; } - - /* - * Add it to the "backlog" queue. - */ - - skb_queue_tail(&backlog,skb); - - /* - * If any packet arrived, mark it for processing after the - * hardware interrupt returns. 
- */ - - mark_bh(NET_BH); - return; + netdev_dropping = 1; + atomic_inc(&netdev_rx_dropped); + kfree_skb(skb); } #ifdef CONFIG_BRIDGE -static inline void handle_bridge(struct skbuff *skb, unsigned short type) +static inline void handle_bridge(struct sk_buff *skb, unsigned short type) { if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(type))) { @@ -610,7 +741,7 @@ static inline void handle_bridge(struct skbuff *skb, unsigned short type) if(br_receive_frame(skb)) { sti(); - continue; + return; } /* * Pull the MAC header off for the copy going to @@ -622,9 +753,6 @@ static inline void handle_bridge(struct skbuff *skb, unsigned short type) } #endif -#ifdef CONFIG_CPU_IS_SLOW -int net_cpu_congestion; -#endif /* * When we are called the queue is ready to grab, the interrupts are @@ -649,6 +777,7 @@ void net_bh(void) net_cpu_congestion = ave_busy>>8; #endif + NET_PROFILE_ENTER(net_bh); /* * Can we send anything now? We want to clear the * decks for any more sends that get done as we @@ -677,11 +806,9 @@ void net_bh(void) { struct sk_buff * skb = backlog.next; - if (jiffies - start_time > 1) { - /* Give chance to other bottom halves to run */ - mark_bh(NET_BH); - return; - } + /* Give chance to other bottom halves to run */ + if (jiffies - start_time > 1) + goto net_bh_break; /* * We have a packet. Therefore the queue has shrunk @@ -692,14 +819,24 @@ void net_bh(void) #ifdef CONFIG_CPU_IS_SLOW if (ave_busy > 128*16) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&backlog)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); break; } #endif - + +#if 0 + NET_PROFILE_SKB_PASSED(skb, net_bh_skb); +#endif +#ifdef CONFIG_NET_FASTROUTE + if (skb->pkt_type == PACKET_FASTROUTE) { + dev_queue_xmit(skb); + continue; + } +#endif + /* * Fetch the packet protocol ID. */ @@ -726,6 +863,12 @@ void net_bh(void) /* XXX until we figure out every place to modify.. 
*/ skb->h.raw = skb->nh.raw = skb->data; + if (skb->mac.raw < skb->head || skb->mac.raw > skb->data) { + printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x\n", skb->dev->name, skb->protocol); + kfree_skb(skb); + continue; + } + /* * We got a packet ID. Now loop over the "known protocols" * list. There are two lists. The ptype_all list of taps (normally empty) @@ -784,7 +927,7 @@ void net_bh(void) */ else { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } } /* End of queue loop */ @@ -800,23 +943,36 @@ void net_bh(void) qdisc_run_queues(); #ifdef CONFIG_CPU_IS_SLOW -{ - unsigned long start_idle = jiffies; - ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); - start_busy = 0; -} + if (1) { + unsigned long start_idle = jiffies; + ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); + start_busy = 0; + } +#endif +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; #endif + NET_PROFILE_LEAVE(net_bh); + return; + +net_bh_break: + mark_bh(NET_BH); + NET_PROFILE_LEAVE(net_bh); + return; } /* Protocol dependent address dumping routines */ -static int (*gifconf[NPROTO])(struct device *dev, char *bufptr, int len); +static gifconf_func_t * gifconf_list [NPROTO]; -int register_gifconf(int family, int (*func)(struct device *dev, char *bufptr, int len)) +int register_gifconf(unsigned int family, gifconf_func_t * gifconf) { - if (family<0 || family>=NPROTO) + if (family>=NPROTO) return -EINVAL; - gifconf[family] = func; + gifconf_list[family] = gifconf; return 0; } @@ -903,58 +1059,53 @@ static int dev_ifconf(char *arg) struct ifconf ifc; struct device *dev; char *pos; - unsigned int len; - int err; + int len; + int total; + int i; /* * Fetch the caller's info block. 
*/ - err = copy_from_user(&ifc, arg, sizeof(struct ifconf)); - if (err) + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) return -EFAULT; pos = ifc.ifc_buf; - if (pos==NULL) - ifc.ifc_len=0; len = ifc.ifc_len; /* * Loop over the interfaces, and write an info block for each. */ + total = 0; for (dev = dev_base; dev != NULL; dev = dev->next) { - int i; for (i=0; i<NPROTO; i++) { - int done; - - if (gifconf[i] == NULL) - continue; - - done = gifconf[i](dev, pos, len); - - if (done<0) - return -EFAULT; - - len -= done; - if (pos) - pos += done; + if (gifconf_list[i]) { + int done; + if (pos==NULL) { + done = gifconf_list[i](dev, NULL, 0); + } else { + done = gifconf_list[i](dev, pos+total, len-total); + } + if (done<0) + return -EFAULT; + total += done; + } } } /* * All done. Write the updated control block back to the caller. */ - ifc.ifc_len -= len; + ifc.ifc_len = total; if (copy_to_user(arg, &ifc, sizeof(struct ifconf))) return -EFAULT; - /* - * Report how much was filled in + /* + * Both BSD and Solaris return 0 here, so we do too. 
*/ - - return ifc.ifc_len; + return 0; } /* @@ -1006,7 +1157,7 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy size = sprintf(buffer, "Inter-| Receive | Transmit\n" - " face |bytes packets errs drop fifo frame|bytes packets errs drop fifo colls carrier\n"); + " face |bytes packets errs drop fifo frame|bytes packets errs drop fifo colls carrier multicast\n"); pos+=size; len+=size; @@ -1033,6 +1184,41 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy len=length; /* Ending slop */ return len; } + +static int dev_proc_stats(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len; + + len = sprintf(buffer, "%08x %08x %08x %08x %08x\n", + atomic_read(&netdev_rx_dropped), +#ifdef CONFIG_NET_HW_FLOWCONTROL + netdev_throttle_events, +#else + 0, +#endif +#ifdef CONFIG_NET_FASTROUTE + dev_fastroute_stat.hits, + dev_fastroute_stat.succeed, + dev_fastroute_stat.deferred +#else + 0, 0, 0 +#endif + ); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + *eof = 1; + + return len; +} + #endif /* CONFIG_PROC_FS */ @@ -1125,9 +1311,16 @@ void dev_set_promiscuity(struct device *dev, int inc) if ((dev->promiscuity += inc) == 0) dev->flags &= ~IFF_PROMISC; if (dev->flags^old_flags) { +#ifdef CONFIG_NET_FASTROUTE + if (dev->flags&IFF_PROMISC) { + netdev_fastroute_obstacles++; + dev_clear_fastroute(dev); + } else + netdev_fastroute_obstacles--; +#endif dev_mc_upload(dev); printk(KERN_INFO "device %s %s promiscuous mode\n", - dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "leaved"); + dev->name, (dev->flags&IFF_PROMISC) ? 
"entered" : "left"); } } @@ -1305,6 +1498,16 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ifr->ifr_ifindex = dev->ifindex; return 0; + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + case SIOCSIFTXQLEN: + if(ifr->ifr_qlen<2 || ifr->ifr_qlen>1024) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + /* * Unknown or private ioctl */ @@ -1339,9 +1542,7 @@ int dev_ioctl(unsigned int cmd, void *arg) { struct ifreq ifr; int ret; -#ifdef CONFIG_NET_ALIAS char *colon; -#endif /* One special case: SIOCGIFCONF takes ifconf argument and requires shared lock, because it sleeps writing @@ -1350,9 +1551,9 @@ int dev_ioctl(unsigned int cmd, void *arg) if (cmd == SIOCGIFCONF) { rtnl_shlock(); - dev_ifconf((char *) arg); + ret = dev_ifconf((char *) arg); rtnl_shunlock(); - return 0; + return ret; } if (cmd == SIOCGIFCOUNT) { return dev_ifcount((unsigned int*)arg); @@ -1366,20 +1567,14 @@ int dev_ioctl(unsigned int cmd, void *arg) ifr.ifr_name[IFNAMSIZ-1] = 0; -#ifdef CONFIG_NET_ALIAS colon = strchr(ifr.ifr_name, ':'); if (colon) *colon = 0; -#endif /* * See which interface the caller is talking about. 
*/ -#ifdef CONFIG_KERNELD - dev_load(ifr.ifr_name); -#endif - switch(cmd) { /* @@ -1396,9 +1591,15 @@ int dev_ioctl(unsigned int cmd, void *arg) case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(ifr.ifr_name); ret = dev_ifsioc(&ifr, cmd); - if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + } return ret; /* @@ -1417,8 +1618,10 @@ int dev_ioctl(unsigned int cmd, void *arg) case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: if (!suser()) return -EPERM; + dev_load(ifr.ifr_name); rtnl_lock(); ret = dev_ifsioc(&ifr, cmd); rtnl_unlock(); @@ -1439,6 +1642,7 @@ int dev_ioctl(unsigned int cmd, void *arg) default: if (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15) { + dev_load(ifr.ifr_name); rtnl_lock(); ret = dev_ifsioc(&ifr, cmd); rtnl_unlock(); @@ -1448,6 +1652,7 @@ int dev_ioctl(unsigned int cmd, void *arg) } #ifdef CONFIG_NET_RADIO if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + dev_load(ifr.ifr_name); if (IW_IS_SET(cmd)) { if (!suser()) return -EPERM; @@ -1466,7 +1671,7 @@ int dev_ioctl(unsigned int cmd, void *arg) } } -int dev_new_index() +int dev_new_index(void) { static int ifindex; for (;;) { @@ -1534,6 +1739,10 @@ int unregister_netdevice(struct device *dev) if (dev->flags & IFF_UP) dev_close(dev); +#ifdef CONFIG_NET_FASTROUTE + dev_clear_fastroute(dev); +#endif + /* Shutdown queueing discipline. 
*/ dev_shutdown(dev); @@ -1579,11 +1788,10 @@ extern void sdla_setup(void); extern void dlci_setup(void); extern int dmascc_init(void); extern int sm_init(void); -extern int baycom_ser_fdx_init(void); -extern int baycom_ser_hdx_init(void); -extern int baycom_par_init(void); +extern int baycom_init(void); extern int lapbeth_init(void); extern void arcnet_init(void); +extern void ip_auto_config(void); #ifdef CONFIG_PROC_FS static struct proc_dir_entry proc_net_dev = { @@ -1649,14 +1857,8 @@ __initfunc(int net_dev_init(void)) #if defined(CONFIG_SDLA) sdla_setup(); #endif -#if defined(CONFIG_BAYCOM_PAR) - baycom_par_init(); -#endif -#if defined(CONFIG_BAYCOM_SER_FDX) - baycom_ser_fdx_init(); -#endif -#if defined(CONFIG_BAYCOM_SER_HDX) - baycom_ser_hdx_init(); +#if defined(CONFIG_BAYCOM) + baycom_init(); #endif #if defined(CONFIG_SOUNDMODEM) sm_init(); @@ -1680,7 +1882,14 @@ __initfunc(int net_dev_init(void)) slhc_install(); #endif - +#ifdef CONFIG_NET_PROFILE + net_profile_init(); + NET_PROFILE_REGISTER(dev_queue_xmit); + NET_PROFILE_REGISTER(net_bh); +#if 0 + NET_PROFILE_REGISTER(net_bh_skb); +#endif +#endif /* * Add the devices. * If the call to dev->init fails, the dev is removed @@ -1711,6 +1920,10 @@ __initfunc(int net_dev_init(void)) #ifdef CONFIG_PROC_FS proc_net_register(&proc_net_dev); + { + struct proc_dir_entry *ent = create_proc_entry("net/dev_stat", 0, 0); + ent->read_proc = dev_proc_stats; + } #endif #ifdef CONFIG_NET_RADIO @@ -1723,6 +1936,8 @@ __initfunc(int net_dev_init(void)) dev_boot_phase = 0; + dev_mcast_init(); + #ifdef CONFIG_IP_PNP ip_auto_config(); #endif diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index eaa1bd058..a724497e0 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -19,7 +19,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ - + +#include <linux/config.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -37,6 +38,8 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/init.h> #include <net/ip.h> #include <net/route.h> #include <linux/skbuff.h> @@ -52,6 +55,9 @@ * that a casual user application can add/delete multicasts used by * protocols without doing damage to the protocols when it deletes the * entries. It also helps IP as it tracks overlapping maps. + * + * BUGGGG! IPv6 calls dev_mac_add/delete from BH, it means + * that all the functions in this file are racy. [NOT FIXED] --ANK */ @@ -82,64 +88,81 @@ void dev_mc_upload(struct device *dev) * Delete a device level multicast */ -void dev_mc_delete(struct device *dev, void *addr, int alen, int all) +int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl) { - struct dev_mc_list **dmi; + struct dev_mc_list *dmi, **dmip; - for(dmi=&dev->mc_list;*dmi!=NULL;dmi=&(*dmi)->next) - { + for (dmip=&dev->mc_list; (dmi=*dmip)!=NULL; dmip=&dmi->next) { /* * Find the entry we want to delete. The device could * have variable length entries so check these too. */ - if(memcmp((*dmi)->dmi_addr,addr,(*dmi)->dmi_addrlen)==0 && alen==(*dmi)->dmi_addrlen) - { - struct dev_mc_list *tmp= *dmi; - if(--(*dmi)->dmi_users && !all) - return; + if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && alen==dmi->dmi_addrlen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 0; + if (old_glbl == 0) + return -ENOENT; + } + if(--dmi->dmi_users) + return 0; + /* * Last user. So delete the entry. */ - *dmi=(*dmi)->next; + *dmip = dmi->next; dev->mc_count--; - kfree_s(tmp,sizeof(*tmp)); + kfree_s(dmi,sizeof(*dmi)); /* * We have altered the list, so the card * loaded filter is now wrong. 
Fix it */ dev_mc_upload(dev); - return; + return 0; } } + return -ENOENT; } /* * Add a device level multicast */ -void dev_mc_add(struct device *dev, void *addr, int alen, int newonly) +int dev_mc_add(struct device *dev, void *addr, int alen, int glbl) { struct dev_mc_list *dmi; - for(dmi=dev->mc_list;dmi!=NULL;dmi=dmi->next) - { - if(memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) - { - if(!newonly) - dmi->dmi_users++; - return; + for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) { + if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 1; + if (old_glbl) + return 0; + } + dmi->dmi_users++; + return 0; } } - dmi=(struct dev_mc_list *)kmalloc(sizeof(*dmi),GFP_KERNEL); - if(dmi==NULL) - return; /* GFP_KERNEL so can't happen anyway */ + + /* GFP_ATOMIC!! It is used by IPv6 from interrupt, + when new address arrives. + + Particularly, it means that this part of code is weirdly + racy, and needs numerous *_bh_atomic --ANK + */ + dmi=(struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC); + if (dmi==NULL) + return -ENOBUFS; memcpy(dmi->dmi_addr, addr, alen); dmi->dmi_addrlen=alen; dmi->next=dev->mc_list; dmi->dmi_users=1; + dmi->dmi_gusers=glbl ? 1 : 0; dev->mc_list=dmi; dev->mc_count++; dev_mc_upload(dev); + return 0; } /* @@ -148,13 +171,64 @@ void dev_mc_add(struct device *dev, void *addr, int alen, int newonly) void dev_mc_discard(struct device *dev) { - while(dev->mc_list!=NULL) - { + while (dev->mc_list!=NULL) { struct dev_mc_list *tmp=dev->mc_list; - dev->mc_list=dev->mc_list->next; - if (tmp->dmi_users) + dev->mc_list=tmp->next; + if (tmp->dmi_users > tmp->dmi_gusers) printk("dev_mc_discard: multicast leakage! 
dmi_users=%d\n", tmp->dmi_users); kfree_s(tmp,sizeof(*tmp)); } dev->mc_count=0; } + +#ifdef CONFIG_PROC_FS +static int dev_mc_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0, begin=0; + struct dev_mc_list *m; + int len=0; + struct device *dev; + + for (dev = dev_base; dev; dev = dev->next) { + for (m = dev->mc_list; m; m = m->next) { + int i; + + len += sprintf(buffer+len,"%-4d %-15s %-5d %-5d ", dev->ifindex, dev->name, + m->dmi_users, m->dmi_gusers); + + for (i=0; i<m->dmi_addrlen; i++) + len += sprintf(buffer+len, "%02x", m->dmi_addr[i]); + + len+=sprintf(buffer+len, "\n"); + + pos=begin+len; + if (pos < offset) { + len=0; + begin=pos; + } + if (pos > offset+length) + goto done; + } + } + *eof = 1; + +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} +#endif + +__initfunc(void dev_mcast_init(void)) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; + + ent = create_proc_entry("net/dev_mcast", 0, 0); + ent->read_proc = dev_mc_read_proc; +#endif +} + diff --git a/net/core/dst.c b/net/core/dst.c index 8ebdb0bb5..e94ef2967 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -58,38 +58,43 @@ static void dst_run_gc(unsigned long dummy) dst_gc_timer_inc += DST_GC_INC; dst_gc_timer.expires = jiffies + dst_gc_timer_expires; #if RT_CACHE_DEBUG >= 2 - printk("dst_total: %d/%d/%d %ld\n", - atomic_read(&dst_total), delayed, - atomic_read(&hh_count), dst_gc_timer_expires); + printk("dst_total: %d/%d %ld\n", + atomic_read(&dst_total), delayed, dst_gc_timer_expires); #endif add_timer(&dst_gc_timer); } static int dst_discard(struct sk_buff *skb) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } static int dst_blackhole(struct sk_buff *skb) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } void * dst_alloc(int size, struct dst_ops * ops) { struct dst_entry * dst; + + if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if 
(ops->gc()) + return NULL; + } dst = kmalloc(size, GFP_ATOMIC); if (!dst) return NULL; memset(dst, 0, size); dst->ops = ops; - atomic_set(&dst->refcnt, 1); + atomic_set(&dst->refcnt, 0); dst->lastuse = jiffies; dst->input = dst_discard; dst->output = dst_blackhole; atomic_inc(&dst_total); + atomic_inc(&ops->entries); return dst; } @@ -108,3 +113,25 @@ void __dst_free(struct dst_entry * dst) } end_bh_atomic(); } + +void dst_destroy(struct dst_entry * dst) +{ + struct neighbour *neigh = dst->neighbour; + struct hh_cache *hh = dst->hh; + + dst->hh = NULL; + if (hh && atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + + if (neigh) { + dst->neighbour = NULL; + neigh_release(neigh); + } + + atomic_dec(&dst->ops->entries); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + atomic_dec(&dst_total); + kfree(dst); +} diff --git a/net/core/filter.c b/net/core/filter.c new file mode 100644 index 000000000..a60d8f1e5 --- /dev/null +++ b/net/core/filter.c @@ -0,0 +1,366 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + * Jay Schulist <Jay.Schulist@spacs.k12.wi.us> + * + * Based on the design of: + * - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/config.h> +#if defined(CONFIG_FILTER) + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_packet.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/filter.h> + +/* + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. skb is the data we are + * filtering, filter is the array of filter instructions, and + * len is the number of filter blocks in the array. + */ + +int sk_run_filter(unsigned char *data, int len, struct sock_filter *filter, int flen) +{ + struct sock_filter *fentry; /* We walk down these */ + u32 A = 0; /* Accumulator */ + u32 X = 0; /* Index Register */ + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + int k; + int pc; + int *t; + + /* + * Process array of filter instructions. 
+ */ + + for(pc = 0; pc < flen; pc++) + { + fentry = &filter[pc]; + if(fentry->code & BPF_X) + t=&X; + else + t=&fentry->k; + + switch(fentry->code) + { + case BPF_ALU|BPF_ADD|BPF_X: + case BPF_ALU|BPF_ADD|BPF_K: + A += *t; + continue; + + case BPF_ALU|BPF_SUB|BPF_X: + case BPF_ALU|BPF_SUB|BPF_K: + A -= *t; + continue; + + case BPF_ALU|BPF_MUL|BPF_X: + case BPF_ALU|BPF_MUL|BPF_K: + A *= *t; + continue; + + case BPF_ALU|BPF_DIV|BPF_X: + case BPF_ALU|BPF_DIV|BPF_K: + if(*t == 0) + return (0); + A /= *t; + continue; + + case BPF_ALU|BPF_AND|BPF_X: + case BPF_ALU|BPF_AND|BPF_K: + A &= *t; + continue; + + case BPF_ALU|BPF_OR|BPF_X: + case BPF_ALU|BPF_OR|BPF_K: + A |= *t; + continue; + + case BPF_ALU|BPF_LSH|BPF_X: + case BPF_ALU|BPF_LSH|BPF_K: + A <<= *t; + continue; + + case BPF_ALU|BPF_RSH|BPF_X: + case BPF_ALU|BPF_RSH|BPF_K: + A >>= *t; + continue; + + case BPF_ALU|BPF_NEG: + A = -A; + continue; + + case BPF_JMP|BPF_JA: + pc += fentry->k; + continue; + + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? 
fentry->jt : fentry->jf; + continue; + case BPF_LD|BPF_W|BPF_ABS: + k = fentry->k; + if(k + sizeof(long) > len) + return (0); + A = ntohl(*(long*)&data[k]); + continue; + + case BPF_LD|BPF_H|BPF_ABS: + k = fentry->k; + if(k + sizeof(short) > len) + return (0); + A = ntohs(*(short*)&data[k]); + continue; + + case BPF_LD|BPF_B|BPF_ABS: + k = fentry->k; + if(k >= len) + return (0); + A = data[k]; + continue; + + case BPF_LD|BPF_W|BPF_LEN: + A = len; + continue; + + case BPF_LDX|BPF_W|BPF_LEN: + X = len; + continue; + + case BPF_LD|BPF_W|BPF_IND: + k = X + fentry->k; + if(k + sizeof(u32) > len) + return (0); + A = ntohl(*(u32 *)&data[k]); + continue; + + case BPF_LD|BPF_H|BPF_IND: + k = X + fentry->k; + if(k + sizeof(u16) > len) + return (0); + A = ntohs(*(u16*)&data[k]); + continue; + + case BPF_LD|BPF_B|BPF_IND: + k = X + fentry->k; + if(k >= len) + return (0); + A = data[k]; + continue; + + case BPF_LDX|BPF_B|BPF_MSH: + /* + * Hack for BPF to handle TOS etc + */ + k = fentry->k; + if(k >= len) + return (0); + X = (data[fentry->k] & 0xf) << 2; + continue; + + case BPF_LD|BPF_IMM: + A = fentry->k; + continue; + + case BPF_LDX|BPF_IMM: + X = fentry->k; + continue; + + case BPF_LD|BPF_MEM: + A = mem[fentry->k]; + continue; + + case BPF_LDX|BPF_MEM: + X = mem[fentry->k]; + continue; + + case BPF_MISC|BPF_TAX: + X = A; + continue; + + case BPF_MISC|BPF_TXA: + A = X; + continue; + + case BPF_RET|BPF_K: + return ((unsigned int)fentry->k); + + case BPF_RET|BPF_A: + return ((unsigned int)A); + + case BPF_ST: + mem[fentry->k] = A; + continue; + + case BPF_STX: + mem[fentry->k] = X; + continue; + + + + default: + /* Invalid instruction counts as RET */ + return (0); + } + } + + printk(KERN_ERR "Filter ruleset ran off the end.\n"); + return (0); +} + +/* + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! 
+ */ + +int sk_chk_filter(struct sock_filter *filter, int flen) +{ + struct sock_filter *ftest; + int pc; + + /* + * Check the filter code now. + */ + for(pc = 0; pc < flen; pc++) + { + /* + * All jumps are forward as they are not signed + */ + + ftest = &filter[pc]; + if(BPF_CLASS(ftest->code) == BPF_JMP) + { + /* + * But they mustn't jump off the end. + */ + if(BPF_OP(ftest->code) == BPF_JA) + { + if(pc + ftest->k + 1>= (unsigned)flen) + return (-EINVAL); + } + else + { + /* + * For conditionals both must be safe + */ + if(pc + ftest->jt +1 >= flen || pc + ftest->jf +1 >= flen) + return (-EINVAL); + } + } + + /* + * Check that memory operations use valid addresses. + */ + + if(ftest->k <0 || ftest->k >= BPF_MEMWORDS) + { + /* + * But it might not be a memory operation... + */ + + if (BPF_CLASS(ftest->code) == BPF_ST) + return -EINVAL; + if((BPF_CLASS(ftest->code) == BPF_LD) && + (BPF_MODE(ftest->code) == BPF_MEM)) + return (-EINVAL); + } + } + + /* + * The program must end with a return. We don't care where they + * jumped within the script (its always forwards) but in the + * end they _will_ hit this. + */ + + return (BPF_CLASS(filter[flen - 1].code) == BPF_RET)?0:-EINVAL; +} + +/* + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. + */ + +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct sock_filter *fp, *old_filter; + int fsize = sizeof(struct sock_filter) * fprog->len; + int err; + + /* Make sure new filter is there and in the right amounts. 
*/ + if(fprog->filter == NULL || fprog->len == 0 || fsize > BPF_MAXINSNS) + return (-EINVAL); + + if((err = sk_chk_filter(fprog->filter, fprog->len))==0) + { + /* If existing filter, remove it first */ + if(sk->filter) + { + old_filter = sk->filter_data; + kfree_s(old_filter, (sizeof(old_filter) * sk->filter)); + sk->filter_data = NULL; + } + + fp = (struct sock_filter *)kmalloc(fsize, GFP_KERNEL); + if(fp == NULL) + return (-ENOMEM); + + memset(fp,0,sizeof(*fp)); + memcpy(fp, fprog->filter, fsize); /* Copy instructions */ + + sk->filter = fprog->len; /* Number of filter blocks */ + sk->filter_data = fp; /* Filter instructions */ + } + + return (err); +} +#endif /* CONFIG_FILTER */ diff --git a/net/core/firewall.c b/net/core/firewall.c index 44e0709cf..5d685b0d2 100644 --- a/net/core/firewall.c +++ b/net/core/firewall.c @@ -6,7 +6,6 @@ * much hacked by: Alan Cox */ -#include <linux/config.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/firewall.h> diff --git a/net/core/iovec.c b/net/core/iovec.c index bff328b19..18a9a3b5b 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -26,13 +26,7 @@ #include <linux/in6.h> #include <asm/uaccess.h> #include <asm/byteorder.h> -#include <asm/checksum.h> - -extern inline int min(int x, int y) -{ - return x>y?y:x; -} - +#include <net/checksum.h> /* * Verify iovec @@ -44,9 +38,8 @@ extern inline int min(int x, int y) int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) { - int err=0; - int len=0; - int ct; + int size = m->msg_iovlen * sizeof(struct iovec); + int err, ct; if(m->msg_namelen) { @@ -54,7 +47,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) { err=move_addr_to_kernel(m->msg_name, m->msg_namelen, address); if(err<0) - return err; + goto out; } m->msg_name = address; @@ -63,24 +56,26 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) if (m->msg_iovlen > UIO_FASTIOV) { - iov = 
kmalloc(m->msg_iovlen*sizeof(struct iovec), GFP_KERNEL); + err = -ENOMEM; + iov = kmalloc(size, GFP_KERNEL); if (!iov) - return -ENOMEM; + goto out; } - err = copy_from_user(iov, m->msg_iov, sizeof(struct iovec)*m->msg_iovlen); - if (err) - { - if (m->msg_iovlen > UIO_FASTIOV) - kfree(iov); - return -EFAULT; - } + if (copy_from_user(iov, m->msg_iov, size)) + goto out_free; + m->msg_iov=iov; - for(ct=0;ct<m->msg_iovlen;ct++) - len+=iov[ct].iov_len; + for (err = 0, ct = 0; ct < m->msg_iovlen; ct++) + err += iov[ct].iov_len; +out: + return err; - m->msg_iov=iov; - return len; +out_free: + err = -EFAULT; + if (m->msg_iovlen > UIO_FASTIOV) + kfree(iov); + goto out; } /* @@ -89,15 +84,15 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) { - int err; + int err = -EFAULT; + while(len>0) { if(iov->iov_len) { - int copy = min(iov->iov_len,len); - err = copy_to_user(iov->iov_base,kdata,copy); - if (err) - return err; + int copy = min(iov->iov_len, len); + if (copy_to_user(iov->iov_base, kdata, copy)) + goto out; kdata+=copy; len-=copy; iov->iov_len-=copy; @@ -105,7 +100,9 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) } iov++; } - return 0; + err = 0; +out: + return err; } /* @@ -114,17 +111,15 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) { - int err; + int err = -EFAULT; + while(len>0) { if(iov->iov_len) { - int copy=min(len,iov->iov_len); - err = copy_from_user(kdata, iov->iov_base, copy); - if (err) - { - return -EFAULT; - } + int copy = min(len, iov->iov_len); + if (copy_from_user(kdata, iov->iov_base, copy)) + goto out; len-=copy; kdata+=copy; iov->iov_base+=copy; @@ -132,7 +127,9 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) } iov++; } - return 0; + err = 0; +out: + return err; } @@ -143,28 +140,23 @@ int 
memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, int len) { - int err; + int err = -EFAULT; + while(offset>0) { if (offset > iov->iov_len) { offset -= iov->iov_len; - } else { - u8 *base; - int copy; + u8 *base = iov->iov_base + offset; + int copy = min(len, iov->iov_len - offset); - base = iov->iov_base + offset; - copy = min(len, iov->iov_len - offset); offset = 0; - err = copy_from_user(kdata, base, copy); - if (err) - { - return -EFAULT; - } + if (copy_from_user(kdata, base, copy)) + goto out; len-=copy; kdata+=copy; } @@ -173,17 +165,17 @@ int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, while (len>0) { - int copy=min(len, iov->iov_len); - err = copy_from_user(kdata, iov->iov_base, copy); - if (err) - { - return -EFAULT; - } + int copy = min(len, iov->iov_len); + + if (copy_from_user(kdata, iov->iov_base, copy)) + goto out; len-=copy; kdata+=copy; iov++; } - return 0; + err = 0; +out: + return err; } /* @@ -206,25 +198,28 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, do { int copy = iov->iov_len - offset; - if (copy >= 0) { + if (copy > 0) { u8 *base = iov->iov_base + offset; /* Normal case (single iov component) is fastly detected */ if (len <= copy) { - *csump = csum_partial_copy_from_user(base, kdata, - len, *csump, &err); - return err; + *csump = csum_and_copy_from_user(base, kdata, + len, *csump, &err); + goto out; } partial_cnt = copy % 4; if (partial_cnt) { copy -= partial_cnt; - err |= copy_from_user(kdata+copy, base+copy, partial_cnt); + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; } - *csump = csum_partial_copy_from_user(base, kdata, - copy, *csump, &err); - + *csump = csum_and_copy_from_user(base, kdata, copy, + *csump, &err); + if (err) + goto out; len -= copy + partial_cnt; kdata += copy + partial_cnt; iov++; @@ -236,19 +231,11 @@ int csum_partial_copy_fromiovecend(unsigned char 
*kdata, csum = *csump; - while (len>0) + while (len > 0) { u8 *base = iov->iov_base; unsigned int copy = min(len, iov->iov_len); - /* FIXME: more sanity checking is needed here, because - * the iovs are copied from the user. - */ - if (base == NULL) { - printk(KERN_DEBUG "%s: iov too short\n",current->comm); - return -EINVAL; - } - /* There is a remnant from previous iov. */ if (partial_cnt) { @@ -256,23 +243,26 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, /* iov component is too short ... */ if (par_len > copy) { - err |= copy_from_user(kdata, base, copy); + if (copy_from_user(kdata, base, copy)) + goto out_fault; + kdata += copy; base += copy; partial_cnt += copy; - kdata += copy; len -= copy; iov++; if (len) continue; - *csump = csum_partial(kdata-partial_cnt, partial_cnt, csum); - return err; + *csump = csum_partial(kdata - partial_cnt, + partial_cnt, csum); + goto out; } - err |= copy_from_user(kdata, base, par_len); - csum = csum_partial(kdata-partial_cnt, 4, csum); + if (copy_from_user(kdata, base, par_len)) + goto out_fault; + csum = csum_partial(kdata - partial_cnt, 4, csum); + kdata += par_len; base += par_len; copy -= par_len; len -= par_len; - kdata += par_len; partial_cnt = 0; } @@ -282,18 +272,31 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, if (partial_cnt) { copy -= partial_cnt; - err |= copy_from_user(kdata+copy, base + copy, partial_cnt); + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; } } - if (copy == 0) + /* Why do we want to break?? There may be more to copy ... */ + if (copy == 0) { +if (len > partial_cnt) +printk("csum_iovec: early break? 
len=%d, partial=%d\n", len, partial_cnt); break; + } - csum = csum_partial_copy_from_user(base, kdata, copy, csum, &err); + csum = csum_and_copy_from_user(base, kdata, copy, csum, &err); + if (err) + goto out; len -= copy + partial_cnt; kdata += copy + partial_cnt; iov++; } *csump = csum; +out: return err; + +out_fault: + err = -EFAULT; + goto out; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 427189234..3de3743e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1,8 +1,9 @@ /* - * Generic address resultion entity + * Generic address resolution entity * * Authors: - * Pedro Roque <roque@di.fc.ul.pt> + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -10,144 +11,293 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/socket.h> #include <linux/sched.h> #include <linux/netdevice.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif #include <net/neighbour.h> +#include <net/dst.h> +#include <linux/rtnetlink.h> +#define NEIGH_DEBUG 1 -static void neigh_purge_send_q(struct neighbour *neigh); +#define NEIGH_PRINTK(x...) printk(x) +#define NEIGH_NOPRINTK(x...) 
do { ; } while(0) +#define NEIGH_PRINTK0 NEIGH_PRINTK +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK -void neigh_table_init(struct neigh_table *tbl, struct neigh_ops *ops, int size) -{ - int bmemlen; +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif - memset(tbl, 0, sizeof(struct neigh_table)); - - tbl->tbl_size = size; - tbl->neigh_ops = ops; - - /* - * This should only be called on initialization - * And interrupts should be on - */ +static void neigh_timer_handler(unsigned long arg); +#ifdef CONFIG_ARPD +static void neigh_app_notify(struct neighbour *n); +#endif - bmemlen = size * sizeof(struct neighbour *); - tbl->hash_buckets = kmalloc(bmemlen, GFP_KERNEL); +static int neigh_glbl_allocs; +static struct neigh_table *neigh_tables; - if (tbl->hash_buckets == NULL) - { - panic("unable to initialize neigh_table"); - } +static int neigh_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really a reasonable choice. 
+ */ - memset(tbl->hash_buckets, 0, bmemlen); +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return (net_random() % base) + (base>>1); } -struct neighbour *neigh_alloc(int size, struct neigh_ops *ops) + +static int neigh_forced_gc(struct neigh_table *tbl) { - struct neighbour *neigh; - - neigh = kmalloc(size, GFP_ATOMIC); - if (neigh == NULL) - { - return NULL; - } + int shrunk = 0; + int i; + + if (atomic_read(&tbl->lock)) + return 0; - memset(neigh, 0, size); + for (i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (atomic_read(&n->refcnt) == 0 && + !(n->nud_state&NUD_PERMANENT)) { + *np = n->next; + n->tbl = NULL; + tbl->entries--; + shrunk = 1; + neigh_destroy(n); + continue; + } + np = &n->next; + } + } - skb_queue_head_init(&neigh->arp_queue); - neigh->ops = ops; - return neigh; + tbl->last_flush = jiffies; + return shrunk; } -void neigh_queue_ins(struct neigh_table *tbl, struct neighbour *neigh) +int neigh_ifdown(struct neigh_table *tbl, struct device *dev) { - struct neighbour *entry, **head; - entry = tbl->request_queue; + int i; - head = &tbl->request_queue; - - for (; entry; entry = entry->next) - { - head = &entry->next; + if (atomic_read(&tbl->lock)) { + NEIGH_PRINTK1("neigh_ifdown: impossible event 1763\n"); + return -EBUSY; + } + + start_bh_atomic(); + for (i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + n->tbl = NULL; + tbl->entries--; + if (atomic_read(&n->refcnt)) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. 
+ */ + if (n->nud_state & NUD_IN_TIMER) + del_timer(&n->timer); + n->parms = &tbl->parms; + skb_queue_purge(&n->arp_queue); + n->output = neigh_blackhole; + if (n->nud_state&NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + NEIGH_PRINTK2("neigh %p is stray.\n", n); + } else + neigh_destroy(n); + } } - *head = neigh; - neigh->next = neigh->prev = NULL; + del_timer(&tbl->proxy_timer); + skb_queue_purge(&tbl->proxy_queue); + end_bh_atomic(); + return 0; } -static struct neighbour *neigh_dequeue(struct neigh_table *tbl) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) { - struct neighbour *neigh; + struct neighbour *n; - if ((neigh = tbl->request_queue)) - { - tbl->request_queue = neigh->next; + if (tbl->entries > tbl->gc_thresh1) { + if (creat < 0) + return NULL; + if (tbl->entries > tbl->gc_thresh2 || + jiffies - tbl->last_flush > 5*HZ) { + if (neigh_forced_gc(tbl) == 0 && + tbl->entries > tbl->gc_thresh3) + return NULL; + } } - return neigh; + + n = kmalloc(tbl->entry_size, GFP_ATOMIC); + if (n == NULL) + return NULL; + + memset(n, 0, tbl->entry_size); + + skb_queue_head_init(&n->arp_queue); + n->updated = n->used = jiffies; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + n->parms = &tbl->parms; + init_timer(&n->timer); + n->timer.function = neigh_timer_handler; + n->timer.data = (unsigned long)n; + tbl->stats.allocs++; + neigh_glbl_allocs++; + return n; } -void neigh_table_ins(struct neigh_table *tbl, struct neighbour *neigh) + +struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev, int creat) { - unsigned int hash_val; - struct neighbour **head; - - hash_val = tbl->neigh_ops->hash(neigh->primary_key) % tbl->tbl_size; - - neigh->tbl = tbl; - - head = &tbl->hash_buckets[hash_val]; - - if (!(*head)) - { - neigh->next = neigh; - neigh->prev = neigh; + struct neighbour *n; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + 
hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>3; + hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK; + + for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + if (dev == n->dev && + memcmp(n->primary_key, pkey, key_len) == 0) { + atomic_inc(&n->refcnt); + return n; + } } - else - { - struct neighbour *prev; - struct neighbour *next; - - next = *head; - prev = next->prev; - + if (!creat) + return NULL; + + n = neigh_alloc(tbl, creat); + if (n == NULL) + return NULL; - neigh->next = next; - neigh->prev = prev; - next->prev = neigh; - prev->next = neigh; + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + + /* Protocol specific setup. */ + if (tbl->constructor && tbl->constructor(n) < 0) { + neigh_destroy(n); + return NULL; } - - *head = neigh; + + /* Device specific setup. */ + if (n->parms && n->parms->neigh_setup && n->parms->neigh_setup(n) < 0) { + neigh_destroy(n); + return NULL; + } + + n->confirmed = jiffies - (n->parms->base_reachable_time<<1); + atomic_set(&n->refcnt, 1); + tbl->entries++; + n->next = tbl->hash_buckets[hash_val]; + tbl->hash_buckets[hash_val] = n; + n->tbl = tbl; + NEIGH_PRINTK2("neigh %p is created.\n", n); + return n; } -struct neighbour * neigh_lookup(struct neigh_table *tbl, void *pkey, - int key_len, struct device *dev) +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev, int creat) { - struct neighbour *neigh, *head; - unsigned int hash_val; - - hash_val = tbl->neigh_ops->hash(pkey) % tbl->tbl_size; - head = tbl->hash_buckets[hash_val]; + struct pneigh_entry *n; + u32 hash_val; + int key_len = tbl->key_len; - neigh = head; + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>4; + hash_val &= PNEIGH_HASHMASK; - if (neigh) - { - do { - if (memcmp(neigh->primary_key, pkey, key_len) == 0) - { - if (!dev || dev == neigh->dev) - return neigh; - } - neigh = neigh->next; + for (n = 
tbl->phash_buckets[hash_val]; n; n = n->next) { + if (memcmp(n->key, pkey, key_len) == 0 && + (n->dev == dev || !n->dev)) + return n; + } + if (!creat) + return NULL; + + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (n == NULL) + return NULL; + + memcpy(n->key, pkey, key_len); + n->dev = dev; - } while (neigh != head); + if (tbl->pconstructor && tbl->pconstructor(n)) { + kfree(n); + return NULL; } - return NULL; + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + return n; +} + + +int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev) +{ + struct pneigh_entry *n, **np; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>4; + hash_val &= PNEIGH_HASHMASK; + + for (np = &tbl->phash_buckets[hash_val]; (n=*np) != NULL; np = &n->next) { + if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev) { + *np = n->next; + if (tbl->pdestructor) + tbl->pdestructor(n); + kfree(n); + return 0; + } + } + return -ENOENT; } /* @@ -156,132 +306,991 @@ struct neighbour * neigh_lookup(struct neigh_table *tbl, void *pkey, */ void neigh_destroy(struct neighbour *neigh) { - if (neigh->tbl) - { - printk(KERN_DEBUG "neigh_destroy: neighbour still in table. 
" - "called from %p\n", __builtin_return_address(0)); + struct hh_cache *hh; + + if (neigh->tbl || atomic_read(&neigh->refcnt)) { + NEIGH_PRINTK1("neigh_destroy: neighbour is use tbl=%p, ref=%d: " + "called from %p\n", neigh->tbl, atomic_read(&neigh->refcnt), __builtin_return_address(0)); + return; } - if (neigh->ops->destructor) - { - (neigh->ops->destructor)(neigh); + if (neigh->nud_state&NUD_IN_TIMER) + del_timer(&neigh->timer); + + while ((hh = neigh->hh) != NULL) { + neigh->hh = hh->hh_next; + hh->hh_next = NULL; + hh->hh_output = neigh_blackhole; + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); } - neigh_purge_send_q(neigh); + if (neigh->ops && neigh->ops->destructor) + (neigh->ops->destructor)(neigh); + + skb_queue_purge(&neigh->arp_queue); + + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + neigh_glbl_allocs--; kfree(neigh); } -void neigh_unlink(struct neighbour *neigh) +/* Neighbour state is suspicious; + disable fast path. + */ +static void neigh_suspect(struct neighbour *neigh) { - struct neigh_table *tbl; - struct neighbour **head; - unsigned int hash_val; - struct neighbour *next, *prev; - - tbl = neigh->tbl; - neigh->tbl = NULL; + struct hh_cache *hh; - hash_val = neigh->ops->hash(neigh->primary_key) % tbl->tbl_size; + NEIGH_PRINTK2("neigh %p is suspecteded.\n", neigh); - head = &tbl->hash_buckets[hash_val]; - tbl->tbl_entries--; + neigh->output = neigh->ops->output; - next = neigh->next; - if (neigh == (*head)) - { - if (next == neigh) - { - *head = NULL; - goto out; - } - *head = next; - } - - prev = neigh->prev; - next->prev = prev; - prev->next = next; - out: - neigh->next = neigh->prev = NULL; + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->output; +} + +/* Neighbour state is OK; + enable fast path. 
+ */ +static void neigh_connect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + + neigh->output = neigh->ops->connected_output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->hh_output; } /* - * Must only be called with an exclusive lock and bh disabled - * + Transitions NUD_STALE <-> NUD_REACHABLE do not occur + when fast path is built: we have no timers associated with + these states, we do not have time to check state when sending. + neigh_periodic_timer checks periodically neigh->confirmed + time and moves NUD_REACHABLE -> NUD_STALE. + + If a routine wants to know TRUE entry state, it calls + neigh_sync before checking state. */ -void ntbl_walk_table(struct neigh_table *tbl, ntbl_examine_t func, - unsigned long filter, int max, void *args) +static void neigh_sync(struct neighbour *n) { + unsigned long now = jiffies; + u8 state = n->nud_state; + + if (state&(NUD_NOARP|NUD_PERMANENT)) + return; + if (state&NUD_REACHABLE) { + if (now - n->confirmed > n->parms->reachable_time) { + n->nud_state = NUD_STALE; + neigh_suspect(n); + } + } else if (state&NUD_VALID) { + if (now - n->confirmed < n->parms->reachable_time) { + if (state&NUD_IN_TIMER) + del_timer(&n->timer); + n->nud_state = NUD_REACHABLE; + neigh_connect(n); + } + } +} + +static void neigh_periodic_timer(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table*)arg; + unsigned long now = jiffies; int i; - if (max == 0) - max = tbl->tbl_size; + if (atomic_read(&tbl->lock)) { + tbl->gc_timer.expires = now + 1*HZ; + add_timer(&tbl->gc_timer); + return; + } + + /* + * periodically recompute ReachableTime from random function + */ + + if (now - tbl->last_rand > 300*HZ) { + struct neigh_parms *p; + tbl->last_rand = now; + for (p=&tbl->parms; p; p = p->next) + p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + } + + for (i=0; i <= NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; - for (i=0; i < 
max; i++) - { - struct neighbour **head; - struct neighbour *entry; + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + unsigned state = n->nud_state; - head = &tbl->hash_buckets[i]; - entry = *head; + if (state&(NUD_PERMANENT|NUD_IN_TIMER)) + goto next_elt; - if (!entry) - continue; + if ((long)(n->used - n->confirmed) < 0) + n->used = n->confirmed; + + if (atomic_read(&n->refcnt) == 0 && + (state == NUD_FAILED || now - n->used > n->parms->gc_staletime)) { + *np = n->next; + n->tbl = NULL; + n->next = NULL; + tbl->entries--; + neigh_destroy(n); + continue; + } + + if (n->nud_state&NUD_REACHABLE && + now - n->confirmed > n->parms->reachable_time) { + n->nud_state = NUD_STALE; + neigh_suspect(n); + } + +next_elt: + np = &n->next; + } + } + + tbl->gc_timer.expires = now + tbl->gc_interval; + add_timer(&tbl->gc_timer); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return p->ucast_probes + p->app_probes + p->mcast_probes; +} + + +/* Called when a timer expires for a neighbour entry. 
*/ - do { - if (entry->flags & (~filter)) - { - int ret; - ret = (*func)(entry, args); +static void neigh_timer_handler(unsigned long arg) +{ + unsigned long now = jiffies; + struct neighbour *neigh = (struct neighbour*)arg; + unsigned state = neigh->nud_state; - if (ret) - { - struct neighbour *curp; + if (!(state&NUD_IN_TIMER)) { + NEIGH_PRINTK1("neigh: timer & !nud_in_timer\n"); + return; + } - curp = entry; - entry = curp->next; + if ((state&NUD_VALID) && + now - neigh->confirmed < neigh->parms->reachable_time) { + neigh->nud_state = NUD_REACHABLE; + NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + neigh_connect(neigh); + return; + } + if (state == NUD_DELAY) { + NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh->nud_state = NUD_PROBE; + neigh->probes = 0; + } + + if (neigh->probes >= neigh_max_probes(neigh)) { + struct sk_buff *skb; + + neigh->nud_state = NUD_FAILED; + neigh->tbl->stats.res_failed++; + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. 
--ANK + */ + while(neigh->nud_state==NUD_FAILED && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) + neigh->ops->error_report(neigh, skb); + skb_queue_purge(&neigh->arp_queue); + return; + } - neigh_unlink(curp); - neigh_destroy(curp); + neigh->probes++; + neigh->timer.expires = now + neigh->parms->retrans_time; + add_timer(&neigh->timer); - if ((*head) == NULL) - break; - continue; + neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue)); +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + start_bh_atomic(); + if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) { + if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) { + if (neigh->tbl == NULL) { + NEIGH_PRINTK2("neigh %p used after death.\n", neigh); + if (skb) + kfree_skb(skb); + end_bh_atomic(); + return 1; + } + if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + neigh->probes = neigh->parms->ucast_probes; + neigh->nud_state = NUD_INCOMPLETE; + neigh->timer.expires = jiffies + neigh->parms->retrans_time; + add_timer(&neigh->timer); + + neigh->ops->solicit(neigh, skb); + } else { + neigh->nud_state = NUD_FAILED; + if (skb) + kfree_skb(skb); + end_bh_atomic(); + return 1; + } + } + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + if (skb_queue_len(&neigh->arp_queue) >= neigh->parms->queue_len) { + struct sk_buff *buff; + buff = neigh->arp_queue.prev; + __skb_unlink(buff, &neigh->arp_queue); + kfree_skb(buff); } + __skb_queue_head(&neigh->arp_queue, skb); } - entry = entry->next; + end_bh_atomic(); + return 1; + } + if (neigh->nud_state == NUD_STALE) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; + add_timer(&neigh->timer); + } + } + end_bh_atomic(); + return 0; +} + +static __inline__ void neigh_update_hhs(struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, struct device*, unsigned char*) = + 
neigh->dev->header_cache_update; - } while (entry != *head); + if (update) { + for (hh=neigh->hh; hh; hh=hh->hh_next) + update(hh, neigh->dev, neigh->ha); } } -void neigh_tbl_run_bh(struct neigh_table *tbl) -{ - if ((tbl->tbl_bh_mask & NT_MASK_QUEUE)) - { - struct neighbour *neigh; - while((neigh = neigh_dequeue(tbl))) - { - neigh_table_ins(tbl, neigh); + +/* Generic update routine. + -- lladdr is new lladdr or NULL, if it is not supplied. + -- new is new state. + -- override==1 allows to override existing lladdr, if it is different. + -- arp==0 means that the change is administrative. + */ + +int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp) +{ + u8 old = neigh->nud_state; + struct device *dev = neigh->dev; + + if (arp && (old&(NUD_NOARP|NUD_PERMANENT))) + return -EPERM; + + if (!(new&NUD_VALID)) { + if (old&NUD_IN_TIMER) + del_timer(&neigh->timer); + if (old&NUD_CONNECTED) + neigh_suspect(neigh); + neigh->nud_state = new; + return 0; + } + + /* Compare new lladdr with cached one */ + if (dev->addr_len == 0) { + /* First case: device needs no address. */ + lladdr = neigh->ha; + } else if (lladdr) { + /* The second case: if something is already cached + and a new address is proposed: + - compare new & old + - if they are different, check override flag + */ + if (old&NUD_VALID) { + if (memcmp(lladdr, neigh->ha, dev->addr_len) == 0) + lladdr = neigh->ha; + else if (!override) + return -EPERM; } - tbl->tbl_bh_mask &= ~NT_MASK_QUEUE; + } else { + /* No address is supplied; if we know something, + use it, otherwise discard the request. + */ + if (!(old&NUD_VALID)) + return -EINVAL; + lladdr = neigh->ha; + } + + neigh_sync(neigh); + old = neigh->nud_state; + if (new&NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + + /* If entry was valid and address is not changed, + do not change entry state, if new one is STALE. 
+ */ + if (old&NUD_VALID) { + if (lladdr == neigh->ha) + if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED))) + return 0; } + if (old&NUD_IN_TIMER) + del_timer(&neigh->timer); + neigh->nud_state = new; + if (lladdr != neigh->ha) { + memcpy(neigh->ha, lladdr, dev->addr_len); + neigh_update_hhs(neigh); + neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1); +#ifdef CONFIG_ARPD + if (neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + } + if (new == old) + return 0; + if (new&NUD_CONNECTED) + neigh_connect(neigh); + else + neigh_suspect(neigh); + if (!(old&NUD_VALID)) { + struct sk_buff *skb; + while ((skb=__skb_dequeue(&neigh->arp_queue)) != NULL) + neigh->output(skb); + } + return 0; } -/* - * Purge all linked skb's of the entry. +struct neighbour * neigh_event_ns(struct neigh_table *tbl, + u8 *lladdr, void *saddr, + struct device *dev) +{ + struct neighbour *neigh; + + neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, 1, 1); + return neigh; +} + +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protocol) +{ + struct hh_cache *hh = NULL; + struct device *dev = dst->dev; + + for (hh=n->hh; hh; hh = hh->hh_next) + if (hh->hh_type == protocol) + break; + + if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { + memset(hh, 0, sizeof(struct hh_cache)); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 0); + hh->hh_next = NULL; + if (dev->hard_header_cache(n, hh)) { + kfree(hh); + hh = NULL; + } else { + atomic_inc(&hh->hh_refcnt); + hh->hh_next = n->hh; + n->hh = hh; + if (n->nud_state&NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + } + } + if (hh) { + atomic_inc(&hh->hh_refcnt); + dst->hh = hh; + } +} + +/* This function can be used in contexts, where only old dev_queue_xmit + worked, e.g. if you want to override normal output path (eql, shaper), + but resolution is not made yet. 
*/ -static void neigh_purge_send_q(struct neighbour *neigh) +int neigh_compat_output(struct sk_buff *skb) +{ + struct device *dev = skb->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 && + dev->rebuild_header(skb)) + return 0; + + return dev_queue_xmit(skb); +} + +/* Slow and careful. */ + +int neigh_resolve_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh; + + if (!dst || !(neigh = dst->neighbour)) + goto discard; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (neigh_event_send(neigh, skb) == 0) { + struct device *dev = neigh->dev; + if (dev->hard_header_cache) { + start_bh_atomic(); + if (dst->hh == NULL) + neigh_hh_init(neigh, dst, dst->ops->protocol); + end_bh_atomic(); + } + if (dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len) >= 0) + return neigh->ops->queue_xmit(skb); + kfree_skb(skb); + return -EINVAL; + } + return 0; + +discard: + NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", dst, dst ? 
dst->neighbour : NULL); + kfree_skb(skb); + return -EINVAL; +} + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct device *dev = neigh->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len) >= 0) + return neigh->ops->queue_xmit(skb); + kfree_skb(skb); + return -EINVAL; +} + +static void neigh_proxy_process(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb = tbl->proxy_queue.next; + + while (skb != (struct sk_buff*)&tbl->proxy_queue) { + struct sk_buff *back = skb; + long tdif = back->stamp.tv_usec - now; + + skb = skb->next; + if (tdif <= 0) { + __skb_unlink(back, &tbl->proxy_queue); + if (tbl->proxy_redo) + tbl->proxy_redo(back); + else + kfree_skb(back); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) { + tbl->proxy_timer.expires = jiffies + sched_next; + add_timer(&tbl->proxy_timer); + } +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + long sched_next = net_random()%p->proxy_delay; + + if (tbl->proxy_queue.qlen > p->proxy_qlen) { + kfree_skb(skb); + return; + } + skb->stamp.tv_sec = 0; + skb->stamp.tv_usec = now + sched_next; + if (del_timer(&tbl->proxy_timer)) { + long tval = tbl->proxy_timer.expires - now; + if (tval < sched_next) + sched_next = tval; + } + tbl->proxy_timer.expires = now + sched_next; + dst_release(skb->dst); + skb->dst = NULL; + __skb_queue_tail(&tbl->proxy_queue, skb); + add_timer(&tbl->proxy_timer); +} + + +struct neigh_parms *neigh_parms_alloc(struct device *dev, struct neigh_table *tbl) +{ + struct neigh_parms *p; + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p) { + 
memcpy(p, &tbl->parms, sizeof(*p)); + p->tbl = tbl; + p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + if (dev && dev->neigh_setup) { + if (dev->neigh_setup(dev, p)) { + kfree(p); + return NULL; + } + } + p->next = tbl->parms.next; + /* ATOMIC_SET */ + tbl->parms.next = p; + } + return p; +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + struct neigh_parms **p; + + if (parms == NULL || parms == &tbl->parms) + return; + for (p = &tbl->parms.next; *p; p = &(*p)->next) { + if (*p == parms) { + /* ATOMIC_SET */ + *p = parms->next; +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(parms); +#endif + kfree(parms); + return; + } + } + NEIGH_PRINTK1("neigh_release_parms: not found\n"); +} + + +void neigh_table_init(struct neigh_table *tbl) +{ + unsigned long now = jiffies; + + tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); + + init_timer(&tbl->gc_timer); + tbl->gc_timer.data = (unsigned long)tbl; + tbl->gc_timer.function = neigh_periodic_timer; + tbl->gc_timer.expires = now + tbl->gc_interval + tbl->parms.reachable_time; + add_timer(&tbl->gc_timer); + + init_timer(&tbl->proxy_timer); + tbl->proxy_timer.data = (unsigned long)tbl; + tbl->proxy_timer.function = neigh_proxy_process; + skb_queue_head_init(&tbl->proxy_queue); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time*20; + tbl->next = neigh_tables; + neigh_tables = tbl; +} + +int neigh_table_clear(struct neigh_table *tbl) +{ + struct neigh_table **tp; + + start_bh_atomic(); + del_timer(&tbl->gc_timer); + del_timer(&tbl->proxy_timer); + skb_queue_purge(&tbl->proxy_queue); + if (tbl->entries) + neigh_ifdown(tbl, NULL); + end_bh_atomic(); + if (tbl->entries) + printk(KERN_CRIT "neighbour leakage\n"); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { + *tp = tbl->next; + break; + } + } +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&tbl->parms); +#endif + return 0; +} + +#ifdef 
CONFIG_RTNETLINK + + +int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct device *dev = NULL; + + if (ndm->ndm_ifindex) { + if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + return -ENODEV; + } + + for (tbl=neigh_tables; tbl; tbl = tbl->next) { + int err = 0; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + + if (nda[NDA_DST-1] == NULL || + nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) + return -EINVAL; + + if (ndm->ndm_flags&NTF_PROXY) + return pneigh_delete(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + + if (dev == NULL) + return -EINVAL; + + start_bh_atomic(); + n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + if (n) { + err = neigh_update(n, NULL, NUD_FAILED, 1, 0); + neigh_release(n); + } + end_bh_atomic(); + return err; + } + + return -EADDRNOTAVAIL; +} + +int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct device *dev = NULL; + + if (ndm->ndm_ifindex) { + if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + return -ENODEV; + } + + for (tbl=neigh_tables; tbl; tbl = tbl->next) { + int err = 0; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + if (nda[NDA_DST-1] == NULL || + nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) + return -EINVAL; + if (ndm->ndm_flags&NTF_PROXY) { + if (pneigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1)) + return 0; + return -ENOBUFS; + } + if (dev == NULL) + return -EINVAL; + if (nda[NDA_LLADDR-1] != NULL && + nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len)) + return -EINVAL; + start_bh_atomic(); + n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + if (n) { + if (nlh->nlmsg_flags&NLM_F_EXCL) + err = -EEXIST; + } else if (!(nlh->nlmsg_flags&NLM_F_CREATE)) + err = -ENOENT; + else { + n = 
__neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1); + if (n == NULL) + err = -ENOBUFS; + } + if (err == 0) { + err = neigh_update(n, nda[NDA_LLADDR-1] ? RTA_DATA(nda[NDA_LLADDR-1]) : NULL, + ndm->ndm_state, + nlh->nlmsg_flags&NLM_F_REPLACE, 0); + } + neigh_release(n); + end_bh_atomic(); + return err; + } + + return -EADDRNOTAVAIL; +} + + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, + pid_t pid, u32 seq, int event) +{ + unsigned long now = jiffies; + struct ndmsg *ndm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct nda_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ndm)); + ndm = NLMSG_DATA(nlh); + ndm->ndm_family = n->ops->family; + ndm->ndm_flags = n->flags; + ndm->ndm_type = n->type; + ndm->ndm_state = n->nud_state; + ndm->ndm_ifindex = n->dev->ifindex; + RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); + if (n->nud_state&NUD_VALID) + RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); + ci.ndm_used = now - n->used; + ci.ndm_confirmed = now - n->confirmed; + ci.ndm_updated = now - n->updated; + ci.ndm_refcnt = atomic_read(&n->refcnt); + RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) +{ + struct neighbour *n; + int h, s_h; + int idx, s_idx; + + s_h = cb->args[1]; + s_idx = idx = cb->args[2]; + for (h=0; h <= NEIGH_HASHMASK; h++) { + if (h < s_h) continue; + if (h > s_h) + memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(int)); + start_bh_atomic(); + for (n = tbl->hash_buckets[h], idx = 0; n; + n = n->next, idx++) { + if (idx < s_idx) + continue; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWNEIGH) <= 0) { + end_bh_atomic(); + goto done; + } + } + end_bh_atomic(); + } +done: + cb->args[1] = h; + cb->args[2] = idx; + 
return skb->len; +} + +int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct neigh_table *tbl; + int family = ((struct rtgenmsg*)NLMSG_DATA(cb->nlh))->rtgen_family; + + s_t = cb->args[0]; + + for (tbl=neigh_tables, t=0; tbl; tbl = tbl->next, t++) { + if (t < s_t) continue; + if (family && tbl->family != family) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + if (neigh_dump_table(tbl, skb, cb) < 0) + break; + } + + cb->args[0] = t; + + return skb->len; +} + +#ifdef CONFIG_ARPD +void neigh_app_ns(struct neighbour *n) { struct sk_buff *skb; + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg)+256); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; - /* Release the list of `skb' pointers. */ - while ((skb = skb_dequeue(&neigh->arp_queue))) - { - dev_kfree_skb(skb, FREE_WRITE); + if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) { + kfree_skb(skb); + return; } - return; + nlh = (struct nlmsghdr*)skb->data; + nlh->nlmsg_flags = NLM_F_REQUEST; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); } + +static void neigh_app_notify(struct neighbour *n) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg)+256); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr*)skb->data; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + + + +#endif + + +#endif + +#ifdef CONFIG_SYSCTL + +struct neigh_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table neigh_vars[17]; + ctl_table neigh_dev[2]; + ctl_table neigh_neigh_dir[2]; + ctl_table neigh_proto_dir[2]; + ctl_table neigh_root_dir[2]; +} neigh_sysctl_template = { + NULL, + {{NET_NEIGH_MCAST_SOLICIT, "mcast_solicit", + NULL, 
sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_UCAST_SOLICIT, "ucast_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_APP_SOLICIT, "app_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_RETRANS_TIME, "retrans_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_REACHABLE_TIME, "base_reachable_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_GC_STALE_TIME, "gc_stale_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_UNRES_QLEN, "unres_qlen", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_PROXY_QLEN, "proxy_qlen", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_ANYCAST_DELAY, "anycast_delay", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_PROXY_DELAY, "proxy_delay", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_LOCKTIME, "locktime", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_INTERVAL, "gc_interval", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_GC_THRESH1, "gc_thresh1", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_THRESH2, "gc_thresh2", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_THRESH3, "gc_thresh3", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0}}, + + {{1, "default", NULL, 0, 0555, NULL},{0}}, + {{0, "neigh", NULL, 0, 0555, NULL},{0}}, + {{0, NULL, NULL, 0, 0555, NULL},{0}}, + {{CTL_NET, "net", NULL, 0, 0555, NULL},{0}} +}; + +int neigh_sysctl_register(struct device *dev, struct neigh_parms *p, + int p_id, int pdev_id, char *p_name) +{ + struct neigh_sysctl_table *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return -ENOBUFS; + memcpy(t, &neigh_sysctl_template, sizeof(*t)); + t->neigh_vars[1].data = &p->ucast_probes; 
+ t->neigh_vars[2].data = &p->app_probes; + t->neigh_vars[3].data = &p->retrans_time; + t->neigh_vars[4].data = &p->reachable_time; + t->neigh_vars[5].data = &p->delay_probe_time; + t->neigh_vars[6].data = &p->gc_staletime; + t->neigh_vars[7].data = &p->queue_len; + t->neigh_vars[8].data = &p->proxy_qlen; + t->neigh_vars[9].data = &p->anycast_delay; + t->neigh_vars[10].data = &p->proxy_delay; + t->neigh_vars[11].data = &p->locktime; + if (dev) { + t->neigh_dev[0].procname = dev->name; + t->neigh_dev[0].ctl_name = dev->ifindex+1; + memset(&t->neigh_vars[12], 0, sizeof(ctl_table)); + } else { + t->neigh_vars[12].data = (&p->locktime) + 1; + t->neigh_vars[13].data = (&p->locktime) + 2; + t->neigh_vars[14].data = (&p->locktime) + 3; + t->neigh_vars[15].data = (&p->locktime) + 4; + } + t->neigh_neigh_dir[0].ctl_name = pdev_id; + + t->neigh_proto_dir[0].procname = p_name; + t->neigh_proto_dir[0].ctl_name = p_id; + + t->neigh_dev[0].child = t->neigh_vars; + t->neigh_neigh_dir[0].child = t->neigh_dev; + t->neigh_proto_dir[0].child = t->neigh_neigh_dir; + t->neigh_root_dir[0].child = t->neigh_proto_dir; + + t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0); + if (t->sysctl_header == NULL) { + kfree(t); + return -ENOBUFS; + } + p->sysctl_table = t; + return 0; +} + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if (p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} + +#endif /* CONFIG_SYSCTL */ diff --git a/net/core/profile.c b/net/core/profile.c new file mode 100644 index 000000000..54fc57662 --- /dev/null +++ b/net/core/profile.c @@ -0,0 +1,304 @@ +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/init.h> 
+#include <linux/ip.h> +#include <linux/inet.h> +#include <net/checksum.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <net/profile.h> + +#ifdef CONFIG_NET_PROFILE + +atomic_t net_profile_active; +struct timeval net_profile_adjust; + +NET_PROFILE_DEFINE(total); + +struct net_profile_slot *net_profile_chain = &net_prof_total; + +#ifdef __alpha__ +__u32 alpha_lo; +long alpha_hi; + +static void alpha_tick(unsigned long); + +static struct timer_list alpha_timer = + { NULL, NULL, 0, 0L, alpha_tick }; + +void alpha_tick(unsigned long dummy) +{ + struct timeval dummy_stamp; + net_profile_stamp(&dummy_stamp); + alpha_timer.expires = jiffies + 4*HZ; + add_timer(&alpha_timer); +} + +#endif + +void net_profile_irq_adjust(struct timeval *entered, struct timeval* leaved) +{ + struct net_profile_slot *s; + + net_profile_sub(entered, leaved); + for (s = net_profile_chain; s; s = s->next) { + if (s->active) + net_profile_add(leaved, &s->irq); + } +} + + +#ifdef CONFIG_PROC_FS +static int profile_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0; + off_t begin=0; + int len=0; + struct net_profile_slot *s; + + len+= sprintf(buffer, "Slot Hits Hi Lo OnIrqHi OnIrqLo Ufl\n"); + + if (offset == 0) { + cli(); + net_prof_total.active = 1; + atomic_inc(&net_profile_active); + NET_PROFILE_LEAVE(total); + sti(); + } + for (s = net_profile_chain; s; s = s->next) { + struct net_profile_slot tmp; + + cli(); + tmp = *s; + + /* Wrong, but pretty close to truth */ + + s->accumulator.tv_sec = 0; + s->accumulator.tv_usec = 0; + s->irq.tv_sec = 0; + s->irq.tv_usec = 0; + s->hits = 0; + s->underflow = 0; + /* Repair active count, it is possible, only if code has a bug */ + if (s->active) { + s->active = 0; + atomic_dec(&net_profile_active); + } + sti(); + + net_profile_sub(&tmp.irq, &tmp.accumulator); + + len += sprintf(buffer+len,"%-15s %-10d %-10ld %-10lu %-10lu %-10lu %d/%d", + tmp.id, + tmp.hits, + tmp.accumulator.tv_sec, 
+ tmp.accumulator.tv_usec, + tmp.irq.tv_sec, + tmp.irq.tv_usec, + tmp.underflow, tmp.active); + + buffer[len++]='\n'; + + pos=begin+len; + if(pos<offset) { + len=0; + begin=pos; + } + if(pos>offset+length) + goto done; + } + *eof = 1; + +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + if (len < 0) { + len = 0; + printk(KERN_CRIT "Yep, guys... our template for proc_*_read is crappy :-)\n"); + } + if (offset == 0) { + cli(); + net_prof_total.active = 0; + net_prof_total.hits = 0; + net_profile_stamp(&net_prof_total.entered); + sti(); + } + return len; +} +#endif + +struct iphdr whitehole_iph; +int whitehole_count; + +static int whitehole_xmit(struct sk_buff *skb, struct device *dev) +{ + struct net_device_stats *stats; + dev_kfree_skb(skb); + stats = (struct net_device_stats *)dev->priv; + stats->tx_packets++; + stats->tx_bytes+=skb->len; + + return 0; +} + +static void whitehole_inject(unsigned long); +int whitehole_init(struct device *dev); + +static struct timer_list whitehole_timer = + { NULL, NULL, 0, 0L, whitehole_inject }; + +static struct device whitehole_dev = { + "whitehole", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, whitehole_init, }; + +static int whitehole_open(struct device *dev) +{ + whitehole_count = 100000; + whitehole_timer.expires = jiffies + 5*HZ; + add_timer(&whitehole_timer); + return 0; +} + +static int whitehole_close(struct device *dev) +{ + del_timer(&whitehole_timer); + return 0; +} + +static void whitehole_inject(unsigned long dummy) +{ + struct net_device_stats *stats = (struct net_device_stats *)whitehole_dev.priv; + extern int netdev_dropping; + + do { + struct iphdr *iph; + struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC); + if (!skb) + break; + skb_reserve(skb, 32); + iph = (struct iphdr*)skb_put(skb, sizeof(*iph)); + skb->mac.raw = ((u8*)iph) - 14; + memcpy(iph, &whitehole_iph, sizeof(*iph)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->dev = &whitehole_dev; + skb->pkt_type = 
PACKET_HOST; + stats->rx_packets++; + stats->rx_bytes += skb->len; + netif_rx(skb); + whitehole_count--; + } while (netdev_dropping == 0 && whitehole_count>0); + if (whitehole_count > 0) { + whitehole_timer.expires = jiffies + 1; + add_timer(&whitehole_timer); + } +} + +static struct net_device_stats *whitehole_get_stats(struct device *dev) +{ + struct net_device_stats *stats = (struct net_device_stats *) dev->priv; + return stats; +} + +__initfunc(int whitehole_init(struct device *dev)) +{ + dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); + if (dev->priv == NULL) + return -ENOBUFS; + memset(dev->priv, 0, sizeof(struct net_device_stats)); + dev->get_stats = whitehole_get_stats; + dev->hard_start_xmit = whitehole_xmit; + dev->open = whitehole_open; + dev->stop = whitehole_close; + ether_setup(dev); + dev->tx_queue_len = 0; + dev->flags |= IFF_NOARP; + dev->flags &= ~(IFF_BROADCAST|IFF_MULTICAST); + dev->iflink = 0; + whitehole_iph.ihl = 5; + whitehole_iph.version = 4; + whitehole_iph.ttl = 2; + whitehole_iph.saddr = in_aton("193.233.7.21"); + whitehole_iph.daddr = in_aton("193.233.7.10"); + whitehole_iph.tot_len = htons(20); + whitehole_iph.check = ip_compute_csum((void *)&whitehole_iph, 20); + return 0; +} + +int net_profile_register(struct net_profile_slot *slot) +{ + cli(); + slot->next = net_profile_chain; + net_profile_chain = slot; + sti(); + return 0; +} + +int net_profile_unregister(struct net_profile_slot *slot) +{ + struct net_profile_slot **sp, *s; + + for (sp = &net_profile_chain; (s = *sp) != NULL; sp = &s->next) { + if (s == slot) { + cli(); + *sp = s->next; + sti(); + return 0; + } + } + return -ESRCH; +} + + +__initfunc(int net_profile_init(void)) +{ + int i; + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; + + ent = create_proc_entry("net/profile", 0, 0); + ent->read_proc = profile_read_proc; +#endif + + register_netdevice(&whitehole_dev); + + printk("Evaluating net profiler cost ..."); +#if CPU == 586 || CPU == 686 + if 
(!(boot_cpu_data.x86_capability & 16)) { + panic("Sorry, you CPU does not support tsc. I am dying...\n"); + return -1; + } +#endif + start_bh_atomic(); +#ifdef __alpha__ + alpha_tick(0); +#endif + for (i=0; i<1024; i++) { + NET_PROFILE_ENTER(total); + NET_PROFILE_LEAVE(total); + } + if (net_prof_total.accumulator.tv_sec) { + printk(" too high!\n"); + } else { + net_profile_adjust.tv_usec = net_prof_total.accumulator.tv_usec>>10; + printk("%ld units\n", net_profile_adjust.tv_usec); + } + net_prof_total.hits = 0; + net_profile_stamp(&net_prof_total.entered); + end_bh_atomic(); + return 0; +} + +#endif diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 795e0d062..cf7fe8ff8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -74,65 +74,29 @@ struct rtnetlink_link * rtnetlink_links[NPROTO]; #define _X 2 /* exclusive access to tables required */ #define _G 4 /* GET request */ -static unsigned char rtm_properties[RTM_MAX-RTM_BASE+1] = +static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = { - _S|_X, /* RTM_NEWLINK */ - _S|_X, /* RTM_DELLINK */ - _G, /* RTM_GETLINK */ - 0, - - _S|_X, /* RTM_NEWADDR */ - _S|_X, /* RTM_DELADDR */ - _G, /* RTM_GETADDR */ - 0, - - _S|_X, /* RTM_NEWROUTE */ - _S|_X, /* RTM_DELROUTE */ - _G, /* RTM_GETROUTE */ - 0, - - _S|_X, /* RTM_NEWNEIGH */ - _S|_X, /* RTM_DELNEIGH */ - _G, /* RTM_GETNEIGH */ - 0, - - _S|_X, /* RTM_NEWRULE */ - _S|_X, /* RTM_DELRULE */ - _G, /* RTM_GETRULE */ - 0 + NLMSG_LENGTH(sizeof(struct ifinfomsg)), + NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct ndmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)) }; -static int rtnetlink_get_rta(struct kern_rta *rta, struct rtattr *attr, int attrlen) -{ - void **rta_data = (void**)rta; - - while (RTA_OK(attr, attrlen)) { - int type = attr->rta_type; - if (type != RTA_UNSPEC) { - if (type > RTA_MAX) 
- return -EINVAL; - rta_data[type-1] = RTA_DATA(attr); - } - attr = RTA_NEXT(attr, attrlen); - } - return 0; -} - -static int rtnetlink_get_ifa(struct kern_ifa *ifa, struct rtattr *attr, int attrlen) +static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = { - void **ifa_data = (void**)ifa; - - while (RTA_OK(attr, attrlen)) { - int type = attr->rta_type; - if (type != IFA_UNSPEC) { - if (type > IFA_MAX) - return -EINVAL; - ifa_data[type-1] = RTA_DATA(attr); - } - attr = RTA_NEXT(attr, attrlen); - } - return 0; -} + IFLA_MAX, + IFA_MAX, + RTA_MAX, + NDA_MAX, + RTA_MAX, + TCA_MAX, + TCA_MAX, + TCA_MAX +}; void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { @@ -145,11 +109,13 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data memcpy(RTA_DATA(rta), data, attrlen); } +#ifdef CONFIG_RTNL_OLD_IFINFO static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, int type, pid_t pid, u32 seq) { struct ifinfomsg *r; struct nlmsghdr *nlh; + unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; @@ -168,11 +134,65 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, r->ifi_qdisc = dev->qdisc_sleeping->handle; if (dev->qdisc_sleeping->ops) strcpy(r->ifi_qdiscname, dev->qdisc_sleeping->ops->id); + if (dev->get_stats) { + struct net_device_stats *stats = dev->get_stats(dev); + if (stats) + RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats); + } + nlh->nlmsg_len = skb->tail - b; return skb->len; nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); return -1; } +#else +static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, + int type, pid_t pid, u32 seq) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + r = NLMSG_DATA(nlh); + r->ifi_family = AF_UNSPEC; + r->ifi_type 
= dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev->flags; + r->ifi_change = ~0U; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + if (dev->addr_len) { + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); + } + if (1) { + unsigned mtu = dev->mtu; + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + } + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + if (dev->qdisc_sleeping->ops) + RTA_PUT(skb, IFLA_QDISC, + strlen(dev->qdisc_sleeping->ops->id) + 1, + dev->qdisc_sleeping->ops->id); + if (dev->get_stats) { + struct net_device_stats *stats = dev->get_stats(dev); + if (stats) + RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats); + } + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { @@ -191,17 +211,48 @@ int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + + if (s_idx == 0) + s_idx = 1; + for (idx=1; idx<NPROTO; idx++) { + int type = cb->nlh->nlmsg_type-RTM_BASE; + if (idx < s_idx || idx == AF_PACKET) + continue; + if (rtnetlink_links[idx] == NULL || + rtnetlink_links[idx][type].dumpit == NULL) + continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnetlink_links[idx][type].dumpit(skb, cb) == 0) + continue; + if (skb_tailroom(skb) < 256) + break; + } + cb->family = idx; + + return skb->len; +} + void rtmsg_ifinfo(int type, struct device *dev) { struct sk_buff *skb; - int size = NLMSG_SPACE(sizeof(struct ifinfomsg)); +#ifdef CONFIG_RTNL_OLD_IFINFO + int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+ + RTA_LENGTH(sizeof(struct net_device_stats))); +#else + int size = NLMSG_GOODSIZE; +#endif 
skb = alloc_skb(size, GFP_KERNEL); if (!skb) return; if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0) < 0) { - kfree_skb(skb, 0); + kfree_skb(skb); return; } NETLINK_CB(skb).dst_groups = RTMGRP_LINK; @@ -220,47 +271,68 @@ static int rtnetlink_done(struct netlink_callback *cb) extern __inline__ int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) { - union { - struct kern_rta rta; - struct kern_ifa ifa; - } u; - struct rtmsg *rtm; - struct ifaddrmsg *ifm; + struct rtnetlink_link *link; + struct rtnetlink_link *link_tab; + struct rtattr *rta[RTATTR_MAX]; + int exclusive = 0; + int sz_idx, kind; + int min_len; int family; int type; int err; + /* Only requests are handled by kernel now */ if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; + type = nlh->nlmsg_type; + + /* A control message: ignore them */ if (type < RTM_BASE) return 0; + + /* Unknown message: reply with EINVAL */ if (type > RTM_MAX) goto err_inval; + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) return 0; + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family > NPROTO || rtnetlink_links[family] == NULL) { + if (family > NPROTO) { *errp = -EAFNOSUPPORT; return -1; } - if (rtm_properties[type-RTM_BASE]&_S) { - if (NETLINK_CREDS(skb)->uid) { - *errp = -EPERM; - return -1; - } + + link_tab = rtnetlink_links[family]; + if (link_tab == NULL) + link_tab = rtnetlink_links[AF_UNSPEC]; + link = &link_tab[type]; + + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && NETLINK_CREDS(skb)->uid) { + *errp = -EPERM; + return -1; } - if (rtm_properties[type-RTM_BASE]&_G && nlh->nlmsg_flags&NLM_F_DUMP) { - if (rtnetlink_links[family][type-RTM_BASE].dumpit == NULL) + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + if (link->dumpit == NULL) + link = &(rtnetlink_links[AF_UNSPEC][type]); + + if (link->dumpit == NULL) goto err_inval; /* Super-user locks all the tables to get atomic 
snapshot */ if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC) atomic_inc(&rtnl_rlockct); if ((*errp = netlink_dump_start(rtnl, skb, nlh, - rtnetlink_links[family][type-RTM_BASE].dumpit, + link->dumpit, rtnetlink_done)) != 0) { if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC) atomic_dec(&rtnl_rlockct); @@ -269,59 +341,41 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) skb_pull(skb, NLMSG_ALIGN(nlh->nlmsg_len)); return -1; } - if (rtm_properties[type-RTM_BASE]&_X) { + + if (kind != 2) { if (rtnl_exlock_nowait()) { *errp = 0; return -1; } exclusive = 1; } - - memset(&u, 0, sizeof(u)); - - switch (nlh->nlmsg_type) { - case RTM_NEWROUTE: - case RTM_DELROUTE: - case RTM_GETROUTE: - case RTM_NEWRULE: - case RTM_DELRULE: - case RTM_GETRULE: - rtm = NLMSG_DATA(nlh); - if (nlh->nlmsg_len < sizeof(*rtm)) - goto err_inval; - if (rtm->rtm_optlen && - rtnetlink_get_rta(&u.rta, RTM_RTA(rtm), rtm->rtm_optlen) < 0) - goto err_inval; - break; - - case RTM_NEWADDR: - case RTM_DELADDR: - case RTM_GETADDR: - ifm = NLMSG_DATA(nlh); - if (nlh->nlmsg_len < sizeof(*ifm)) - goto err_inval; + memset(&rta, 0, sizeof(rta)); - if (nlh->nlmsg_len > NLMSG_LENGTH(sizeof(*ifm)) && - rtnetlink_get_ifa(&u.ifa, IFA_RTA(ifm), - nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifm))) < 0) - goto err_inval; - break; - - case RTM_NEWLINK: - case RTM_DELLINK: - case RTM_GETLINK: - case RTM_NEWNEIGH: - case RTM_DELNEIGH: - case RTM_GETNEIGH: - /* Not urgent and even not necessary */ - default: + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) goto err_inval; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + goto err_inval; + rta[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } } - if 
(rtnetlink_links[family][type-RTM_BASE].doit == NULL) + if (link->doit == NULL) + link = &(rtnetlink_links[AF_UNSPEC][type]); + if (link->doit == NULL) goto err_inval; - err = rtnetlink_links[family][type-RTM_BASE].doit(skb, nlh, (void *)&u); + err = link->doit(skb, nlh, (void *)&rta); if (exclusive) rtnl_exunlock(); @@ -390,15 +444,44 @@ static void rtnetlink_rcv(struct sock *sk, int len) if (skb->len) skb_queue_head(&sk->receive_queue, skb); else - kfree_skb(skb, FREE_READ); + kfree_skb(skb); break; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } rtnl_shunlock(); } +static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_ifinfo, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_all, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_all, }, + { NULL, NULL, }, + + { neigh_add, NULL, }, + { neigh_delete, NULL, }, + { NULL, neigh_dump_info, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, +}; + + static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct device *dev = ptr; @@ -429,6 +512,8 @@ __initfunc(void rtnetlink_init(void)) if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); register_netdevice_notifier(&rtnetlink_dev_notifier); + rtnetlink_links[AF_UNSPEC] = link_rtnetlink_table; + rtnetlink_links[AF_PACKET] = link_rtnetlink_table; } diff --git a/net/core/scm.c b/net/core/scm.c index 5a6d24c40..ac4aefda0 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -17,6 +17,7 @@ #include <linux/major.h> #include <linux/stat.h> #include <linux/socket.h> +#include <linux/file.h> #include <linux/fcntl.h> #include <linux/net.h> #include <linux/interrupt.h> @@ -44,6 +45,7 @@ static __inline__ int scm_check_creds(struct ucred *creds) { + /* N.B. 
The test for suser should follow the credential check */ if (suser()) return 0; if (creds->pid != current->pid || @@ -58,11 +60,10 @@ static __inline__ int scm_check_creds(struct ucred *creds) static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) { - int num; + int *fdp = (int*)CMSG_DATA(cmsg); struct scm_fp_list *fpl = *fplp; struct file **fpp; - int *fdp = (int*)CMSG_DATA(cmsg); - int i; + int i, num; num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); @@ -86,41 +87,41 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -EINVAL; /* - * Verify the descriptors. + * Verify the descriptors and increment the usage count. */ for (i=0; i< num; i++) { - int fd; - - fd = fdp[i]; - if (fd < 0 || fd >= NR_OPEN) - return -EBADF; - if (current->files->fd[fd]==NULL) + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget(fd))) return -EBADF; - fpp[i] = current->files->fd[fd]; + *fpp++ = file; + fpl->count++; } - - /* add another reference to these files */ - for (i=0; i< num; i++, fpp++) - (*fpp)->f_count++; - fpl->count += num; - return num; } void __scm_destroy(struct scm_cookie *scm) { - int i; struct scm_fp_list *fpl = scm->fp; + struct file *file; + int i; - if (!fpl) - return; - - for (i=fpl->count-1; i>=0; i--) - close_fp(fpl->fp[i]); + if (fpl) { + scm->fp = NULL; + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } - kfree(fpl); + file = scm->file; + if (file) { + scm->sock = NULL; + scm->file = NULL; + fput(file); + } } @@ -133,11 +134,10 @@ extern __inline__ int not_one_bit(unsigned val) int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { - int err; struct cmsghdr *cmsg; struct file *file; - int acc_fd; - unsigned scm_flags=0; + int acc_fd, err; + unsigned int scm_flags=0; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { @@ -169,14 +169,19 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie 
*p) memcpy(&acc_fd, CMSG_DATA(cmsg), sizeof(int)); p->sock = NULL; if (acc_fd != -1) { - if (acc_fd < 0 || acc_fd >= NR_OPEN || - (file=current->files->fd[acc_fd])==NULL) - return -EBADF; - if (!file->f_dentry->d_inode || !file->f_dentry->d_inode->i_sock) - return -ENOTSOCK; + err = -EBADF; + file = fget(acc_fd); + if (!file) + goto error; + p->file = file; + err = -ENOTSOCK; + if (!file->f_dentry->d_inode || + !file->f_dentry->d_inode->i_sock) + goto error; p->sock = &file->f_dentry->d_inode->u.socket_i; + err = -EINVAL; if (p->sock->state != SS_UNCONNECTED) - return -EINVAL; + goto error; } scm_flags |= MSG_SYN; break; @@ -223,14 +228,17 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) cmhdr.cmsg_level = level; cmhdr.cmsg_type = type; cmhdr.cmsg_len = cmlen; - err = copy_to_user(cm, &cmhdr, sizeof cmhdr); - if (!err) - err = copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)); - if (!err) { - cmlen = CMSG_SPACE(len); - msg->msg_control += cmlen; - msg->msg_controllen -= cmlen; - } + + err = -EFAULT; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + goto out; + if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + goto out; + cmlen = CMSG_SPACE(len); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + err = 0; +out: return err; } @@ -240,21 +248,28 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) int fdmax = (msg->msg_controllen - sizeof(struct cmsghdr))/sizeof(int); int fdnum = scm->fp->count; - int *cmfptr; - int err = 0; - int i; struct file **fp = scm->fp->fp; + int *cmfptr; + int err = 0, i; if (fdnum < fdmax) fdmax = fdnum; for (i=0, cmfptr=(int*)CMSG_DATA(cm); i<fdmax; i++, cmfptr++) { - int new_fd = get_unused_fd(); - if (new_fd < 0) + int new_fd; + err = get_unused_fd(); + if (err < 0) break; - current->files->fd[new_fd] = fp[i]; + new_fd = err; err = put_user(new_fd, cmfptr); + if (err) { + put_unused_fd(new_fd); + break; + } + /* Bump the usage count and install 
the file. */ + fp[i]->f_count++; + current->files->fd[new_fd] = fp[i]; } if (i > 0) @@ -272,38 +287,30 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) msg->msg_controllen -= cmlen; } } - - if (err) - i = 0; + if (i < fdnum) + msg->msg_flags |= MSG_CTRUNC; /* - * Dump those that don't fit. + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. */ - for ( ; i < fdnum; i++) { - msg->msg_flags |= MSG_CTRUNC; - close_fp(fp[i]); - } - - kfree (scm->fp); - scm->fp = NULL; + __scm_destroy(scm); } struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) { - int i; struct scm_fp_list *new_fpl; + int i; if (!fpl) return NULL; - new_fpl = kmalloc(fpl->count*sizeof(int) + sizeof(*fpl), GFP_KERNEL); - if (!new_fpl) - return NULL; - - memcpy(new_fpl, fpl, fpl->count*sizeof(int) + sizeof(*fpl)); - - for (i=fpl->count-1; i>=0; i--) - fpl->fp[i]->f_count++; + new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + if (new_fpl) { + memcpy(new_fpl, fpl, sizeof(*fpl)); + for (i=fpl->count-1; i>=0; i--) + fpl->fp[i]->f_count++; + } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6baf37c03..9180b8b54 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -64,7 +64,6 @@ static atomic_t net_skbcount = ATOMIC_INIT(0); static atomic_t net_allocs = ATOMIC_INIT(0); static atomic_t net_fails = ATOMIC_INIT(0); - extern atomic_t ip_frag_mem; /* @@ -113,23 +112,23 @@ void __kfree_skb(struct sk_buff *skb) * to be a good idea. 
*/ -struct sk_buff *alloc_skb(unsigned int size,int priority) +struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) { struct sk_buff *skb; unsigned char *bptr; int len; - if (in_interrupt() && priority!=GFP_ATOMIC) { + if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { static int count = 0; if (++count < 5) { printk(KERN_ERR "alloc_skb called nonatomically " "from interrupt %p\n", __builtin_return_address(0)); - priority = GFP_ATOMIC; + gfp_mask &= ~__GFP_WAIT; } } /* - * FIXME: We could do with an architecture dependant + * FIXME: We could do with an architecture dependent * 'alignment mask'. */ @@ -144,7 +143,7 @@ struct sk_buff *alloc_skb(unsigned int size,int priority) * Allocate some space */ - bptr = kmalloc(size,priority); + bptr = kmalloc(size,gfp_mask); if (bptr == NULL) { atomic_inc(&net_fails); return NULL; @@ -226,7 +225,7 @@ void kfree_skbmem(struct sk_buff *skb) * Duplicate an sk_buff. The new one is not owned by a socket. */ -struct sk_buff *skb_clone(struct sk_buff *skb, int priority) +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) { struct sk_buff *n; int inbuff = 0; @@ -237,7 +236,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int priority) skb->inclone = SKB_CLONE_ORIG; inbuff = SKB_CLONE_INLINE; } else { - n = kmalloc(sizeof(*n), priority); + n = kmalloc(sizeof(*n), gfp_mask); if (!n) return NULL; } @@ -263,7 +262,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int priority) * This is slower, and copies the whole data area */ -struct sk_buff *skb_copy(struct sk_buff *skb, int priority) +struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) { struct sk_buff *n; unsigned long offset; @@ -272,7 +271,7 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int priority) * Allocate the copy buffer */ - n=alloc_skb(skb->end - skb->head, priority); + n=alloc_skb(skb->end - skb->head, gfp_mask); if(n==NULL) return NULL; @@ -303,7 +302,6 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int priority) n->ack_seq=skb->ack_seq; 
memcpy(n->cb, skb->cb, sizeof(skb->cb)); n->used=skb->used; - n->arp=skb->arp; n->tries=0; atomic_set(&n->users, 1); n->pkt_type=skb->pkt_type; @@ -354,7 +352,6 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) n->end_seq=skb->end_seq; n->ack_seq=skb->ack_seq; n->used=skb->used; - n->arp=skb->arp; n->tries=0; atomic_set(&n->users, 1); n->pkt_type=skb->pkt_type; @@ -364,13 +361,3 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) return n; } - -struct sk_buff *dev_alloc_skb(unsigned int length) -{ - struct sk_buff *skb; - - skb = alloc_skb(length+16, GFP_ATOMIC); - if (skb) - skb_reserve(skb,16); - return skb; -} diff --git a/net/core/sock.c b/net/core/sock.c index 725474887..6da5f5a0d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -76,6 +76,8 @@ * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * * To Fix: * @@ -122,6 +124,10 @@ #include <net/icmp.h> #include <linux/ipsec.h> +#ifdef CONFIG_FILTER +#include <linux/filter.h> +#endif + #define min(a,b) ((a)<(b)?(a):(b)) /* Run time adjustable parameters. */ @@ -147,6 +153,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, struct linger ling; struct ifreq req; int ret = 0; + +#ifdef CONFIG_FILTER + struct sock_fprog fprog; +#endif /* * Options without arguments @@ -278,48 +288,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; -#ifdef CONFIG_NET_SECURITY - /* - * FIXME: make these error things that are not - * available! 
- */ - - case SO_SECURITY_AUTHENTICATION: - if(val<=IPSEC_LEVEL_DEFAULT) - { - sk->authentication=val; - return 0; - } - if(net_families[sock->ops->family]->authentication) - sk->authentication=val; - else - return -EINVAL; - break; - - case SO_SECURITY_ENCRYPTION_TRANSPORT: - if(val<=IPSEC_LEVEL_DEFAULT) - { - sk->encryption=val; - return 0; - } - if(net_families[sock->ops->family]->encryption) - sk->encryption = val; - else - return -EINVAL; - break; - - case SO_SECURITY_ENCRYPTION_NETWORK: - if(val<=IPSEC_LEVEL_DEFAULT) - { - sk->encrypt_net=val; - return 0; - } - if(net_families[sock->ops->family]->encrypt_net) - sk->encrypt_net = val; - else - return -EINVAL; - break; -#endif case SO_BINDTODEVICE: /* Bind this socket to a particular device like "eth0", * as specified in an ifreq structure. If the device @@ -330,36 +298,51 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->bound_dev_if = 0; } else { - if (copy_from_user(&req, optval, sizeof(req)) < 0) + if (copy_from_user(&req, optval, sizeof(req))) return -EFAULT; /* Remove any cached route for this socket. 
*/ - if (sk->dst_cache) { - ip_rt_put((struct rtable*)sk->dst_cache); - sk->dst_cache = NULL; - } + dst_release(xchg(&sk->dst_cache, NULL)); if (req.ifr_ifrn.ifrn_name[0] == '\0') { sk->bound_dev_if = 0; - } - else { + } else { struct device *dev = dev_get(req.ifr_ifrn.ifrn_name); if (!dev) return -EINVAL; sk->bound_dev_if = dev->ifindex; - if (sk->daddr) { - int ret; - ret = ip_route_output((struct rtable**)&sk->dst_cache, - sk->daddr, sk->saddr, - sk->ip_tos, sk->bound_dev_if); - if (ret) - return ret; - } } } return 0; +#ifdef CONFIG_FILTER + case SO_ATTACH_FILTER: + if(optlen < sizeof(struct sock_fprog)) + return -EINVAL; + + if(copy_from_user(&fprog, optval, sizeof(fprog))) + { + ret = -EFAULT; + break; + } + + ret = sk_attach_filter(&fprog, sk); + break; + + case SO_DETACH_FILTER: + if(sk->filter) + { + fprog.filter = sk->filter_data; + kfree_s(fprog.filter, (sizeof(fprog.filter) * sk->filter)); + sk->filter_data = NULL; + sk->filter = 0; + return 0; + } + else + return -EINVAL; + break; +#endif /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ default: @@ -470,20 +453,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; goto lenout; -#ifdef CONFIG_NET_SECURITY - - case SO_SECURITY_AUTHENTICATION: - v.val = sk->authentication; - break; - - case SO_SECURITY_ENCRYPTION_TRANSPORT: - v.val = sk->encryption; - break; - - case SO_SECURITY_ENCRYPTION_NETWORK: - v.val = sk->encrypt_net; - break; -#endif default: return(-ENOPROTOOPT); } @@ -589,6 +558,36 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int return NULL; } +void *sock_kmalloc(struct sock *sk, int size, int priority) +{ + void *mem = NULL; + /* Always use wmem.. */ + if (atomic_read(&sk->wmem_alloc)+size < sk->sndbuf) { + /* First do the add, to avoid the race if kmalloc + * might sleep. 
+ */ + atomic_add(size, &sk->wmem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->wmem_alloc); + } + return mem; +} + +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ +#if 1 /* Debug */ + if (atomic_read(&sk->wmem_alloc) < size) { + printk(KERN_DEBUG "sock_kfree_s: mem not accounted.\n"); + return; + } +#endif + kfree_s(mem, size); + atomic_sub(size, &sk->wmem_alloc); + sk->write_space(sk); +} + /* FIXME: this is insane. We are trying suppose to be controlling how * how much space we have for data bytes, not packet headers. @@ -627,7 +626,7 @@ unsigned long sock_wspace(struct sock *sk) if (sk != NULL) { if (sk->shutdown & SEND_SHUTDOWN) return(0); - if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf) + if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf) return(0); return sk->sndbuf - atomic_read(&sk->wmem_alloc); } @@ -827,7 +826,7 @@ void sklist_destroy_socket(struct sock **list,struct sock *sk) while((skb=skb_dequeue(&sk->receive_queue))!=NULL) { - kfree_skb(skb,FREE_READ); + kfree_skb(skb); } if(atomic_read(&sk->wmem_alloc) == 0 && @@ -895,7 +894,7 @@ int sock_no_getname(struct socket *sock, struct sockaddr *saddr, return -EOPNOTSUPP; } -unsigned int sock_no_poll(struct socket *sock, poll_table *pt) +unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt) { return -EOPNOTSUPP; } @@ -1009,8 +1008,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) init_timer(&sk->timer); sk->allocation = GFP_KERNEL; - sk->rcvbuf = sysctl_rmem_default*2; - sk->sndbuf = sysctl_wmem_default*2; + sk->rcvbuf = sysctl_rmem_default; + sk->sndbuf = sysctl_wmem_default; sk->state = TCP_CLOSE; sk->zapped = 1; sk->socket = sock; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b684fba33..1da2cc152 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -11,6 +11,11 @@ #ifdef CONFIG_SYSCTL +extern int netdev_max_backlog; +extern int netdev_fastroute; +extern int 
net_msg_cost; +extern int net_msg_burst; + extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern __u32 sysctl_wmem_default; @@ -34,6 +39,20 @@ ctl_table core_table[] = { {NET_CORE_DESTROY_DELAY, "destroy_delay", &sysctl_core_destroy_delay, sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_CORE_MAX_BACKLOG, "netdev_max_backlog", + &netdev_max_backlog, sizeof(int), 0644, NULL, + &proc_dointvec}, +#ifdef CONFIG_NET_FASTROUTE + {NET_CORE_FASTROUTE, "netdev_fastroute", + &netdev_fastroute, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif + {NET_CORE_MSG_COST, "message_cost", + &net_msg_cost, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_CORE_MSG_BURST, "message_burst", + &net_msg_burst, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, { 0 } }; #endif diff --git a/net/core/utils.c b/net/core/utils.c new file mode 100644 index 000000000..415926b8e --- /dev/null +++ b/net/core/utils.c @@ -0,0 +1,66 @@ +/* + * Generic address resultion entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andy Kleen + * + * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> + +static unsigned long net_rand_seed = 152L; + +unsigned long net_random(void) +{ + net_rand_seed=net_rand_seed*69069L+1; + return net_rand_seed^jiffies; +} + +void net_srandom(unsigned long entropy) +{ + net_rand_seed ^= entropy; + net_random(); +} + +int net_msg_cost = 5*HZ; +int net_msg_burst = 10*5*HZ; + +/* + * This enforces a rate limit: not more than one kernel message + * every 5secs to make a denial-of-service attack impossible. + * + * All warning printk()s should be guarded by this function. + */ +int net_ratelimit(void) +{ + static unsigned long toks = 10*5*HZ; + static unsigned long last_msg; + static int missed; + unsigned long now = jiffies; + + toks += now - xchg(&last_msg, now); + if (toks > net_msg_burst) + toks = net_msg_burst; + if (toks >= net_msg_cost) { + toks -= net_msg_cost; + if (missed) + printk(KERN_WARNING "NET: %d messages suppressed.\n", missed); + missed = 0; + return 1; + } + missed++; + return 0; +} diff --git a/net/ethernet/.cvsignore b/net/ethernet/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/ethernet/.cvsignore +++ b/net/ethernet/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 47417a27a..bce35d484 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -55,6 +55,7 @@ #include <net/arp.h> #include <net/sock.h> #include <net/ipv6.h> +#include <net/ip.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/checksum.h> @@ -120,7 +121,7 @@ int eth_header(struct sk_buff *skb, struct device *dev, unsigned short type, * Anyway, the loopback-device should never use this function... 
*/ - if (dev->flags & IFF_LOOPBACK) + if (dev->flags & (IFF_LOOPBACK|IFF_NOARP)) { memset(eth->h_dest, 0, dev->addr_len); return(dev->hard_header_len); @@ -140,24 +141,16 @@ int eth_header(struct sk_buff *skb, struct device *dev, unsigned short type, * Rebuild the Ethernet MAC header. This is called after an ARP * (or in future other address resolution) has completed on this * sk_buff. We now let ARP fill in the other fields. + * + * This routine CANNOT use cached dst->neigh! + * Really, it is used only when dst->neigh is wrong. */ - + int eth_rebuild_header(struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)skb->data; struct device *dev = skb->dev; - struct neighbour *neigh = NULL; - /* - * Only ARP/IP and NDISC/IPv6 are currently supported - */ - - if (skb->dst) - neigh = skb->dst->neighbour; - - if (neigh) - return neigh->ops->resolve(eth->h_dest, skb); - switch (eth->h_proto) { #ifdef CONFIG_INET @@ -170,11 +163,10 @@ int eth_rebuild_header(struct sk_buff *skb) dev->name, (int)eth->h_proto); memcpy(eth->h_source, dev->dev_addr, dev->addr_len); - return 0; break; } - return 0; + return 0; } @@ -204,9 +196,12 @@ unsigned short eth_type_trans(struct sk_buff *skb, struct device *dev) /* * This ALLMULTI check should be redundant by 1.4 * so don't forget to remove it. + * + * Seems, you forgot to remove it. All silly devices + * seems to set IFF_PROMISC. 
*/ - else if(dev->flags&(IFF_PROMISC|IFF_ALLMULTI)) + else if(dev->flags&(IFF_PROMISC/*|IFF_ALLMULTI*/)) { if(memcmp(eth->h_dest,dev->dev_addr, ETH_ALEN)) skb->pkt_type=PACKET_OTHERHOST; @@ -239,38 +234,18 @@ int eth_header_parse(struct sk_buff *skb, unsigned char *haddr) return ETH_ALEN; } -int eth_header_cache(struct dst_entry *dst, struct neighbour *neigh, - struct hh_cache *hh) +int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh) { unsigned short type = hh->hh_type; - struct ethhdr *eth = (struct ethhdr*)hh->hh_data; - struct device *dev = dst->dev; + struct ethhdr *eth = (struct ethhdr*)(((u8*)hh->hh_data) + 2); + struct device *dev = neigh->dev; - if (type == ETH_P_802_3) + if (type == __constant_htons(ETH_P_802_3)) return -1; - - eth->h_proto = htons(type); + eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, dev->addr_len); - - if (dev->flags & IFF_LOOPBACK) { - memset(eth->h_dest, 0, dev->addr_len); - hh->hh_uptodate = 1; - return 0; - } - - if (type != ETH_P_IP) - { - printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n",dev->name,(int)eth->h_proto); - hh->hh_uptodate = 0; - return 0; - } - -#ifdef CONFIG_INET - hh->hh_uptodate = arp_find_1(eth->h_dest, dst, neigh); -#else - hh->hh_uptodate = 0; -#endif + memcpy(eth->h_dest, neigh->ha, dev->addr_len); return 0; } @@ -280,13 +255,7 @@ int eth_header_cache(struct dst_entry *dst, struct neighbour *neigh, void eth_header_cache_update(struct hh_cache *hh, struct device *dev, unsigned char * haddr) { - if (hh->hh_type != ETH_P_IP) - { - printk(KERN_DEBUG "eth_header_cache_update: %04x cache is not implemented\n", hh->hh_type); - return; - } - memcpy(hh->hh_data, haddr, ETH_ALEN); - hh->hh_uptodate = 1; + memcpy(((u8*)hh->hh_data) + 2, haddr, dev->addr_len); } #ifndef CONFIG_IP_ROUTER diff --git a/net/ipv4/.cvsignore b/net/ipv4/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/ipv4/.cvsignore +++ b/net/ipv4/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git 
a/net/ipv4/Config.in b/net/ipv4/Config.in index 2f057ab4a..dbace1d3b 100644 --- a/net/ipv4/Config.in +++ b/net/ipv4/Config.in @@ -5,6 +5,7 @@ bool 'IP: multicasting' CONFIG_IP_MULTICAST bool 'IP: advanced router' CONFIG_IP_ADVANCED_ROUTER if [ "$CONFIG_IP_ADVANCED_ROUTER" = "y" ]; then define_bool CONFIG_RTNETLINK y + define_bool CONFIG_NETLINK y bool 'IP: policy routing' CONFIG_IP_MULTIPLE_TABLES bool 'IP: equal cost multipath' CONFIG_IP_ROUTE_MULTIPATH bool 'IP: use TOS value as routing key' CONFIG_IP_ROUTE_TOS @@ -26,6 +27,9 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then if [ "$CONFIG_IP_FIREWALL" = "y" ]; then if [ "$CONFIG_NETLINK" = "y" ]; then bool 'IP: firewall packet netlink device' CONFIG_IP_FIREWALL_NETLINK + if [ "$CONFIG_IP_FIREWALL_NETLINK" = "y" ]; then + define_bool CONFIG_NETLINK_DEV y + fi fi bool 'IP: firewall packet logging' CONFIG_IP_FIREWALL_VERBOSE bool 'IP: transparent proxy support' CONFIG_IP_TRANSPARENT_PROXY @@ -33,14 +37,16 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then fi fi bool 'IP: accounting' CONFIG_IP_ACCT -bool 'IP: masquerading' CONFIG_IP_MASQUERADE -if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then - comment 'Protocol-specific masquerading support will be built as modules.' - bool 'IP: ICMP masquerading' CONFIG_IP_MASQUERADE_ICMP - comment 'Protocol-specific masquerading support will be built as modules.' - if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - tristate 'IP: ipautofw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPAUTOFW - tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW +if [ "$CONFIG_IP_FIREWALL" = "y" ]; then + bool 'IP: masquerading' CONFIG_IP_MASQUERADE + if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then + comment 'Protocol-specific masquerading support will be built as modules.' + bool 'IP: ICMP masquerading' CONFIG_IP_MASQUERADE_ICMP + comment 'Protocol-specific masquerading support will be built as modules.' 
+ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + tristate 'IP: ipautofw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPAUTOFW + tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW + fi fi fi bool 'IP: optimize as router not host' CONFIG_IP_ROUTER @@ -56,9 +62,9 @@ if [ "$CONFIG_IP_MULTICAST" = "y" ]; then bool 'IP: PIM-SM version 2 support' CONFIG_IP_PIMSM_V2 fi fi -tristate 'IP: aliasing support' CONFIG_IP_ALIAS +bool 'IP: aliasing support' CONFIG_IP_ALIAS if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - if [ "$CONFIG_NETLINK" = "y" ]; then + if [ "$CONFIG_RTNETLINK" = "y" ]; then bool 'IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD fi fi diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ca3ff3213..584ad8c7a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * AF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.58 1997/10/29 20:27:21 kuznet Exp $ + * Version: $Id: af_inet.c,v 1.5 1997/12/16 05:37:33 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -150,16 +150,16 @@ static __inline__ void kill_sk_queues(struct sock *sk) */ if (skb->sk != NULL && skb->sk != sk) skb->sk->prot->close(skb->sk, 0); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* Next, the error queue. */ while((skb = skb_dequeue(&sk->error_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); /* Now the backlog. 
*/ while((skb=skb_dequeue(&sk->back_log)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } static __inline__ void kill_sk_now(struct sock *sk) @@ -326,7 +326,15 @@ static int inet_create(struct socket *sock, int protocol) if (sock->type == SOCK_PACKET) { static int warned; if (net_families[AF_PACKET]==NULL) + { +#if defined(CONFIG_KERNELD) && defined(CONFIG_PACKET_MODULE) + char module_name[30]; + sprintf(module_name,"net-pf-%d", AF_PACKET); + request_module(module_name); + if (net_families[AF_PACKET] == NULL) +#endif return -ESOCKTNOSUPPORT; + } if (!warned++) printk(KERN_INFO "%s uses obsolete (AF_INET,SOCK_PACKET)\n", current->comm); return net_families[AF_PACKET]->create(sock, protocol); @@ -828,13 +836,13 @@ int inet_shutdown(struct socket *sock, int how) } -unsigned int inet_poll(struct socket *sock, poll_table *wait) +unsigned int inet_poll(struct file * file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; if (sk->prot->poll == NULL) return(0); - return sk->prot->poll(sock, wait); + return sk->prot->poll(file, sock, wait); } /* @@ -904,29 +912,6 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGIFPFLAGS: case SIOCSIFFLAGS: return(devinet_ioctl(cmd,(void *) arg)); - case SIOCGIFCONF: - case SIOCGIFFLAGS: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCGIFMETRIC: - case SIOCSIFMETRIC: - case SIOCGIFMEM: - case SIOCSIFMEM: - case SIOCGIFMTU: - case SIOCSIFMTU: - case SIOCSIFLINK: - case SIOCGIFHWADDR: - case SIOCSIFHWADDR: - case SIOCSIFMAP: - case SIOCGIFMAP: - case SIOCSIFSLAVE: - case SIOCGIFSLAVE: - case SIOCGIFINDEX: - case SIOCGIFNAME: - case SIOCGIFCOUNT: - case SIOCSIFHWBROADCAST: - return(dev_ioctl(cmd,(void *) arg)); - case SIOCGIFBR: case SIOCSIFBR: #ifdef CONFIG_BRIDGE @@ -963,9 +948,9 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return(dev_ioctl(cmd,(void *) arg)); #endif - if (sk->prot->ioctl==NULL) - return(-EINVAL); - 
return(sk->prot->ioctl(sk, cmd, arg)); + if (sk->prot->ioctl==NULL || (err=sk->prot->ioctl(sk, cmd, arg))==-ENOIOCTLCMD) + return(dev_ioctl(cmd,(void *) arg)); + return err; } /*NOTREACHED*/ return(0); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 58bb4174a..94ae4263e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.3 1997/12/16 05:37:34 ralf Exp $ + * Version: $Id: arp.c,v 1.4 1998/03/03 01:23:36 ralf Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -53,6 +53,7 @@ * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support + * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. @@ -62,6 +63,8 @@ * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. + * Alexey Kuznetsov: new arp state machine; + * now it is in net/core/neighbour.c. */ /* RFC1122 Status: @@ -95,6 +98,9 @@ #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif #include <net/ip.h> #include <net/icmp.h> @@ -109,1113 +115,298 @@ #include <net/netrom.h> #endif #endif -#ifdef CONFIG_ARPD -#include <net/netlink.h> -#endif #include <asm/system.h> #include <asm/uaccess.h> /* - * Configurable Parameters - */ - -/* - * After that time, an unused entry is deleted from the arp table. - * RFC1122 recommends set it to 60*HZ, if your site uses proxy arp - * and dynamic routing. - */ - -#define ARP_TIMEOUT (60*HZ) - -int sysctl_arp_timeout = ARP_TIMEOUT; - -/* - * How often is ARP cache checked for expire. - * It is useless to set ARP_CHECK_INTERVAL > ARP_TIMEOUT - */ - -#define ARP_CHECK_INTERVAL (60*HZ) - -int sysctl_arp_check_interval = ARP_CHECK_INTERVAL; - -/* - * Soft limit on ARP cache size. 
- */ - -#if RT_CACHE_DEBUG >= 2 -#define ARP_MAXSIZE 4 -#else -#ifdef CONFIG_ARPD -#define ARP_MAXSIZE 64 -#else -#define ARP_MAXSIZE 256 -#endif /* CONFIG_ARPD */ -#endif - -/* - * Limit on unresolved ARP cache entries. - */ -#define ARP_MAX_UNRES (ARP_MAXSIZE/2) - -/* - * Maximal number of skb's queued for resolution. - */ -#define ARP_MAX_UNRES_PACKETS 3 - -/* - * If an arp request is send, ARP_RES_TIME is the timeout value until the - * next request is send. - * RFC1122: OK. Throttles ARPing, as per 2.3.2.1. (MUST) - * The recommended minimum timeout is 1 second per destination. - * - */ - -#define ARP_RES_TIME (5*HZ) - -int sysctl_arp_res_time = ARP_RES_TIME; - -/* - * The number of times an broadcast arp request is send, until - * the host is considered temporarily unreachable. - */ - -#define ARP_MAX_TRIES 3 - -int sysctl_arp_max_tries = ARP_MAX_TRIES; - -/* - * The entry is reconfirmed by sending point-to-point ARP - * request after ARP_CONFIRM_INTERVAL. - * RFC1122 recommends 60*HZ. - * - * Warning: there exist nodes, that answer only broadcast - * ARP requests (Cisco-4000 in hot standby mode?) - * Now arp code should work with such nodes, but - * it still will generate redundant broadcast requests, so that - * this interval should be enough long. - */ - -#define ARP_CONFIRM_INTERVAL (300*HZ) - -int sysctl_arp_confirm_interval = ARP_CONFIRM_INTERVAL; - -/* - * We wait for answer to unicast request for ARP_CONFIRM_TIMEOUT. - */ - -#define ARP_CONFIRM_TIMEOUT ARP_RES_TIME - -int sysctl_arp_confirm_timeout = ARP_CONFIRM_TIMEOUT; - -/* - * The number of times an unicast arp request is retried, until - * the cache entry is considered suspicious. - * Value 0 means that no unicast pings will be sent. - * RFC1122 recommends 2. 
- */ - -#define ARP_MAX_PINGS 1 - -int sysctl_arp_max_pings = ARP_MAX_PINGS; - -/* - * When a host is dead, but someone tries to connect it, - * we do not remove corresponding cache entry (it would - * be useless, it will be created again immediately) - * Instead we prolongate interval between broadcasts - * to ARP_DEAD_RES_TIME. - * This interval should be not very long. - * (When the host will be up again, we will notice it only - * when ARP_DEAD_RES_TIME expires, or when the host will arp us. - */ - -#define ARP_DEAD_RES_TIME (60*HZ) - -int sysctl_arp_dead_res_time = ARP_DEAD_RES_TIME; - -static void arp_neigh_destroy(struct neighbour *neigh); - -/* * Interface to generic neighbour cache. */ +static int arp_constructor(struct neighbour *neigh); +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); +static void parp_redo(struct sk_buff *skb); -struct neigh_ops arp_neigh_ops = { +static struct neigh_ops arp_generic_ops = +{ AF_INET, NULL, - arp_find, - arp_neigh_destroy + arp_solicit, + arp_error_report, + neigh_resolve_output, + neigh_connected_output, + ip_acct_output, + ip_acct_output }; - -static atomic_t arp_size = ATOMIC_INIT(0); -static atomic_t arp_unres_size = ATOMIC_INIT(0); - -#ifdef CONFIG_ARPD -static int arpd_not_running; -static int arpd_stamp; -struct sock *arpd_sk; -#endif - -static void arp_check_expire (unsigned long); -static int arp_update (u32 sip, char *sha, struct device * dev, - unsigned long updated, int grat); - -static struct timer_list arp_timer = - { NULL, NULL, ARP_CHECK_INTERVAL, 0L, &arp_check_expire }; - -/* - * The default arp netmask is just 255.255.255.255 which means it's - * a single machine entry. Only proxy entries can have other netmasks - */ - -#define DEF_ARP_NETMASK (~0) - -/* - * The size of the hash table. Must be a power of two. 
- */ - -#define ARP_TABLE_SIZE 16 -#define FULL_ARP_TABLE_SIZE (ARP_TABLE_SIZE+1) - -struct arp_table *arp_tables[FULL_ARP_TABLE_SIZE] = +static struct neigh_ops arp_hh_ops = { + AF_INET, NULL, + arp_solicit, + arp_error_report, + neigh_resolve_output, + neigh_resolve_output, + ip_acct_output, + ip_acct_output }; -#define arp_proxy_list arp_tables[ARP_TABLE_SIZE] - -/* - * The last bits in the IP address are used for the cache lookup. - * A special entry is used for proxy arp entries - */ - -#define HASH(paddr) (htonl(paddr) & (ARP_TABLE_SIZE - 1)) - -/* - * Hardware header cache. - * - */ - -/* - * Signal to device layer, that hardware address may be changed. - */ - -static __inline__ void arp_update_hhs(struct arp_table * entry) +static struct neigh_ops arp_direct_ops = { - struct hh_cache *hh; - void (*update)(struct hh_cache*, struct device*, unsigned char*) = - entry->u.neigh.dev->header_cache_update; - -#if RT_CACHE_DEBUG >= 1 - if (!update && entry->u.neigh.hh) - { - printk(KERN_DEBUG "arp_update_hhs: no update callback for %s\n", entry->u.neigh.dev->name); - return; - } -#endif - for (hh=entry->u.neigh.hh; hh; hh=hh->hh_next) - update(hh, entry->u.neigh.dev, entry->u.neigh.ha); -} - -/* - * Invalidate all hh's, so that higher level will not try to use it. - */ - -static __inline__ void arp_invalidate_hhs(struct arp_table * entry) -{ - struct hh_cache *hh; - - for (hh=entry->u.neigh.hh; hh; hh=hh->hh_next) - hh->hh_uptodate = 0; -} - -/* - * Purge all linked skb's of the entry. - */ + AF_INET, + NULL, + NULL, + NULL, + ip_acct_output, + ip_acct_output, + ip_acct_output, + ip_acct_output +}; -static void arp_purge_send_q(struct arp_table *entry) +#if defined(CONFIG_AX25) || defined(CONFIG_AX25) || \ + defined(CONFIG_SHAPER) || defined(CONFIG_SHAPER_MODULE) +struct neigh_ops arp_broken_ops = { - struct sk_buff *skb; - - /* Release the list of `skb' pointers. 
*/ - while ((skb = skb_dequeue(&entry->u.neigh.arp_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); -} + AF_INET, + NULL, + arp_solicit, + arp_error_report, + neigh_compat_output, + neigh_compat_output, + ip_acct_output, + ip_acct_output, +}; +#endif -static void arp_free(struct arp_table **entryp) +struct neigh_table arp_tbl = { - struct arp_table *entry = *entryp; - *entryp = entry->u.next; - - if (!(entry->flags&ATF_PUBL)) { - atomic_dec(&arp_size); - if (!(entry->flags&ATF_COM)) - atomic_dec(&arp_unres_size); - } - del_timer(&entry->timer); - arp_purge_send_q(entry); - arp_invalidate_hhs(entry); - - neigh_destroy(&entry->u.neigh); -} - + NULL, + AF_INET, + sizeof(struct neighbour) + 4, + 4, + arp_constructor, + NULL, + NULL, + parp_redo, + { NULL, NULL, &arp_tbl, 0, NULL, NULL, + 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 1*HZ, 64 }, + 30*HZ, 128, 512, 1024, +}; -static void arp_neigh_destroy(struct neighbour *neigh) +int arp_mc_map(u32 addr, u8 *haddr, struct device *dev, int dir) { - struct arp_table *entry = (struct arp_table*)neigh; - struct hh_cache *hh, *next; - - del_timer(&entry->timer); - arp_purge_send_q(entry); - - hh = entry->u.neigh.hh; - entry->u.neigh.hh = NULL; - - for ( ; hh; hh = next) - { - next = hh->hh_next; - hh->hh_uptodate = 0; - hh->hh_next = NULL; - if (atomic_dec_and_test(&hh->hh_refcnt)) - { -#if RT_CACHE_DEBUG >= 2 - extern atomic_t hh_count; - atomic_dec(&hh_count); -#endif - kfree_s(hh, sizeof(struct hh_cache)); + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_IEEE802: + case ARPHRD_FDDI: + ip_eth_mc_map(addr, haddr); + return 0; + default: + if (dir) { + memcpy(haddr, dev->broadcast, dev->addr_len); + return 0; } } + return -EINVAL; } -#ifdef CONFIG_ARPD - -/* - * Send ARPD message. 
- */ -static void arpd_send(int req, u32 addr, struct device * dev, char *ha, - unsigned long updated) -{ - int retval; - struct sk_buff *skb; - struct arpd_request *arpreq; - - if (arpd_not_running) - return; - - skb = alloc_skb(sizeof(struct arpd_request), GFP_ATOMIC); - if (skb == NULL) - return; - - arpreq=(struct arpd_request *)skb_put(skb, sizeof(struct arpd_request)); - arpreq->req = req; - arpreq->ip = addr; - arpreq->dev = (unsigned long)dev; - arpreq->stamp = arpd_stamp; - arpreq->updated = updated; - if (ha) - memcpy(arpreq->ha, ha, sizeof(arpreq->ha)); - - retval = netlink_post(NETLINK_ARPD, skb); - if (retval) - { - kfree_skb(skb, FREE_WRITE); - if (retval == -EUNATCH) - arpd_not_running = 1; - } -} - -/* - * Send ARPD update message. - */ - -static __inline__ void arpd_update(u32 ip, struct device *dev, char *ha) -{ - arpd_send(ARPD_UPDATE, ip, dev, ha, jiffies); -} - - -/* - * Send ARPD lookup request. - */ -static __inline__ void arpd_lookup(u32 addr, struct device * dev) +static int arp_constructor(struct neighbour *neigh) { - arpd_send(ARPD_LOOKUP, addr, dev, NULL, 0); -} + u32 addr = *(u32*)neigh->primary_key; + struct device *dev = neigh->dev; + struct in_device *in_dev = dev->ip_ptr; -/* - * Send ARPD flush message. 
- */ - -static __inline__ void arpd_flush(struct device * dev) -{ - arpd_send(ARPD_FLUSH, 0, dev, NULL, 0); -} - - -static int arpd_callback(struct sk_buff *skb, struct sock *sk) -{ - struct device * dev; - struct arpd_request *retreq; - - arpd_not_running = 0; - - if (skb->len != sizeof(struct arpd_request)) - { - kfree_skb(skb, FREE_READ); + if (in_dev == NULL) return -EINVAL; - } - retreq = (struct arpd_request *)skb->data; - dev = (struct device*)retreq->dev; + neigh->type = inet_addr_type(addr); + if (in_dev->arp_parms) + neigh->parms = in_dev->arp_parms; - if (retreq->stamp != arpd_stamp || !dev) - { - kfree_skb(skb, FREE_READ); - return -EINVAL; - } - - if (!retreq->updated) - { -/* - * Invalid mapping: drop it and send ARP broadcast. - */ - arp_send(ARPOP_REQUEST, ETH_P_ARP, retreq->ip, dev, - inet_select_addr(dev, retreq->ip, RT_SCOPE_LINK), - NULL, - dev->dev_addr, NULL); - } - else - { - start_bh_atomic(); - arp_update(retreq->ip, retreq->ha, dev, retreq->updated, 0); - end_bh_atomic(); - } - - kfree_skb(skb, FREE_READ); - return sizeof(struct arpd_request); -} - -#else - -static __inline__ void arpd_update(u32 ip, struct device *dev, char *ha) -{ - return; -} - -#endif /* CONFIG_ARPD */ - - - - -/* - * ARP expiration routines. - */ - -/* - * Force the expiry of an entry in the internal cache so the memory - * can be used for a new request. 
- */ - -static int arp_force_expire(void) -{ - int i; - struct arp_table *entry, **pentry; - struct arp_table **oldest_entry = NULL; - unsigned long oldest_used = ~0; - unsigned long now = jiffies; - int result = 0; - - static int last_index; - - if (last_index >= ARP_TABLE_SIZE) - last_index = 0; - - for (i = 0; i < ARP_TABLE_SIZE; i++, last_index++) - { - pentry = &arp_tables[last_index & (ARP_TABLE_SIZE-1)]; - - while ((entry = *pentry) != NULL) - { - if (!(entry->flags & ATF_PERM)) - { - if (!atomic_read(&entry->u.neigh.refcnt) && - now - entry->u.neigh.lastused > sysctl_arp_timeout) - { -#if RT_CACHE_DEBUG >= 2 - printk("arp_force_expire: %08x expired\n", entry->ip); -#endif - arp_free(pentry); - result++; - if (atomic_read(&arp_size) < ARP_MAXSIZE) - goto done; - continue; - } - if (!atomic_read(&entry->u.neigh.refcnt) && - entry->u.neigh.lastused < oldest_used) - { - oldest_entry = pentry; - oldest_used = entry->u.neigh.lastused; - } - } - pentry = &entry->u.next; - } - } - -done: - if (result || !oldest_entry) - return result; + if (dev->hard_header == NULL) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &arp_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + /* Good devices (checked by reading texts, but only ethernet is + tested) + + ARPHRD_ETHER: (ethernet, apfddi) + ARPHRD_FDDI: (fddi) + ARPHRD_IEEE802: (tr) + ARPHRD_METRICOM: (strip) + ARPHRD_ARCNET: + etc. etc. etc. + + ARPHRD_IPDDP will also work, if author repaires it. + I did not it, because this driver does not work even + in old paradigm. + */ -#if RT_CACHE_DEBUG >= 2 - printk("arp_force_expire: expiring %08x\n", (*oldest_entry)->ip); +#if 1 + /* So... these "amateur" devices are hopeless. + The only thing, that I can say now: + It is very sad that we need to keep ugly obsolete + code to make them happy. + + They should be moved to more reasonable state, now + they use rebuild_header INSTEAD OF hard_start_xmit!!! 
+ Besides that, they are sort of out of date + (a lot of redundant clones/copies, useless in 2.1), + I wonder why people believe that they work. + */ + switch (dev->type) { + default: + break; + case ARPHRD_ROSE: +#if defined(CONFIG_AX25) || defined(CONFIG_AX25) + case ARPHRD_AX25: +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: #endif - arp_free(oldest_entry); - return 1; -} - -static void arp_unres_expire(void) -{ - int i; - struct arp_table *entry, **pentry; - unsigned long now = jiffies; - - for (i = 0; i < ARP_TABLE_SIZE; i++) { - pentry = &arp_tables[i & (ARP_TABLE_SIZE-1)]; - - while ((entry = *pentry) != NULL) { - if (!(entry->flags & (ATF_PERM|ATF_COM)) && - (entry->retries < sysctl_arp_max_tries || - entry->timer.expires - now < - sysctl_arp_res_time - sysctl_arp_res_time/32)) { - if (!atomic_read(&entry->u.neigh.refcnt)) { -#if RT_CACHE_DEBUG >= 2 - printk("arp_unres_expire: %08x discarded\n", entry->ip); + neigh->ops = &arp_broken_ops; + neigh->output = neigh->ops->output; + return 0; #endif - arp_free(pentry); - continue; - } - arp_purge_send_q(entry); - } - pentry = &entry->u.next; } - } -} - - -/* - * Check if there are entries that are too old and remove them. If the - * ATF_PERM flag is set, they are always left in the arp cache (permanent - * entries). If an entry was not confirmed for ARP_CONFIRM_INTERVAL, - * send point-to-point ARP request. - * If it will not be confirmed for ARP_CONFIRM_TIMEOUT, - * give it to shred by arp_expire_entry. 
- */ - -static void arp_check_expire(unsigned long dummy) -{ - int i; - unsigned long now = jiffies; - - del_timer(&arp_timer); - -#ifdef CONFIG_ARPD - arpd_not_running = 0; -#endif - - ip_rt_check_expire(); - - for (i = 0; i < ARP_TABLE_SIZE; i++) - { - struct arp_table *entry, **pentry; - - pentry = &arp_tables[i]; - - while ((entry = *pentry) != NULL) - { - if (entry->flags & ATF_PERM) - { - pentry = &entry->u.next; - continue; - } - - if (!atomic_read(&entry->u.neigh.refcnt) && - now - entry->u.neigh.lastused > sysctl_arp_timeout) - { -#if RT_CACHE_DEBUG >= 2 - printk("arp_expire: %08x expired\n", entry->ip); #endif - arp_free(pentry); - continue; - } - if (entry->last_updated && - now - entry->last_updated > sysctl_arp_confirm_interval) - { - struct device * dev = entry->u.neigh.dev; - entry->retries = sysctl_arp_max_tries+sysctl_arp_max_pings; - del_timer(&entry->timer); - entry->timer.expires = jiffies + ARP_CONFIRM_TIMEOUT; - add_timer(&entry->timer); - arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, - dev, inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), - entry->u.neigh.ha, dev->dev_addr, NULL); -#if RT_CACHE_DEBUG >= 2 - printk("arp_expire: %08x requires confirmation\n", entry->ip); -#endif - } - pentry = &entry->u.next; /* go to next entry */ + if (neigh->type == RTN_MULTICAST) { + neigh->nud_state = NUD_NOARP; + arp_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); } + if (dev->hard_header_cache) + neigh->ops = &arp_hh_ops; + else + neigh->ops = &arp_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; } - /* - * Set the timer again. 
- */ - - arp_timer.expires = jiffies + sysctl_arp_check_interval; - add_timer(&arp_timer); -} - -/* - * This function is called, if an entry is not resolved in ARP_RES_TIME. - * When more than MAX_ARP_TRIES retries was done, release queued skb's, - * but not discard entry itself if it is in use. - */ - -static void arp_expire_request (unsigned long arg) -{ - struct arp_table *entry = (struct arp_table *) arg; - struct arp_table **pentry; - unsigned long hash; - - del_timer(&entry->timer); - - /* If entry is COMPLETE but old, - * it means that point-to-point ARP ping has been failed - * (It really occurs with Cisco 4000 routers) - * We should reconfirm it. - */ - - if ((entry->flags & ATF_COM) && entry->last_updated - && jiffies - entry->last_updated <= sysctl_arp_confirm_interval) - return; - - if (entry->last_updated && --entry->retries > 0) - { - struct device *dev = entry->u.neigh.dev; - -#if RT_CACHE_DEBUG >= 2 - printk("arp_expire_request: %08x timed out\n", entry->ip); -#endif - /* Set new timer. */ - entry->timer.expires = jiffies + sysctl_arp_res_time; - add_timer(&entry->timer); - arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, - inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), - entry->retries > sysctl_arp_max_tries ? entry->u.neigh.ha : NULL, - dev->dev_addr, NULL); - return; - } - - /* - * The host is really dead. - */ - - arp_purge_send_q(entry); - - if (atomic_read(&entry->u.neigh.refcnt)) - { - /* - * The host is dead, but someone refers to it. - * It is useless to drop this entry just now, - * it will be born again, so that - * we keep it, but slow down retransmitting - * to ARP_DEAD_RES_TIME. - */ - - struct device *dev = entry->u.neigh.dev; -#if RT_CACHE_DEBUG >= 2 - printk("arp_expire_request: %08x is dead\n", entry->ip); -#endif - entry->retries = sysctl_arp_max_tries; - if (entry->flags&ATF_COM) - atomic_inc(&arp_unres_size); - entry->flags &= ~ATF_COM; - arp_invalidate_hhs(entry); - - /* - * Declare the entry dead. 
- */ - entry->last_updated = 0; - - entry->timer.expires = jiffies + sysctl_arp_dead_res_time; - add_timer(&entry->timer); - arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, - inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), - NULL, dev->dev_addr, NULL); - return; - } - - entry->last_updated = 0; - - hash = HASH(entry->ip); - - pentry = &arp_tables[hash]; - - while (*pentry != NULL) - { - if (*pentry != entry) - { - pentry = &(*pentry)->u.next; - continue; - } -#if RT_CACHE_DEBUG >= 2 - printk("arp_expire_request: %08x is killed\n", entry->ip); -#endif - arp_free(pentry); - } + return 0; } - -/* - * Allocate memory for a new entry. If we are at the maximum limit - * of the internal ARP cache, arp_force_expire() an entry. - */ - -static struct arp_table * arp_alloc(int how) +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { - struct arp_table * entry; - - if (how && atomic_read(&arp_size) >= ARP_MAXSIZE) - arp_force_expire(); - if (how > 1 && atomic_read(&arp_unres_size) >= ARP_MAX_UNRES) { - arp_unres_expire(); - if (atomic_read(&arp_unres_size) >= ARP_MAX_UNRES) { - printk(KERN_DEBUG "arp_unres_size=%d\n", - atomic_read(&arp_unres_size)); - return NULL; - } - } - - entry = (struct arp_table *)neigh_alloc(sizeof(struct arp_table), - &arp_neigh_ops); - if (entry != NULL) { - atomic_set(&entry->u.neigh.refcnt, 1); - - if (how) - atomic_inc(&arp_size); - - entry->mask = DEF_ARP_NETMASK; - init_timer(&entry->timer); - entry->timer.function = arp_expire_request; - entry->timer.data = (unsigned long)entry; - entry->last_updated = jiffies; - } - return entry; + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + kfree_skb(skb); } - - -/* - * Purge a device from the ARP queue - */ - -int arp_device_event(struct notifier_block *this, unsigned long event, void *ptr) +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { - struct device *dev=ptr; - int i; - - if (event != NETDEV_DOWN) - return NOTIFY_DONE; + u32 saddr; + 
u8 *dst_ha = NULL; + struct device *dev = neigh->dev; + u32 target = *(u32*)neigh->primary_key; + int probes = neigh->probes; + + if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) + saddr = skb->nh.iph->saddr; + else + saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); -#ifdef CONFIG_ARPD - arpd_flush(dev); - arpd_stamp++; + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state&NUD_VALID)) + printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); + dst_ha = neigh->ha; + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); #endif - - for (i = 0; i < FULL_ARP_TABLE_SIZE; i++) - { - struct arp_table *entry; - struct arp_table **pentry = &arp_tables[i]; - - start_bh_atomic(); - - while ((entry = *pentry) != NULL) - { - if (entry->u.neigh.dev != dev) - { - pentry = &entry->u.next; - continue; - } - arp_free(pentry); - } - - end_bh_atomic(); + return; } - return NOTIFY_DONE; -} - - - -/* - * This will try to retransmit everything on the queue. - */ - -static void arp_send_q(struct arp_table *entry) -{ - struct sk_buff *skb; - while((skb = skb_dequeue(&entry->u.neigh.arp_queue)) != NULL) - dev_queue_xmit(skb); + arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_ha, dev->dev_addr, NULL); } +/* OBSOLETE FUNCTIONS */ -static int -arp_update (u32 sip, char *sha, struct device * dev, - unsigned long updated, int grat) -{ - struct arp_table * entry; - unsigned long hash; - - if (updated == 0) - { - updated = jiffies; - arpd_update(sip, dev, sha); - } - - hash = HASH(sip); - - for (entry=arp_tables[hash]; entry; entry = entry->u.next) - if (entry->ip == sip && entry->u.neigh.dev == dev) - break; - - if (entry) - { /* - * Entry found; update it only if it is not a permanent entry. 
- */ - if (!(entry->flags & ATF_PERM)) - { - del_timer(&entry->timer); - entry->last_updated = updated; - if (memcmp(entry->u.neigh.ha, sha, dev->addr_len) != 0) - { - memcpy(entry->u.neigh.ha, sha, dev->addr_len); - if (entry->flags & ATF_COM) - arp_update_hhs(entry); - } - } - - if (!(entry->flags & ATF_COM)) - { -/* - * Switch to complete status. - */ - entry->flags |= ATF_COM; - atomic_dec(&arp_unres_size); - arp_update_hhs(entry); -/* - * Send out waiting packets. - */ - arp_send_q(entry); - } - return 1; - } - -/* - * No entry found. Need to add a new entry to the arp table. + * Find an arp mapping in the cache. If not found, post a request. + * + * It is very UGLY routine: it DOES NOT use skb->dst->neighbour, + * even if it exists. It is supposed that skb->dev was mangled + * by a virtual device (eql, shaper). Nobody but broken devices + * is allowed to use this function, it is scheduled to be removed. --ANK */ - if (grat) - return 0; - - entry = arp_alloc(1); - if (!entry) - return 0; - - entry->ip = sip; - entry->flags = ATF_COM; - memcpy(entry->u.neigh.ha, sha, dev->addr_len); - entry->u.neigh.dev = dev; - entry->hatype = dev->type; - entry->last_updated = updated; - - entry->u.next = arp_tables[hash]; - arp_tables[hash] = entry; - neigh_release(&entry->u.neigh); - return 0; -} - - - -static __inline__ struct arp_table *arp_lookup(u32 paddr, struct device * dev) -{ - struct arp_table *entry; - - for (entry = arp_tables[HASH(paddr)]; entry != NULL; entry = entry->u.next) - if (entry->ip == paddr && entry->u.neigh.dev == dev) - break; - return entry; -} static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct device * dev) { - switch (addr_hint) - { - case RTN_LOCAL: - printk(KERN_DEBUG "ARP: arp called for own IP address\n"); - memcpy(haddr, dev->dev_addr, dev->addr_len); - return 1; - case RTN_MULTICAST: - if(dev->type==ARPHRD_ETHER || dev->type==ARPHRD_IEEE802 - || dev->type==ARPHRD_FDDI) - { - u32 taddr; - haddr[0]=0x01; - 
haddr[1]=0x00; - haddr[2]=0x5e; - taddr=ntohl(paddr); - haddr[5]=taddr&0xff; - taddr=taddr>>8; - haddr[4]=taddr&0xff; - taddr=taddr>>8; - haddr[3]=taddr&0x7f; - return 1; - } - /* - * If a device does not support multicast broadcast the stuff (eg AX.25 for now) - */ - - case RTN_BROADCAST: - memcpy(haddr, dev->broadcast, dev->addr_len); - return 1; + switch (addr_hint) { + case RTN_LOCAL: + printk(KERN_DEBUG "ARP: arp called for own IP address\n"); + memcpy(haddr, dev->dev_addr, dev->addr_len); + return 1; + case RTN_MULTICAST: + arp_mc_map(paddr, haddr, dev, 1); + return 1; + case RTN_BROADCAST: + memcpy(haddr, dev->broadcast, dev->addr_len); + return 1; } return 0; } -static void arp_start_resolution(struct arp_table *entry) -{ - struct device * dev = entry->u.neigh.dev; - - del_timer(&entry->timer); - entry->timer.expires = jiffies + sysctl_arp_res_time; - entry->retries = sysctl_arp_max_tries; - add_timer(&entry->timer); -#ifdef CONFIG_ARPD - if (!arpd_not_running) - arpd_lookup(entry->ip, dev); - else -#endif - arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, - inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), NULL, - dev->dev_addr, NULL); -} - -/* - * Create a new unresolved entry. - * - * NOTE: Always make sure no possibility of sleeping is introduced here, - * since nearly all callers are inside of BH atomic. Don't let - * the arp_alloc() fool you, at neigh_alloc() it is using GFP_ATOMIC - * always. 
- */ - -struct arp_table * arp_new_entry(u32 paddr, struct device *dev, struct sk_buff *skb) -{ - struct arp_table *entry; - unsigned long hash = HASH(paddr); - - entry = arp_alloc(2); - - if (entry != NULL) - { - entry->ip = paddr; - entry->u.neigh.dev = dev; - entry->hatype = dev->type; - - if (skb != NULL) - skb_queue_tail(&entry->u.neigh.arp_queue, skb); - - atomic_inc(&arp_unres_size); - entry->u.next = arp_tables[hash]; - arp_tables[hash] = entry; - arp_start_resolution(entry); - neigh_release(&entry->u.neigh); - } - return entry; -} - - -/* - * Find an arp mapping in the cache. If not found, post a request. - */ - int arp_find(unsigned char *haddr, struct sk_buff *skb) { struct device *dev = skb->dev; u32 paddr; - struct arp_table *entry; + struct neighbour *n; if (!skb->dst) { - printk(KERN_DEBUG "arp_find called with dst==NULL\n"); + printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); + kfree_skb(skb); return 1; } paddr = ((struct rtable*)skb->dst)->rt_gateway; - if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) { - skb->arp = 1; + if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) return 0; - } start_bh_atomic(); + n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); - /* - * Find an entry - */ - entry = arp_lookup(paddr, dev); - - if (entry != NULL) /* It exists */ - { - if (entry->flags & ATF_COM) - { - entry->u.neigh.lastused = jiffies; - memcpy(haddr, entry->u.neigh.ha, dev->addr_len); - skb->arp = 1; + if (n) { + n->used = jiffies; + if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { + memcpy(haddr, n->ha, dev->addr_len); + neigh_release(n); end_bh_atomic(); return 0; } - - /* - * A request was already sent, but no reply yet. 
Thus - * queue the packet with the previous attempt - */ - - if (entry->last_updated) { - if (entry->u.neigh.arp_queue.qlen < ARP_MAX_UNRES_PACKETS) - skb_queue_tail(&entry->u.neigh.arp_queue, skb); - else - kfree_skb(skb, FREE_WRITE); - } else { - /* If last_updated==0 host is dead, so - * drop skb's and set socket error. - */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); - kfree_skb(skb, FREE_WRITE); - } - end_bh_atomic(); - return 1; - } - - entry = arp_new_entry(paddr, dev, skb); - - if (entry == NULL) - kfree_skb(skb, FREE_WRITE); - + } else + kfree_skb(skb); + neigh_release(n); end_bh_atomic(); return 1; } -int arp_find_1(unsigned char *haddr, struct dst_entry *dst, - struct neighbour *neigh) -{ - struct rtable *rt = (struct rtable*)dst; - struct device *dev = dst->dev; - u32 paddr = rt->rt_gateway; - struct arp_table *entry; - - if (!neigh) - { - if (rt->rt_type == RTN_MULTICAST && - (dev->type == ARPHRD_ETHER || - dev->type == ARPHRD_IEEE802 || - dev->type == ARPHRD_FDDI)) - { - u32 taddr; - haddr[0]=0x01; - haddr[1]=0x00; - haddr[2]=0x5e; - taddr=ntohl(paddr); - haddr[5]=taddr&0xff; - taddr=taddr>>8; - haddr[4]=taddr&0xff; - taddr=taddr>>8; - haddr[3]=taddr&0x7f; - return 1; - } - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - { - memcpy(haddr, dev->broadcast, dev->addr_len); - return 1; - } - if (rt->rt_flags & RTCF_LOCAL) - { - printk(KERN_DEBUG "ARP: arp called for own IP address\n"); - memcpy(haddr, dev->dev_addr, dev->addr_len); - return 1; - } - return 0; - } +/* END OF OBSOLETE FUNCTIONS */ - start_bh_atomic(); - - entry = (struct arp_table*)neigh; - - if (entry->flags & ATF_COM) - { - entry->u.neigh.lastused = jiffies; - memcpy(haddr, entry->u.neigh.ha, dev->addr_len); - end_bh_atomic(); - return 1; - } - - end_bh_atomic(); - return 0; -} - - -struct neighbour* arp_find_neighbour(struct dst_entry *dst, int resolve) +/* + * Note: requires bh_atomic locking. 
+ */ +int arp_bind_neighbour(struct dst_entry *dst) { - struct rtable *rt = (struct rtable*)dst; - struct device *dev = rt->u.dst.dev; - u32 paddr = rt->rt_gateway; - struct arp_table *entry; - - if (dst->ops->family != AF_INET) - return NULL; - - if ((dev->flags & (IFF_LOOPBACK|IFF_NOARP)) || - (rt->rt_flags & (RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST))) - return NULL; - - start_bh_atomic(); - - /* - * Find an entry - */ - entry = arp_lookup(paddr, dev); - - if (entry != NULL) /* It exists */ - { - atomic_inc(&entry->u.neigh.refcnt); - end_bh_atomic(); - entry->u.neigh.lastused = jiffies; - return (struct neighbour*)entry; - } - - if (!resolve) { - end_bh_atomic(); - return NULL; - } - - entry = arp_new_entry(paddr, dev, NULL); - - if (entry) - atomic_inc(&entry->u.neigh.refcnt); - - end_bh_atomic(); + struct device *dev = dst->dev; - return (struct neighbour*)entry; + if (dev == NULL) + return 0; + if (dst->neighbour == NULL) + dst->neighbour = __neigh_lookup(&arp_tbl, &((struct rtable*)dst)->rt_gateway, dev, 1); + return (dst->neighbour != NULL); } /* @@ -1250,22 +441,22 @@ void arp_send(int type, int ptype, u32 dest_ip, skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) + dev->hard_header_len + 15, GFP_ATOMIC); if (skb == NULL) - { - printk(KERN_DEBUG "ARP: no memory to send an arp packet\n"); return; - } skb_reserve(skb, (dev->hard_header_len+15)&~15); skb->nh.raw = skb->data; arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); - skb->arp = 1; skb->dev = dev; skb->protocol = __constant_htons (ETH_P_ARP); + if (src_hw == NULL) + src_hw = dev->dev_addr; + if (dest_hw == NULL) + dest_hw = dev->broadcast; /* * Fill the device header for the ARP frame */ - dev->hard_header(skb,dev,ptype,dest_hw?dest_hw:dev->broadcast,src_hw?src_hw:NULL,skb->len); + dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len); /* * Fill out the arp protocol part. 
@@ -1273,24 +464,38 @@ void arp_send(int type, int ptype, u32 dest_ip, * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ -#ifdef CONFIG_FDDI - arp->ar_hrd = (dev->type == ARPHRD_FDDI) ? htons(ARPHRD_ETHER) : htons(dev->type); -#else - arp->ar_hrd = htons(dev->type); -#endif /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. */ + switch (dev->type) { + default: + arp->ar_hrd = htons(dev->type); + arp->ar_pro = __constant_htons(ETH_P_IP); + break; + #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + arp->ar_hrd = __constant_htons(ARPHRD_AX25); + arp->ar_pro = __constant_htons(AX25_P_IP); + break; + #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) - arp->ar_pro = (dev->type == ARPHRD_AX25 || dev->type == ARPHRD_NETROM) ? htons(AX25_P_IP) : htons(ETH_P_IP); -#else - arp->ar_pro = (dev->type != ARPHRD_AX25) ? 
htons(ETH_P_IP) : htons(AX25_P_IP); + case ARPHRD_NETROM: + arp->ar_hrd = __constant_htons(ARPHRD_NETROM); + arp->ar_pro = __constant_htons(AX25_P_IP); + break; #endif -#else - arp->ar_pro = __constant_htons(ETH_P_IP); #endif + +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: + arp->ar_hrd = __constant_htons(ARPHRD_ETHER); + arp->ar_pro = __constant_htons(ETH_P_IP); + break; +#endif + } + arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); @@ -1308,24 +513,13 @@ void arp_send(int type, int ptype, u32 dest_ip, arp_ptr+=dev->addr_len; memcpy(arp_ptr, &dest_ip, 4); skb->dev = dev; - skb->priority = 0; dev_queue_xmit(skb); } -static __inline__ int arp_check_published(u32 tip, struct device *dev) +static void parp_redo(struct sk_buff *skb) { - struct arp_table *entry; - - for (entry = arp_proxy_list; entry; entry = entry->u.next) { - if (!((entry->ip^tip)&entry->mask) && - ((!entry->u.neigh.dev && - (!(entry->flags & ATF_COM) || entry->hatype == dev->type)) - || entry->u.neigh.dev == dev) ) - break; - } - - return entry && !(entry->flags & ATF_DONTPUB); + arp_rcv(skb, skb->dev, NULL); } /* @@ -1340,6 +534,9 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) unsigned char *sha, *tha; u32 sip, tip; u16 dev_type = dev->type; + int addr_type; + struct in_device *in_dev = dev->ip_ptr; + struct neighbour *n; /* * The hardware length of the packet should match the hardware length @@ -1348,76 +545,59 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * is not from an IP number. We can't currently handle this, so toss * it. 
*/ -#if defined(CONFIG_FDDI) - if (dev_type == ARPHRD_FDDI) - { + if (in_dev == NULL || + arp->ar_hln != dev->addr_len || + dev->flags & IFF_NOARP || + skb->pkt_type == PACKET_OTHERHOST || + skb->pkt_type == PACKET_LOOPBACK || + arp->ar_pln != 4) + goto out; + + switch (dev_type) { + default: + if (arp->ar_pro != __constant_htons(ETH_P_IP)) + goto out; + if (htons(dev_type) != arp->ar_hrd) + goto out; + break; +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP hardware types * of 1 (Ethernet). However, to be more robust, we'll accept hardware * types of either 1 (Ethernet) or 6 (IEEE 802.2). */ - - if (arp->ar_hln != dev->addr_len || - ((ntohs(arp->ar_hrd) != ARPHRD_ETHER) && (ntohs(arp->ar_hrd) != ARPHRD_IEEE802)) || - dev->flags & IFF_NOARP || - skb->pkt_type == PACKET_OTHERHOST || - arp->ar_pln != 4) + if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) && + arp->ar_hrd != __constant_htons(ARPHRD_IEEE802)) goto out; - } - else - { - if (arp->ar_hln != dev->addr_len || - dev_type != ntohs(arp->ar_hrd) || - dev->flags & IFF_NOARP || - skb->pkt_type == PACKET_OTHERHOST || - arp->ar_pln != 4) + if (arp->ar_pro != __constant_htons(ETH_P_IP)) goto out; - } -#else - if (arp->ar_hln != dev->addr_len || - dev_type != ntohs(arp->ar_hrd) || - dev->flags & IFF_NOARP || - skb->pkt_type == PACKET_OTHERHOST || - arp->ar_pln != 4) - goto out; + break; #endif - -/* - * Another test. - * The logic here is that the protocol being looked up by arp should - * match the protocol the device speaks. If it doesn't, there is a - * problem, so toss the packet. 
- */ - - switch (dev_type) - { #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) - case ARPHRD_AX25: - if(arp->ar_pro != htons(AX25_P_IP)) - goto out; - break; -#endif + case ARPHRD_AX25: + if (arp->ar_pro != __constant_htons(AX25_P_IP)) + goto out; + if (arp->ar_hrd != __constant_htons(ARPHRD_AX25)) + goto out; + break; #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) - case ARPHRD_NETROM: - if(arp->ar_pro != htons(AX25_P_IP)) - goto out; - break; -#endif - case ARPHRD_ETHER: - case ARPHRD_ARCNET: - case ARPHRD_METRICOM: - case ARPHRD_IEEE802: - case ARPHRD_FDDI: - case ARPHRD_IPGRE: - if(arp->ar_pro != htons(ETH_P_IP)) - goto out; - break; - - default: - printk(KERN_ERR "ARP: dev->type mangled!\n"); + case ARPHRD_NETROM: + if (arp->ar_pro != __constant_htons(AX25_P_IP)) + goto out; + if (arp->ar_hrd != __constant_htons(ARPHRD_NETROM)) goto out; + break; +#endif +#endif } + /* Undertsand only these message types */ + + if (arp->ar_op != __constant_htons(ARPOP_REPLY) && + arp->ar_op != __constant_htons(ARPOP_REQUEST)) + goto out; + /* * Extract fields */ @@ -1451,32 +631,87 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * and in the case of requests for us we add the requester to the arp * cache. 
*/ - if (arp->ar_op == htons(ARPOP_REQUEST)) { - int addr_type; - struct in_device *in_dev = dev->ip_ptr; - if (ip_route_input(skb, tip, sip, 0, dev)) - goto out; + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ + if (sip == 0) { + if (arp->ar_op == __constant_htons(ARPOP_REQUEST) && + inet_addr_type(tip) == RTN_LOCAL) + arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr); + goto out; + } + + if (arp->ar_op == __constant_htons(ARPOP_REQUEST) && + ip_route_input(skb, tip, sip, 0, dev) == 0) { + rt = (struct rtable*)skb->dst; addr_type = rt->rt_type; - if (addr_type == RTN_LOCAL || (rt->rt_flags&RTCF_DNAT) || - (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - ((in_dev && IN_DEV_PROXY_ARP(in_dev) && IN_DEV_FORWARD(in_dev)) || - arp_check_published(tip, dev)))) - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); - } else { - if (arp->ar_op != htons(ARPOP_REPLY) || - inet_addr_type(sip) != RTN_UNICAST) + if (addr_type == RTN_LOCAL) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) { + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + neigh_release(n); + } goto out; + } else if (IN_DEV_FORWARD(in_dev)) { + if ((rt->rt_flags&RTCF_DNAT) || + (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && + (IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + neigh_release(n); + + if (skb->stamp.tv_sec == 0 || + skb->pkt_type == PACKET_HOST || + in_dev->arp_parms->proxy_delay == 0) { + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + } else { + pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); + return 0; + } + goto out; + } + } } - start_bh_atomic(); - arp_update(sip, sha, dev, 0, arp->ar_op == htons(ARPOP_REPLY)); - end_bh_atomic(); + /* Update our ARP tables */ + + n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + +#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP + /* Unsolicited ARP is not accepted by default. 
+ It is possible, that this option should be enabled for some + devices (strip is candidate) + */ + if (n == NULL && + arp->ar_op == __constant_htons(ARPOP_REPLY) && + inet_addr_type(sip) == RTN_UNICAST) + n = __neigh_lookup(&arp_tbl, &sip, dev, -1); +#endif + + if (n) { + int state = NUD_REACHABLE; + int override = 0; + + /* If several different ARP replies follows back-to-back, + use the FIRST one. It is possible, if several proxy + agents are active. Taking the first reply prevents + arp trashing and chooses the fastest router. + */ + if (jiffies - n->updated >= n->parms->locktime) + override = 1; + + /* Broadcast replies and request packets + do not assert neighbour reachability. + */ + if (arp->ar_op != __constant_htons(ARPOP_REPLY) || + skb->pkt_type != PACKET_HOST) + state = NUD_STALE; + neigh_update(n, sha, state, override, 1); + neigh_release(n); + } out: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -1492,175 +727,72 @@ out: int arp_req_set(struct arpreq *r, struct device * dev) { - struct arp_table *entry, **entryp; - struct sockaddr_in *si; - unsigned char *ha = NULL; - u32 ip; - u32 mask = DEF_ARP_NETMASK; - - /* - * Extract netmask (if supplied). - */ - - if (r->arp_flags&ATF_NETMASK) - { - si = (struct sockaddr_in *) &r->arp_netmask; - mask = si->sin_addr.s_addr; - } - - /* - * Extract destination. 
- */ - - si = (struct sockaddr_in *) &r->arp_pa; - ip = si->sin_addr.s_addr; + u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err; - if (r->arp_flags&ATF_PUBL) - { - if (ip & ~mask) + if (r->arp_flags&ATF_PUBL) { + u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; + if (mask && mask != 0xFFFFFFFF) return -EINVAL; - if (!dev && (r->arp_flags & ATF_COM)) - { + if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } + if (mask) { + if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL) + return -ENOBUFS; + return 0; + } + if (dev == NULL) { + ipv4_devconf.proxy_arp = 1; + return 0; + } + if (dev->ip_ptr) { + ((struct in_device*)dev->ip_ptr)->cnf.proxy_arp = 1; + return 0; + } + return -ENXIO; } - else - { - struct rtable * rt; - int err; - if ((r->arp_flags & ATF_PERM) && !(r->arp_flags & ATF_COM)) - r->arp_flags |= ATF_COM; - - err = ip_route_output(&rt, ip, 0, 1, dev ? 
dev->ifindex : 0); - if (err) + if (r->arp_flags & ATF_PERM) + r->arp_flags |= ATF_COM; + if (dev == NULL) { + struct rtable * rt; + if ((err = ip_route_output(&rt, ip, 0, 1, 0)) != 0) return err; + dev = rt->u.dst.dev; + ip_rt_put(rt); if (!dev) - dev = rt->u.dst.dev; - if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) { - if (rt->rt_flags&RTCF_BROADCAST && - dev->type == ARPHRD_METRICOM && - r->arp_ha.sa_family == ARPHRD_METRICOM) { - memcpy(dev->broadcast, r->arp_ha.sa_data, dev->addr_len); - ip_rt_put(rt); - return 0; - } - ip_rt_put(rt); return -EINVAL; - } - ip_rt_put(rt); } - - if (dev && (dev->flags&(IFF_LOOPBACK|IFF_NOARP))) - return -ENODEV; - - if (dev && r->arp_ha.sa_family != dev->type) + if (r->arp_ha.sa_family != dev->type) return -EINVAL; + err = -ENOBUFS; start_bh_atomic(); - - if (!(r->arp_flags & ATF_PUBL)) - entryp = &arp_tables[HASH(ip)]; - else - entryp = &arp_proxy_list; - - while ((entry = *entryp) != NULL) - { - if (entry->mask == mask) - break; - if ((entry->mask & mask) != mask) - break; - entryp = &entry->u.next; - } - while ((entry = *entryp) != NULL && entry->mask == mask) - { - if (entry->ip == ip) - break; - entryp = &entry->u.next; - } - while ((entry = *entryp) != NULL && entry->mask == mask && - entry->ip == ip) - { - if (!entry->u.neigh.dev || entry->u.neigh.dev == dev) - break; - entryp = &entry->u.next; - } - - while ((entry = *entryp) != NULL) - { - if (entry->ip != ip || entry->mask != mask || - entry->u.neigh.dev != dev) - { - entry = NULL; - break; - } - if (entry->hatype == r->arp_ha.sa_family && - (!(r->arp_flags & ATF_MAGIC) || - entry->flags == r->arp_flags)) - break; - entryp = &entry->u.next; - } - - if (entry) - atomic_inc(&entry->u.neigh.refcnt); - else - { - entry = arp_alloc(r->arp_flags&ATF_PUBL ? 
0 : 1); - if (entry == NULL) - { - end_bh_atomic(); - return -ENOMEM; - } - entry->ip = ip; - entry->u.neigh.dev = dev; - entry->mask = mask; - - if (dev) - entry->hatype = dev->type; - - entry->u.next = *entryp; - *entryp = entry; - } - entry->flags = r->arp_flags; - if (!(entry->flags&(ATF_PUBL|ATF_COM))) - atomic_inc(&arp_unres_size); - - if (entry->flags & ATF_PUBL) - { - if (entry->flags & ATF_COM) - { - entry->hatype = r->arp_ha.sa_family; - ha = r->arp_ha.sa_data; - } - else if (dev) - ha = dev->dev_addr; - } - else - ha = r->arp_ha.sa_data; - - if (ha) - memcpy(entry->u.neigh.ha, ha, dev ? dev->addr_len : MAX_ADDR_LEN); - else - memset(entry->u.neigh.ha, 0, MAX_ADDR_LEN); - - entry->last_updated = entry->u.neigh.lastused = jiffies; - - if (!(entry->flags & ATF_PUBL)) - { - if (entry->flags & ATF_COM) - { - arpd_update(entry->ip, entry->u.neigh.dev, ha); - arp_update_hhs(entry); - } - else - arp_start_resolution(entry); + neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1); + if (neigh) { + unsigned state = NUD_STALE; + if (r->arp_flags & ATF_PERM) + state = NUD_PERMANENT; + err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? 
+ r->arp_ha.sa_data : NULL, state, 1, 0); + neigh_release(neigh); } - - neigh_release(&entry->u.neigh); end_bh_atomic(); - return 0; + return err; +} + +static unsigned arp_state_to_flags(struct neighbour *neigh) +{ + unsigned flags = 0; + if (neigh->nud_state&NUD_PERMANENT) + flags = ATF_PERM|ATF_COM; + else if (neigh->nud_state&NUD_VALID) + flags = ATF_COM; + return flags; } /* @@ -1669,97 +801,57 @@ int arp_req_set(struct arpreq *r, struct device * dev) static int arp_req_get(struct arpreq *r, struct device *dev) { - struct arp_table *entry; - struct sockaddr_in *si; - u32 mask = DEF_ARP_NETMASK; - - if (r->arp_flags&ATF_NETMASK) - { - si = (struct sockaddr_in *) &r->arp_netmask; - mask = si->sin_addr.s_addr; - } - - si = (struct sockaddr_in *) &r->arp_pa; + u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err = -ENXIO; start_bh_atomic(); - - if (!(r->arp_flags & ATF_PUBL)) - entry = arp_tables[HASH(si->sin_addr.s_addr)]; - else - entry = arp_proxy_list; - - for ( ; entry ;entry = entry->u.next) - { - if (entry->ip == si->sin_addr.s_addr && - (!(r->arp_flags&ATF_NETMASK) || entry->mask == mask) && - ( (r->arp_flags&ATF_PUBL) ? 
- (entry->u.neigh.dev == dev && entry->hatype == r->arp_ha.sa_family) - : (entry->u.neigh.dev == dev || !dev))) - { - if (entry->u.neigh.dev) - { - memcpy(r->arp_ha.sa_data, entry->u.neigh.ha, entry->u.neigh.dev->addr_len); - r->arp_ha.sa_family = entry->u.neigh.dev->type; - strncpy(r->arp_dev, entry->u.neigh.dev->name, sizeof(r->arp_dev)); - } - else - { - r->arp_ha.sa_family = entry->hatype; - memset(r->arp_ha.sa_data, 0, sizeof(r->arp_ha.sa_data)); - } - r->arp_flags = entry->flags; - end_bh_atomic(); - return 0; - } + neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + if (neigh) { + memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_ha.sa_family = dev->type; + strncpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + r->arp_flags = arp_state_to_flags(neigh); + neigh_release(neigh); + err = 0; } - end_bh_atomic(); - return -ENXIO; + return err; } int arp_req_delete(struct arpreq *r, struct device * dev) { - struct sockaddr_in *si; - struct arp_table *entry, **entryp; - int retval = -ENXIO; - u32 mask = DEF_ARP_NETMASK; - - if (r->arp_flags&ATF_NETMASK) - { - si = (struct sockaddr_in *) &r->arp_netmask; - mask = si->sin_addr.s_addr; - } - - si = (struct sockaddr_in *) &r->arp_pa; - - start_bh_atomic(); - - if (!(r->arp_flags & ATF_PUBL)) - entryp = &arp_tables[HASH(si->sin_addr.s_addr)]; - else - entryp = &arp_proxy_list; - - while ((entry = *entryp) != NULL) - { - if (entry->ip == si->sin_addr.s_addr - && (!(r->arp_flags&ATF_NETMASK) || entry->mask == mask) - && (entry->u.neigh.dev == dev || (!(r->arp_flags&ATF_PUBL) && !dev)) - && (!(r->arp_flags&ATF_MAGIC) || r->arp_flags == entry->flags)) - { - if (!atomic_read(&entry->u.neigh.refcnt)) - { - arp_free(entryp); - retval = 0; - continue; + int err; + u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + + if (r->arp_flags & ATF_PUBL) { + u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; + if (mask == 0xFFFFFFFF) + return pneigh_delete(&arp_tbl, &ip, 
dev); + if (mask == 0) { + if (dev == NULL) { + ipv4_devconf.proxy_arp = 0; + return 0; + } + if (dev->ip_ptr) { + ((struct in_device*)dev->ip_ptr)->cnf.proxy_arp = 0; + return 0; } - if (retval) - retval = -EBUSY; + return -ENXIO; } - entryp = &entry->u.next; + return -EINVAL; } + err = -ENXIO; + start_bh_atomic(); + neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + if (neigh) { + err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0); + neigh_release(neigh); + } end_bh_atomic(); - return retval; + return err; } /* @@ -1772,8 +864,7 @@ int arp_ioctl(unsigned int cmd, void *arg) struct arpreq r; struct device * dev = NULL; - switch(cmd) - { + switch(cmd) { case SIOCDARP: case SIOCSARP: if (!suser()) @@ -1791,41 +882,53 @@ int arp_ioctl(unsigned int cmd, void *arg) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && - (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB|ATF_MAGIC))) + (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) return -EINVAL; if (!(r.arp_flags & ATF_NETMASK)) - ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr=DEF_ARP_NETMASK; + ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr=__constant_htonl(0xFFFFFFFFUL); - if (r.arp_dev[0]) - { + rtnl_lock(); + if (r.arp_dev[0]) { + err = -ENODEV; if ((dev = dev_get(r.arp_dev)) == NULL) - return -ENODEV; + goto out; + /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ if (!r.arp_ha.sa_family) r.arp_ha.sa_family = dev->type; + err = -EINVAL; if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) - return -EINVAL; + goto out; + } else if (cmd != SIOCSARP) { + /* dev has not been set ... 
*/ + printk(KERN_ERR "arp_ioctl: invalid, null device\n"); + err = -EINVAL; + goto out; } - switch(cmd) - { - case SIOCDARP: - return arp_req_delete(&r, dev); - case SIOCSARP: - return arp_req_set(&r, dev); - case SIOCGARP: - err = arp_req_get(&r, dev); - if (!err) - err = copy_to_user(arg, &r, sizeof(r)); - return err; + switch(cmd) { + case SIOCDARP: + err = arp_req_delete(&r, dev); + break; + case SIOCSARP: + /* This checks for dev == NULL */ + err = arp_req_set(&r, dev); + break; + case SIOCGARP: + err = arp_req_get(&r, dev); + if (!err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; + break; } - /*NOTREACHED*/ - return 0; +out: + rtnl_unlock(); + return err; } /* * Write the contents of the ARP cache to a PROCfs file. */ +#ifdef CONFIG_PROC_FS #define HBUFFERLEN 30 @@ -1834,7 +937,6 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy int len=0; off_t pos=0; int size; - struct arp_table *entry; char hbuffer[HBUFFERLEN]; int i,j,k; const char hexbuf[] = "0123456789ABCDEF"; @@ -1844,90 +946,113 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy pos+=size; len+=size; + neigh_table_lock(&arp_tbl); - for(i=0; i<FULL_ARP_TABLE_SIZE; i++) - { - start_bh_atomic(); + for(i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n; + for (n=arp_tbl.hash_buckets[i]; n; n=n->next) { + struct device *dev = n->dev; + int hatype = dev->type; - for(entry=arp_tables[i]; entry!=NULL; entry=entry->u.next) - { + /* I'd get great pleasure deleting + this ugly code. Let's output it in hexadecimal format. + "arp" utility will eventually repaired --ANK + */ +#if 1 /* UGLY CODE */ /* * Convert hardware address to XX:XX:XX:XX ... form. 
*/ #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) -#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) - if (entry->hatype == ARPHRD_AX25 || entry->hatype == ARPHRD_NETROM) - strcpy(hbuffer,ax2asc((ax25_address *)entry->u.neigh.ha)); - else { -#else - if(entry->hatype==ARPHRD_AX25) - strcpy(hbuffer,ax2asc((ax25_address *)entry->u.neigh.ha)); + if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) + strcpy(hbuffer,ax2asc((ax25_address *)n->ha)); else { #endif -#endif - - if (entry->u.neigh.dev) - { - for(k=0,j=0;k<HBUFFERLEN-3 && j<entry->u.neigh.dev->addr_len;j++) - { - hbuffer[k++]=hexbuf[ (entry->u.neigh.ha[j]>>4)&15 ]; - hbuffer[k++]=hexbuf[ entry->u.neigh.ha[j]&15 ]; - hbuffer[k++]=':'; - } - hbuffer[--k]=0; + for (k=0,j=0;k<HBUFFERLEN-3 && j<dev->addr_len;j++) { + hbuffer[k++]=hexbuf[(n->ha[j]>>4)&15 ]; + hbuffer[k++]=hexbuf[n->ha[j]&15 ]; + hbuffer[k++]=':'; } - else - strcpy(hbuffer, "00:00:00:00:00:00"); + hbuffer[--k]=0; #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) - } + } +#endif +#else + if ((neigh->nud_state&NUD_VALID) && dev->addr_len) { + int j; + for (j=0; j < dev->addr_len; j++) + sprintf(hbuffer+2*j, "%02x", neigh->ha[j]); + } else + sprintf(hbuffer, "0"); #endif size = sprintf(buffer+len, "%-17s0x%-10x0x%-10x%s", - in_ntoa(entry->ip), - entry->hatype, - entry->flags, + in_ntoa(*(u32*)n->primary_key), + hatype, + arp_state_to_flags(n), hbuffer); -#if RT_CACHE_DEBUG < 2 size += sprintf(buffer+len+size, " %-17s %s\n", - entry->mask==DEF_ARP_NETMASK ? - "*" : in_ntoa(entry->mask), - entry->u.neigh.dev ? entry->u.neigh.dev->name : "*"); -#else + "*", dev->name); + + len += size; + pos += size; + + if (pos <= offset) + len=0; + if (pos >= offset+length) + goto done; + } + } + + for (i=0; i<=PNEIGH_HASHMASK; i++) { + struct pneigh_entry *n; + for (n=arp_tbl.phash_buckets[i]; n; n=n->next) { + struct device *dev = n->dev; + int hatype = dev ? 
dev->type : 0; + + size = sprintf(buffer+len, + "%-17s0x%-10x0x%-10x%s", + in_ntoa(*(u32*)n->key), + hatype, + ATF_PUBL|ATF_PERM, + "00:00:00:00:00:00"); size += sprintf(buffer+len+size, - " %-17s %s\t%d\t%d\t%1d\n", - entry->mask==DEF_ARP_NETMASK ? - "*" : in_ntoa(entry->mask), - entry->u.neigh.dev ? entry->u.neigh.dev->name : "*", - atomic_read(&entry->u.neigh.refcnt), - entry->u.neigh.hh ? atomic_read(&entry->u.neigh.hh->hh_refcnt) : -1, - entry->u.neigh.hh ? entry->u.neigh.hh->hh_uptodate : 0); -#endif - + " %-17s %s\n", + "*", dev ? dev->name : "*"); + len += size; pos += size; if (pos <= offset) len=0; if (pos >= offset+length) - { - end_bh_atomic(); goto done; - } } - end_bh_atomic(); } + done: + neigh_table_unlock(&arp_tbl); *start = buffer+len-(pos-offset); /* Start of wanted data */ len = pos-offset; /* Start slop */ if (len>length) len = length; /* Ending slop */ + if (len<0) + len = 0; return len; } +#endif +/* Note, that it is not on notifier chain. + It is necessary, that this routine was called after route cache will be + flushed. + */ +void arp_ifdown(struct device *dev) +{ + neigh_ifdown(&arp_tbl, dev); +} /* @@ -1943,12 +1068,6 @@ static struct packet_type arp_packet_type = NULL }; -static struct notifier_block arp_dev_notifier={ - arp_device_event, - NULL, - 0 -}; - #ifdef CONFIG_PROC_FS static struct proc_dir_entry proc_net_arp = { PROC_NET_ARP, 3, "arp", @@ -1960,18 +1079,15 @@ static struct proc_dir_entry proc_net_arp = { __initfunc(void arp_init (void)) { + neigh_table_init(&arp_tbl); + dev_add_pack(&arp_packet_type); - /* Start with the regular checks for expired arp entries. 
*/ - add_timer(&arp_timer); - /* Register for device down reports */ - register_netdevice_notifier(&arp_dev_notifier); #ifdef CONFIG_PROC_FS proc_net_register(&proc_net_arp); #endif - -#ifdef CONFIG_ARPD - arpd_sk = netlink_kernel_create(NETLINK_ARPD, arpd_callback); +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4"); #endif } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 269361e35..7d5f0021f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,7 +1,7 @@ /* * NET3 IP device support routines. * - * Version: $Id: devinet.c,v 1.14 1997/10/10 22:40:44 davem Exp $ + * Version: $Id: devinet.c,v 1.3 1997/12/16 05:37:35 ralf Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -46,6 +46,9 @@ #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif #ifdef CONFIG_KERNELD #include <linux/kerneld.h> #endif @@ -54,6 +57,9 @@ #include <net/route.h> #include <net/ip_fib.h> +struct ipv4_devconf ipv4_devconf = { 1, 1, 1, 1, 0, }; +static struct ipv4_devconf ipv4_devconf_dflt = { 1, 1, 1, 1, 1, }; + #ifdef CONFIG_RTNETLINK static void rtmsg_ifa(int event, struct in_ifaddr *); #else @@ -62,7 +68,10 @@ static void rtmsg_ifa(int event, struct in_ifaddr *); static struct notifier_block *inetaddr_chain; static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); - +#ifdef CONFIG_SYSCTL +static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devconf *p); +static void devinet_sysctl_unregister(struct ipv4_devconf *p); +#endif int inet_ifa_count; int inet_dev_count; @@ -95,9 +104,22 @@ struct in_device *inetdev_init(struct device *dev) return NULL; inet_dev_count++; memset(in_dev, 0, sizeof(*in_dev)); + memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf)); + in_dev->cnf.sysctl = NULL; in_dev->dev = 
dev; + if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) { + kfree(in_dev); + return NULL; + } +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4"); +#endif dev->ip_ptr = in_dev; - ip_mc_init_dev(in_dev); +#ifdef CONFIG_SYSCTL + devinet_sysctl_register(in_dev, &in_dev->cnf); +#endif + if (dev->flags&IFF_UP) + ip_mc_up(in_dev); return in_dev; } @@ -112,7 +134,11 @@ static void inetdev_destroy(struct in_device *in_dev) inet_free_ifa(ifa); } +#ifdef CONFIG_SYSCTL + devinet_sysctl_unregister(&in_dev->cnf); +#endif in_dev->dev->ip_ptr = NULL; + neigh_parms_release(&arp_tbl, in_dev->arp_parms); kfree(in_dev); } @@ -201,8 +227,10 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa) } } - if (!(ifa->ifa_flags&IFA_F_SECONDARY)) + if (!(ifa->ifa_flags&IFA_F_SECONDARY)) { + net_srandom(ifa->ifa_local); ifap = last_primary; + } cli(); ifa->ifa_next = *ifap; @@ -263,7 +291,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 ma int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { - struct kern_ifa *k_ifa = arg; + struct rtattr **rta = arg; struct in_device *in_dev; struct ifaddrmsg *ifm = NLMSG_DATA(nlh); struct in_ifaddr *ifa, **ifap; @@ -272,11 +300,11 @@ inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -EADDRNOTAVAIL; for (ifap=&in_dev->ifa_list; (ifa=*ifap)!=NULL; ifap=&ifa->ifa_next) { - if ((k_ifa->ifa_local && memcmp(k_ifa->ifa_local, &ifa->ifa_local, 4)) || - (k_ifa->ifa_label && strcmp(k_ifa->ifa_label, ifa->ifa_label)) || - (k_ifa->ifa_address && + if ((rta[IFA_LOCAL-1] && memcmp(RTA_DATA(rta[IFA_LOCAL-1]), &ifa->ifa_local, 4)) || + (rta[IFA_LABEL-1] && strcmp(RTA_DATA(rta[IFA_LABEL-1]), ifa->ifa_label)) || + (rta[IFA_ADDRESS-1] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || - !inet_ifa_match(*(u32*)k_ifa->ifa_address, ifa)))) + !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS-1]), ifa)))) continue; 
inet_del_ifa(in_dev, ifap, 1); return 0; @@ -288,13 +316,13 @@ inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { - struct kern_ifa *k_ifa = arg; + struct rtattr **rta = arg; struct device *dev; struct in_device *in_dev; struct ifaddrmsg *ifm = NLMSG_DATA(nlh); struct in_ifaddr *ifa; - if (ifm->ifa_prefixlen > 32 || k_ifa->ifa_local == NULL) + if (ifm->ifa_prefixlen > 32 || rta[IFA_LOCAL-1] == NULL) return -EINVAL; if ((dev = dev_get_by_index(ifm->ifa_index)) == NULL) @@ -309,21 +337,21 @@ inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if ((ifa = inet_alloc_ifa()) == NULL) return -ENOBUFS; - if (k_ifa->ifa_address == NULL) - k_ifa->ifa_address = k_ifa->ifa_local; - memcpy(&ifa->ifa_local, k_ifa->ifa_local, 4); - memcpy(&ifa->ifa_address, k_ifa->ifa_address, 4); + if (rta[IFA_ADDRESS-1] == NULL) + rta[IFA_ADDRESS-1] = rta[IFA_LOCAL-1]; + memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL-1]), 4); + memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS-1]), 4); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); - if (k_ifa->ifa_broadcast) - memcpy(&ifa->ifa_broadcast, k_ifa->ifa_broadcast, 4); - if (k_ifa->ifa_anycast) - memcpy(&ifa->ifa_anycast, k_ifa->ifa_anycast, 4); + if (rta[IFA_BROADCAST-1]) + memcpy(&ifa->ifa_broadcast, RTA_DATA(rta[IFA_BROADCAST-1]), 4); + if (rta[IFA_ANYCAST-1]) + memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST-1]), 4); ifa->ifa_flags = ifm->ifa_flags; ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_dev = in_dev; - if (k_ifa->ifa_label) - memcpy(ifa->ifa_label, k_ifa->ifa_label, IFNAMSIZ); + if (rta[IFA_LABEL-1]) + memcpy(ifa->ifa_label, RTA_DATA(rta[IFA_LABEL-1]), IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); @@ -394,7 +422,6 @@ int devinet_ioctl(unsigned int cmd, void *arg) case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination 
address */ case SIOCGIFNETMASK: /* Get the netmask for the interface */ - case SIOCGIFPFLAGS: /* Get per device sysctl controls */ /* Note that this ioctls will not sleep, so that we do not impose a lock. One day we will be forced to put shlock here (I mean SMP) @@ -413,7 +440,6 @@ int devinet_ioctl(unsigned int cmd, void *arg) case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ - case SIOCSIFPFLAGS: /* Set per device sysctl controls */ if (!suser()) return -EACCES; if (sin->sin_family != AF_INET) @@ -464,10 +490,6 @@ int devinet_ioctl(unsigned int cmd, void *arg) sin->sin_addr.s_addr = ifa->ifa_mask; goto rarok; - case SIOCGIFPFLAGS: - ifr.ifr_flags = in_dev->flags; - goto rarok; - case SIOCSIFFLAGS: #ifdef CONFIG_IP_ALIAS if (colon) { @@ -483,10 +505,6 @@ int devinet_ioctl(unsigned int cmd, void *arg) ret = dev_change_flags(dev, ifr.ifr_flags); break; - case SIOCSIFPFLAGS: - in_dev->flags = ifr.ifr_flags; - break; - case SIOCSIFADDR: /* Set interface address (and family) */ if (inet_abc_len(sin->sin_addr.s_addr) < 0) { ret = -EINVAL; @@ -592,7 +610,7 @@ inet_gifconf(struct device *dev, char *buf, int len) done += sizeof(ifr); continue; } - if (len < sizeof(ifr)) + if (len < (int) sizeof(ifr)) return done; memset(&ifr, 0, sizeof(struct ifreq)); if (ifa->ifa_label) @@ -704,7 +722,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, ifm = NLMSG_DATA(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; - ifm->ifa_flags = ifa->ifa_flags; + ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; if (ifa->ifa_prefixlen) @@ -722,7 +740,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, nlmsg_failure: rtattr_failure: - skb_put(skb, b - skb->tail); + skb_trim(skb, b - skb->data); return -1; } @@ -770,7 +788,7 @@ static 
void rtmsg_ifa(int event, struct in_ifaddr * ifa) return; } if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { - kfree_skb(skb, 0); + kfree_skb(skb); netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); return; } @@ -783,7 +801,7 @@ static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX-RTM_BASE+1] = { { NULL, NULL, }, { NULL, NULL, }, - { NULL, rtnetlink_dump_ifinfo, }, + { NULL, NULL, }, { NULL, NULL, }, { inet_rtm_newaddr, NULL, }, @@ -816,6 +834,145 @@ static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX-RTM_BASE+1] = #endif /* CONFIG_RTNETLINK */ + +#ifdef CONFIG_SYSCTL + +void inet_forward_change() +{ + struct device *dev; + int on = ipv4_devconf.forwarding; + + ipv4_devconf.accept_redirects = !on; + ipv4_devconf_dflt.forwarding = on; + + for (dev = dev_base; dev; dev = dev->next) { + struct in_device *in_dev = dev->ip_ptr; + if (in_dev) + in_dev->cnf.forwarding = on; + } + + rt_cache_flush(0); + + ip_statistics.IpForwarding = on ? 1 : 2; +} + +static +int devinet_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + int *valp = ctl->data; + int val = *valp; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp); + + if (write && *valp != val) { + if (valp == &ipv4_devconf.forwarding) + inet_forward_change(); + else if (valp != &ipv4_devconf_dflt.forwarding) + rt_cache_flush(0); + } + + return ret; +} + +static struct devinet_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table devinet_vars[12]; + ctl_table devinet_dev[2]; + ctl_table devinet_conf_dir[2]; + ctl_table devinet_proto_dir[2]; + ctl_table devinet_root_dir[2]; +} devinet_sysctl = { + NULL, + {{NET_IPV4_CONF_FORWARDING, "forwarding", + &ipv4_devconf.forwarding, sizeof(int), 0644, NULL, + &devinet_sysctl_forward}, + {NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding", + &ipv4_devconf.mc_forwarding, sizeof(int), 0444, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects", + 
&ipv4_devconf.accept_redirects, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects", + &ipv4_devconf.secure_redirects, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_SHARED_MEDIA, "shared_media", + &ipv4_devconf.shared_media, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_RP_FILTER, "rp_filter", + &ipv4_devconf.rp_filter, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects", + &ipv4_devconf.send_redirects, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route", + &ipv4_devconf.accept_source_route, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_PROXY_ARP, "proxy_arp", + &ipv4_devconf.proxy_arp, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay", + &ipv4_devconf.bootp_relay, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_LOG_MARTIANS, "log_martians", + &ipv4_devconf.log_martians, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0}}, + + {{NET_PROTO_CONF_ALL, "all", NULL, 0, 0555, devinet_sysctl.devinet_vars},{0}}, + {{NET_IPV4_CONF, "conf", NULL, 0, 0555, devinet_sysctl.devinet_dev},{0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, devinet_sysctl.devinet_conf_dir},{0}}, + {{CTL_NET, "net", NULL, 0, 0555, devinet_sysctl.devinet_proto_dir},{0}} +}; + +static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devconf *p) +{ + int i; + struct device *dev = in_dev ? 
in_dev->dev : NULL; + struct devinet_sysctl_table *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return; + memcpy(t, &devinet_sysctl, sizeof(*t)); + for (i=0; i<sizeof(t->devinet_vars)/sizeof(t->devinet_vars[0])-1; i++) { + t->devinet_vars[i].data += (char*)p - (char*)&ipv4_devconf; + t->devinet_vars[i].de = NULL; + } + if (dev) { + t->devinet_dev[0].procname = dev->name; + t->devinet_dev[0].ctl_name = dev->ifindex; + } else { + t->devinet_dev[0].procname = "default"; + t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + } + t->devinet_dev[0].child = t->devinet_vars; + t->devinet_dev[0].de = NULL; + t->devinet_conf_dir[0].child = t->devinet_dev; + t->devinet_conf_dir[0].de = NULL; + t->devinet_proto_dir[0].child = t->devinet_conf_dir; + t->devinet_proto_dir[0].de = NULL; + t->devinet_root_dir[0].child = t->devinet_proto_dir; + t->devinet_root_dir[0].de = NULL; + + t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); + if (t->sysctl_header == NULL) + kfree(t); +} + +static void devinet_sysctl_unregister(struct ipv4_devconf *p) +{ + if (p->sysctl) { + struct devinet_sysctl_table *t = p->sysctl; + p->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} +#endif + #ifdef CONFIG_IP_PNP_BOOTP /* @@ -856,4 +1013,9 @@ __initfunc(void devinet_init(void)) #ifdef CONFIG_RTNETLINK rtnetlink_links[AF_INET] = inet_rtnetlink_table; #endif +#ifdef CONFIG_SYSCTL + devinet_sysctl.sysctl_header = + register_sysctl_table(devinet_sysctl.devinet_root_dir, 0); + devinet_sysctl_register(NULL, &ipv4_devconf_dflt); +#endif } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8775c43bf..409db8209 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: FIB frontend. 
* - * Version: $Id: fib_frontend.c,v 1.4 1997/11/09 20:05:23 kuznet Exp $ + * Version: $Id: fib_frontend.c,v 1.6 1997/12/13 21:52:48 kuznet Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -93,7 +93,7 @@ void fib_flush(void) #endif /* CONFIG_IP_MULTIPLE_TABLES */ if (flushed) - rt_cache_flush(RT_FLUSH_DELAY); + rt_cache_flush(-1); } @@ -290,27 +290,51 @@ int ip_rt_ioctl(unsigned int cmd, void *arg) #ifdef CONFIG_RTNETLINK +static int inet_check_attr(struct rtmsg *r, struct rtattr **rta) +{ + int i; + + for (i=1; i<=RTA_MAX; i++) { + struct rtattr *attr = rta[i-1]; + if (attr) { + if (RTA_PAYLOAD(attr) < 4) + return -EINVAL; +#ifndef CONFIG_RTNL_OLD_IFINFO + if (i != RTA_MULTIPATH && i != RTA_METRICS) +#endif + rta[i-1] = (struct rtattr*)RTA_DATA(attr); + } + } + return 0; +} + int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib_table * tb; - struct kern_rta *rta = arg; + struct rtattr **rta = arg; struct rtmsg *r = NLMSG_DATA(nlh); + if (inet_check_attr(r, rta)) + return -EINVAL; + tb = fib_get_table(r->rtm_table); if (tb) - return tb->tb_delete(tb, r, rta, nlh, &NETLINK_CB(skb)); + return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); return -ESRCH; } int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib_table * tb; - struct kern_rta *rta = arg; + struct rtattr **rta = arg; struct rtmsg *r = NLMSG_DATA(nlh); + if (inet_check_attr(r, rta)) + return -EINVAL; + tb = fib_new_table(r->rtm_table); if (tb) - return tb->tb_insert(tb, r, rta, nlh, &NETLINK_CB(skb)); + return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); return -ENOBUFS; } @@ -370,7 +394,7 @@ static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr req.nlh.nlmsg_len = sizeof(req); req.nlh.nlmsg_type = cmd; - req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE; + req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND; req.nlh.nlmsg_pid = 0; 
req.nlh.nlmsg_seq = 0; @@ -477,7 +501,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) First of all, we scan fib_info list searching for stray nexthop entries, then ignite fib_flush. */ - if (fib_sync_down(ifa->ifa_local, NULL)) + if (fib_sync_down(ifa->ifa_local, NULL, 0)) fib_flush(); } } @@ -494,11 +518,11 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); - rt_cache_flush(2*HZ); + rt_cache_flush(-1); break; case NETDEV_DOWN: fib_del_ifaddr(ifa); - rt_cache_flush(1*HZ); + rt_cache_flush(-1); break; } return NOTIFY_DONE; @@ -520,16 +544,24 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif - rt_cache_flush(2*HZ); + rt_cache_flush(-1); break; case NETDEV_DOWN: - if (fib_sync_down(0, dev)) + if (fib_sync_down(0, dev, 0)) fib_flush(); rt_cache_flush(0); + arp_ifdown(dev); break; case NETDEV_UNREGISTER: if (in_dev->ifa_list) printk("About to crash!\n"); + if (fib_sync_down(0, dev, 1)) + fib_flush(); + rt_cache_flush(0); + arp_ifdown(dev); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: rt_cache_flush(0); break; } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index afa6f7fe0..33bcf0321 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -394,6 +394,8 @@ FTprint("fib_create_info err=%d\n", err); && f->fn_tos == tos #endif ) { + struct fib_node **ins_fp; + state = f->fn_state; if (n->nlmsg_flags&NLM_F_EXCL && !(state&FN_S_ZOMBIE)) return -EEXIST; @@ -412,9 +414,12 @@ FTprint("fib_create_info err=%d\n", err); f->fn_state = 0; fib_release_info(old_fi); if (state&FN_S_ACCESSED) - rt_cache_flush(RT_FLUSH_DELAY); + rt_cache_flush(-1); return 0; } + + ins_fp = fp; + for ( ; (f = *fp) != NULL && fn_key_eq(f->fn_key, key) #ifdef CONFIG_IP_ROUTE_TOS && f->fn_tos == tos @@ -428,12 +433,16 @@ FTprint("fib_create_info err=%d\n", err); f->fn_state = 0; rtmsg_fib(RTM_NEWROUTE, 
f, z, tb->tb_id, n, req); if (state&FN_S_ACCESSED) - rt_cache_flush(RT_FLUSH_DELAY); + rt_cache_flush(-1); return 0; } return -EEXIST; } } + if (!(n->nlmsg_flags&NLM_F_APPEND)) { + fp = ins_fp; + f = *fp; + } } else { if (!(n->nlmsg_flags&NLM_F_CREATE)) return -ENOENT; @@ -459,14 +468,13 @@ FTprint("fib_create_info err=%d\n", err); * Insert new entry to the list. */ - start_bh_atomic(); new_f->fn_next = f; + /* ATOMIC_SET */ *fp = new_f; - end_bh_atomic(); fz->fz_nent++; rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->tb_id, n, req); - rt_cache_flush(RT_FLUSH_DELAY); + rt_cache_flush(-1); return 0; } @@ -541,7 +549,7 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); if (f->fn_state&FN_S_ACCESSED) { f->fn_state &= ~FN_S_ACCESSED; - rt_cache_flush(RT_FLUSH_DELAY); + rt_cache_flush(-1); } if (++fib_hash_zombies > 128) fib_flush(); @@ -715,7 +723,7 @@ static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id, if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id, f->fn_type, f->fn_scope, &f->fn_key, z, f->fn_tos, FIB_INFO(f)) < 0) { - kfree_skb(skb, 0); + kfree_skb(skb); return; } NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index c593d758f..3ffb404b5 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -45,10 +45,14 @@ #define FRprintk(a...) 
+#ifndef CONFIG_RTNL_OLD_IFINFO +#define RTA_IFNAME RTA_IIF +#endif + struct fib_rule { struct fib_rule *r_next; - unsigned r_preference; + u32 r_preference; unsigned char r_table; unsigned char r_action; unsigned char r_dst_len; @@ -72,19 +76,19 @@ static struct fib_rule *fib_rules = &local_rule; int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { - struct kern_rta *rta = arg; + struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); struct fib_rule *r, **rp; for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) { - if ((!rta->rta_src || memcmp(rta->rta_src, &r->r_src, 4) == 0) && + if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) && rtm->rtm_src_len == r->r_src_len && rtm->rtm_dst_len == r->r_dst_len && - (!rta->rta_dst || memcmp(rta->rta_dst, &r->r_dst, 4) == 0) && + (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) && rtm->rtm_tos == r->r_tos && rtm->rtm_type == r->r_action && - (!rta->rta_priority || *rta->rta_priority == r->r_preference) && - (!rta->rta_ifname || strcmp(rta->rta_ifname, r->r_ifname) == 0) && + (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && + (!rta[RTA_IFNAME-1] || strcmp(RTA_DATA(rta[RTA_IFNAME-1]), r->r_ifname) == 0) && (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { *rp = r->r_next; if (r != &default_rule && r != &main_rule && r != &local_rule) @@ -110,7 +114,7 @@ static struct fib_table *fib_empty_table(void) int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { - struct kern_rta *rta = arg; + struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); struct fib_rule *r, *new_r, **rp; unsigned char table_id; @@ -119,6 +123,9 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) (rtm->rtm_tos & ~IPTOS_TOS_MASK)) return -EINVAL; + if (rta[RTA_IFNAME-1] && RTA_PAYLOAD(rta[RTA_IFNAME-1]) > IFNAMSIZ) + return -EINVAL; + table_id = rtm->rtm_table; if 
(table_id == RT_TABLE_UNSPEC) { struct fib_table *table; @@ -133,12 +140,12 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (!new_r) return -ENOMEM; memset(new_r, 0, sizeof(*new_r)); - if (rta->rta_src) - memcpy(&new_r->r_src, rta->rta_src, 4); - if (rta->rta_dst) - memcpy(&new_r->r_dst, rta->rta_dst, 4); - if (rta->rta_gw) - memcpy(&new_r->r_srcmap, rta->rta_gw, 4); + if (rta[RTA_SRC-1]) + memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4); + if (rta[RTA_DST-1]) + memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4); + if (rta[RTA_GATEWAY-1]) + memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4); new_r->r_src_len = rtm->rtm_src_len; new_r->r_dst_len = rtm->rtm_dst_len; new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len); @@ -146,14 +153,15 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) new_r->r_tos = rtm->rtm_tos; new_r->r_action = rtm->rtm_type; new_r->r_flags = rtm->rtm_flags; - if (rta->rta_priority) - new_r->r_preference = *rta->rta_priority; + if (rta[RTA_PRIORITY-1]) + memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4); new_r->r_table = table_id; - if (rta->rta_ifname) { + if (rta[RTA_IFNAME-1]) { struct device *dev; - memcpy(new_r->r_ifname, rta->rta_ifname, IFNAMSIZ); + memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IFNAME-1]), IFNAMSIZ); + new_r->r_ifname[IFNAMSIZ-1] = 0; new_r->r_ifindex = -1; - dev = dev_get(rta->rta_ifname); + dev = dev_get(new_r->r_ifname); if (dev) new_r->r_ifindex = dev->ifindex; } @@ -314,9 +322,11 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb, rtm->rtm_table = r->r_table; rtm->rtm_protocol = 0; rtm->rtm_scope = 0; +#ifdef CONFIG_RTNL_OLD_IFINFO rtm->rtm_nhs = 0; - rtm->rtm_type = r->r_action; rtm->rtm_optlen = 0; +#endif + rtm->rtm_type = r->r_action; rtm->rtm_flags = r->r_flags; if (r->r_dst_len) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8f3e70cad..3883fcba0 100644 --- a/net/ipv4/fib_semantics.c +++ 
b/net/ipv4/fib_semantics.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.5 1997/10/10 22:40:50 davem Exp $ + * Version: $Id: fib_semantics.c,v 1.6 1997/12/13 21:52:49 kuznet Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -120,6 +120,7 @@ extern __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info * for_nexthops(fi) { if (nh->nh_oif != onh->nh_oif || nh->nh_gw != onh->nh_gw || + nh->nh_scope != onh->nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || #endif @@ -177,13 +178,38 @@ static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type) return 0; } +#ifndef CONFIG_RTNL_OLD_IFINFO +static int +fib_count_nexthops(struct rtattr *rta) +{ + int nhs = 0; + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + while (nhlen >= sizeof(struct rtnexthop)) { + if ((nhlen -= nhp->rtnh_len) < 0) + return 0; + nhs++; + nhp = RTNH_NEXT(nhp); + }; + return nhs; +} +#endif + +#ifdef CONFIG_RTNL_OLD_IFINFO static int fib_get_nhs(struct fib_info *fi, const struct nlmsghdr *nlh, const struct rtmsg *r) { struct rtnexthop *nhp = RTM_RTNH(r); int nhlen = RTM_NHLEN(nlh, r); +#else +static int +fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r) +{ + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); +#endif -printk("get nhs %d/%d\n", r->rtm_nhs, nhlen); change_nexthops(fi) { int attrlen = nhlen - sizeof(struct rtnexthop); if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) @@ -193,8 +219,6 @@ printk("get nhs %d/%d\n", r->rtm_nhs, nhlen); nh->nh_weight = nhp->rtnh_hops + 1; if (attrlen) nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); -printk("Got nh: via %08x dev %d w %d fl %02x\n", nh->nh_gw, nh->nh_oif, - nh->nh_weight, nh->nh_flags); nhp = RTNH_NEXT(nhp); } endfor_nexthops(fi); return 0; @@ -218,11 +242,18 @@ int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct 
kern_rta *rta, } #ifdef CONFIG_IP_ROUTE_MULTIPATH +#ifdef CONFIG_RTNL_OLD_IFINFO if (r->rtm_nhs == 0) return 0; nhp = RTM_RTNH(r); nhlen = RTM_NHLEN(nlh, r); +#else + if (rta->rta_mp == NULL) + return 0; + nhp = RTA_DATA(rta->rta_mp); + nhlen = RTA_PAYLOAD(rta->rta_mp); +#endif for_nexthops(fi) { int attrlen = nhlen - sizeof(struct rtnexthop); @@ -354,16 +385,28 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, struct fib_info *fi = NULL; struct fib_info *ofi; #ifdef CONFIG_IP_ROUTE_MULTIPATH +#ifdef CONFIG_RTNL_OLD_IFINFO int nhs = r->rtm_nhs ? : 1; #else + int nhs = 1; +#endif +#else const int nhs = 1; #endif /* Fast check to catch the most weird cases */ - if (fib_props[r->rtm_type].scope > r->rtm_scope) { - printk("Einval 1\n"); + if (fib_props[r->rtm_type].scope > r->rtm_scope) goto err_inval; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH +#ifndef CONFIG_RTNL_OLD_IFINFO + if (rta->rta_mp) { + nhs = fib_count_nexthops(rta->rta_mp); + if (nhs == 0) + goto err_inval; } +#endif +#endif fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); err = -ENOBUFS; @@ -374,18 +417,43 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, fi->fib_protocol = r->rtm_protocol; fi->fib_nhs = nhs; fi->fib_flags = r->rtm_flags; +#ifdef CONFIG_RTNL_OLD_IFINFO if (rta->rta_mtu) fi->fib_mtu = *rta->rta_mtu; if (rta->rta_rtt) fi->fib_rtt = *rta->rta_rtt; if (rta->rta_window) fi->fib_window = *rta->rta_window; +#else + if (rta->rta_mx) { + int attrlen = RTA_PAYLOAD(rta->rta_mx); + struct rtattr *attr = RTA_DATA(rta->rta_mx); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > FIB_MAX_METRICS) + goto failure; + fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } +#endif if (rta->rta_prefsrc) memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4); +#ifndef CONFIG_RTNL_OLD_IFINFO + if (rta->rta_mp) { +#else if (r->rtm_nhs) { +#endif #ifdef CONFIG_IP_ROUTE_MULTIPATH +#ifdef 
CONFIG_RTNL_OLD_IFINFO if ((err = fib_get_nhs(fi, nlh, r)) != 0) +#else + if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0) +#endif goto failure; if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) goto err_inval; @@ -416,7 +484,11 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, #endif if (fib_props[r->rtm_type].error) { +#ifndef CONFIG_RTNL_OLD_IFINFO + if (rta->rta_gw || rta->rta_oif || rta->rta_mp) +#else if (rta->rta_gw || rta->rta_oif || r->rtm_nhs) +#endif goto err_inval; goto link_it; } @@ -456,6 +528,15 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, link_it: if ((ofi = fib_find_info(fi)) != NULL) { + if (fi->fib_nh[0].nh_scope != ofi->fib_nh[0].nh_scope) { + printk("nh %d/%d gw=%08x/%08x dev=%s/%s\n", + fi->fib_nh[0].nh_scope, + ofi->fib_nh[0].nh_scope, + fi->fib_nh[0].nh_gw, + ofi->fib_nh[0].nh_gw, + fi->fib_nh[0].nh_dev->name, + ofi->fib_nh[0].nh_dev->name); + } kfree(fi); ofi->fib_refcnt++; return ofi; @@ -543,7 +624,9 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, struct rtmsg *rtm; struct nlmsghdr *nlh; unsigned char *b = skb->tail; +#ifdef CONFIG_RTNL_OLD_IFINFO unsigned char *o; +#endif nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); rtm = NLMSG_DATA(nlh); @@ -555,18 +638,33 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = scope; +#ifdef CONFIG_RTNL_OLD_IFINFO rtm->rtm_nhs = 0; o = skb->tail; +#endif if (rtm->rtm_dst_len) RTA_PUT(skb, RTA_DST, 4, dst); rtm->rtm_protocol = fi->fib_protocol; +#ifdef CONFIG_RTNL_OLD_IFINFO if (fi->fib_mtu) RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &fi->fib_mtu); if (fi->fib_window) RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &fi->fib_window); if (fi->fib_rtt) RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &fi->fib_rtt); +#else + if (fi->fib_mtu || fi->fib_window || fi->fib_rtt) { + int i; + struct rtattr *mx = (struct rtattr *)skb->tail; + RTA_PUT(skb, RTA_METRICS, 0, NULL); + 
for (i=0; i<FIB_MAX_METRICS; i++) { + if (fi->fib_metrics[i]) + RTA_PUT(skb, i+1, sizeof(unsigned), fi->fib_metrics + i); + } + mx->rta_len = skb->tail - (u8*)mx; + } +#endif if (fi->fib_prefsrc) RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc); if (fi->fib_nhs == 1) { @@ -575,10 +673,18 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, if (fi->fib_nh->nh_oif) RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif); } +#ifdef CONFIG_RTNL_OLD_IFINFO rtm->rtm_optlen = skb->tail - o; +#endif #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { struct rtnexthop *nhp; +#ifndef CONFIG_RTNL_OLD_IFINFO + struct rtattr *mp_head; + if (skb_tailroom(skb) <= RTA_SPACE(0)) + goto rtattr_failure; + mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0)); +#endif for_nexthops(fi) { if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) goto rtattr_failure; @@ -589,8 +695,14 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, if (nh->nh_gw) RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw); nhp->rtnh_len = skb->tail - (unsigned char*)nhp; +#ifdef CONFIG_RTNL_OLD_IFINFO rtm->rtm_nhs++; +#endif } endfor_nexthops(fi); +#ifndef CONFIG_RTNL_OLD_IFINFO + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; +#endif } #endif nlh->nlmsg_len = skb->tail - b; @@ -598,7 +710,7 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, nlmsg_failure: rtattr_failure: - skb_put(skb, b - skb->tail); + skb_trim(skb, b - skb->data); return -1; } @@ -648,10 +760,8 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, nl->nlmsg_flags = 0; } else { nl->nlmsg_type = RTM_NEWROUTE; - nl->nlmsg_flags = NLM_F_CREATE; + nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE; rtm->rtm_protocol = RTPROT_BOOT; - if (plen != 0) - nl->nlmsg_flags |= NLM_F_REPLACE; } rtm->rtm_dst_len = plen; @@ -704,7 +814,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, ptr = &((struct 
sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr; if (r->rt_gateway.sa_family == AF_INET && *ptr) { rta->rta_gw = ptr; - if (r->rt_flags&RTF_GATEWAY) + if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST) rtm->rtm_scope = RT_SCOPE_UNIVERSE; } @@ -714,6 +824,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL) return -EINVAL; +#ifdef CONFIG_RTNL_OLD_IFINFO /* Ugly conversion from rtentry types to unsigned */ if (r->rt_flags&RTF_IRTT) { @@ -730,6 +841,10 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, if (sizeof(*rta->rta_mtu) != sizeof(r->rt_mtu)) *rta->rta_mtu = r->rt_mtu; } +#else + if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) + printk(KERN_DEBUG "SIOCRT*: mtu/window/irtt are not implemnted.\n"); +#endif return 0; } @@ -742,9 +857,13 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, - device went down -> we must shutdown all nexthops going via it. */ -int fib_sync_down(u32 local, struct device *dev) +int fib_sync_down(u32 local, struct device *dev, int force) { int ret = 0; + int scope = RT_SCOPE_NOWHERE; + + if (force) + scope = -1; for_fib_info() { if (local && fi->fib_prefsrc == local) { @@ -757,7 +876,7 @@ int fib_sync_down(u32 local, struct device *dev) if (nh->nh_flags&RTNH_F_DEAD) dead++; else if (nh->nh_dev == dev && - nh->nh_scope != RT_SCOPE_NOWHERE) { + nh->nh_scope != scope) { nh->nh_flags |= RTNH_F_DEAD; #ifdef CONFIG_IP_ROUTE_MULTIPATH fi->fib_power -= nh->nh_power; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 77d96acf9..b2c7151d1 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, <alan@cymru.net> * - * Version: $Id: icmp.c,v 1.3 1997/12/16 05:37:35 ralf Exp $ + * Version: $Id: icmp.c,v 1.4 1998/03/03 01:23:37 ralf Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -877,9 +877,9 @@ static void 
icmp_address_reply(struct icmphdr *icmph, struct sk_buff *skb, int l struct in_ifaddr *ifa; u32 mask; - if (!ipv4_config.log_martians || - !IS_ROUTER || - !in_dev || !in_dev->ifa_list || + if (!in_dev || !in_dev->ifa_list || + !IN_DEV_LOG_MARTIANS(in_dev) || + !IN_DEV_FORWARD(in_dev) || len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) return; @@ -1007,7 +1007,7 @@ int icmp_rcv(struct sk_buff *skb, unsigned short len) (icmp_pointers[icmph->type].handler)(icmph, skb, len); drop: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; error: icmp_statistics.IcmpInErrors++; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1c59f5462..166b68b42 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,7 +8,7 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * - * Version: $Id: igmp.c,v 1.22 1997/10/29 20:27:24 kuznet Exp $ + * Version: $Id: igmp.c,v 1.3 1997/12/16 05:37:36 ralf Exp $ * * Authors: * Alan Cox <Alan.Cox@linux.org> @@ -117,7 +117,7 @@ * contradict to specs provided this delay is small enough. 
*/ -#define IGMP_V1_SEEN(in_dev) ((in_dev)->mr_v1_seen && jiffies - (in_dev)->mr_v1_seen < 0) +#define IGMP_V1_SEEN(in_dev) ((in_dev)->mr_v1_seen && (long)(jiffies - (in_dev)->mr_v1_seen) < 0) /* * Timer management @@ -131,19 +131,12 @@ static __inline__ void igmp_stop_timer(struct ip_mc_list *im) } } -extern __inline__ unsigned int random(void) -{ - static unsigned long seed=152L; - seed=seed*69069L+1; - return seed^jiffies; -} - static __inline__ void igmp_start_timer(struct ip_mc_list *im, int max_delay) { int tv; if (im->tm_running) return; - tv=random() % max_delay; + tv=net_random() % max_delay; im->timer.expires=jiffies+tv+2; im->tm_running=1; add_timer(&im->timer); @@ -186,7 +179,6 @@ static int igmp_send_report(struct device *dev, u32 group, int type) skb->dst = &rt->u.dst; skb_reserve(skb, (dev->hard_header_len+15)&~15); - ip_ll_header(skb); skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); @@ -294,7 +286,7 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti if (LOCAL_MCAST(im->multiaddr)) continue; im->unsolicit_count = 0; - if (im->tm_running && im->timer.expires-jiffies > max_delay) + if (im->tm_running && (long)(im->timer.expires-jiffies) > max_delay) igmp_stop_timer(im); igmp_start_timer(im, max_delay); } @@ -308,7 +300,7 @@ int igmp_rcv(struct sk_buff *skb, unsigned short len) if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len) || in_dev==NULL) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -336,28 +328,12 @@ int igmp_rcv(struct sk_buff *skb, unsigned short len) default: NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } #endif -/* - * Map a multicast IP onto multicast MAC for type ethernet. 
- */ - -extern __inline__ void ip_mc_map(u32 addr, char *buf) -{ - addr=ntohl(addr); - buf[0]=0x01; - buf[1]=0x00; - buf[2]=0x5e; - buf[5]=addr&0xFF; - addr>>=8; - buf[4]=addr&0xFF; - addr>>=8; - buf[3]=addr&0x7F; -} /* * Add a filter to a device @@ -365,15 +341,18 @@ extern __inline__ void ip_mc_map(u32 addr, char *buf) static void ip_mc_filter_add(struct in_device *in_dev, u32 addr) { - char buf[6]; + char buf[MAX_ADDR_LEN]; struct device *dev = in_dev->dev; - if (!(dev->flags & IFF_MULTICAST)) - return; - if (dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) - return; /* Only do ethernet or FDDI for now */ - ip_mc_map(addr, buf); - dev_mc_add(dev,buf,ETH_ALEN,0); + /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. + We will get multicast token leakage, when IFF_MULTICAST + is changed. This check should be done in dev->set_multicast_list + routine. Something sort of: + if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } + --ANK + */ + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_add(dev,buf,dev->addr_len,0); } /* @@ -382,18 +361,19 @@ static void ip_mc_filter_add(struct in_device *in_dev, u32 addr) static void ip_mc_filter_del(struct in_device *in_dev, u32 addr) { - char buf[6]; + char buf[MAX_ADDR_LEN]; struct device *dev = in_dev->dev; - if (dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) - return; /* Only do ethernet or FDDI for now */ - ip_mc_map(addr,buf); - dev_mc_delete(dev,buf,ETH_ALEN,0); + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_delete(dev,buf,dev->addr_len,0); } static void igmp_group_dropped(struct ip_mc_list *im) { - ip_mc_filter_del(im->interface, im->multiaddr); + if (im->loaded) { + im->loaded = 0; + ip_mc_filter_del(im->interface, im->multiaddr); + } #ifdef CONFIG_IP_MULTICAST if (LOCAL_MCAST(im->multiaddr)) @@ -410,7 +390,10 @@ static void igmp_group_dropped(struct ip_mc_list *im) static void igmp_group_added(struct ip_mc_list *im) { - ip_mc_filter_add(im->interface, im->multiaddr); + if (im->loaded == 0) { + 
im->loaded = 1; + ip_mc_filter_add(im->interface, im->multiaddr); + } #ifdef CONFIG_IP_MULTICAST if (LOCAL_MCAST(im->multiaddr)) @@ -458,13 +441,13 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) im->timer.function=&igmp_timer_expire; im->unsolicit_count = IGMP_Unsolicited_Report_Count; im->reporter = 0; + im->loaded = 0; #endif im->next=in_dev->mc_list; in_dev->mc_list=im; - if (in_dev->dev->flags & IFF_UP) { - igmp_group_added(im); + igmp_group_added(im); + if (in_dev->dev->flags & IFF_UP) ip_rt_multicast_event(in_dev); - } return; } @@ -480,10 +463,9 @@ int ip_mc_dec_group(struct in_device *in_dev, u32 addr) if (i->multiaddr==addr) { if (--i->users == 0) { *ip = i->next; - if (in_dev->dev->flags & IFF_UP) { - igmp_group_dropped(i); + igmp_group_dropped(i); + if (in_dev->dev->flags & IFF_UP) ip_rt_multicast_event(in_dev); - } kfree_s(i, sizeof(*i)); } return 0; @@ -500,6 +482,8 @@ void ip_mc_down(struct in_device *in_dev) for (i=in_dev->mc_list; i; i=i->next) igmp_group_dropped(i); + + ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } /* Device going up */ @@ -508,6 +492,8 @@ void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *i; + ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); + for (i=in_dev->mc_list; i; i=i->next) igmp_group_added(i); } @@ -522,19 +508,11 @@ void ip_mc_destroy_dev(struct in_device *in_dev) while ((i = in_dev->mc_list) != NULL) { in_dev->mc_list = i->next; + igmp_group_dropped(i); kfree_s(i, sizeof(*i)); } } -/* Initialize multicasting on an IP interface */ - -void ip_mc_init_dev(struct in_device *in_dev) -{ - in_dev->mc_list = NULL; - in_dev->mr_v1_seen = 0; - ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); -} - static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) { struct rtable *rt; @@ -697,9 +675,10 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum begin=pos; } if(pos>offset+length) - break; + goto done; } } +done: *start=buffer+(offset-begin); len-=(offset-begin); if(len>length) diff --git 
a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 7010e3a30..45a2ed588 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,7 +5,7 @@ * * The IP forwarding functionality. * - * Version: $Id: ip_forward.c,v 1.2 1997/12/16 05:37:36 ralf Exp $ + * Version: $Id: ip_forward.c,v 1.3 1998/03/03 01:23:37 ralf Exp $ * * Authors: see ip.c * @@ -18,6 +18,7 @@ * use output device for accounting. * Jos Vos : Call forward firewall after routing * (always use output device). + * Mike McLagan : Routing by source */ #include <linux/config.h> @@ -112,7 +113,7 @@ int ip_forward(struct sk_buff *skb) if (ip_decrease_ttl(iph) <= 0) goto too_many_hops; - if (opt->is_strictroute && (rt->rt_flags&RTF_GATEWAY)) + if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; /* @@ -141,51 +142,46 @@ int ip_forward(struct sk_buff *skb) * If the indicated interface is up and running, kick it. */ - if (dev2->flags & IFF_UP) { - if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF)) - goto frag_needed; + if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF)) + goto frag_needed; #ifdef CONFIG_IP_ROUTE_NAT - if (rt->rt_flags & RTCF_NAT) { - if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) { - struct sk_buff *skb2; - skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15); - kfree_skb(skb, FREE_WRITE); - skb = skb2; - } - if (ip_do_nat(skb)) { - kfree_skb(skb, FREE_WRITE); + if (rt->rt_flags & RTCF_NAT) { + if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) { + struct sk_buff *skb2; + skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15); + kfree_skb(skb); + if (skb2 == NULL) return -1; - } + skb = skb2; } + if (ip_do_nat(skb)) { + kfree_skb(skb); + return -1; + } + } #endif #ifdef CONFIG_IP_MASQUERADE - if(!(IPCB(skb)->flags&IPSKB_MASQUERADED)) { - - if (rt->rt_flags&RTCF_VALVE) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PKT_FILTERED, 0); - kfree_skb(skb, FREE_READ); - return -1; - } - - /* - * Check that any ICMP 
packets are not for a - * masqueraded connection. If so rewrite them - * and skip the firewall checks - */ - if (iph->protocol == IPPROTO_ICMP) { - __u32 maddr; + if(!(IPCB(skb)->flags&IPSKB_MASQUERADED)) { + /* + * Check that any ICMP packets are not for a + * masqueraded connection. If so rewrite them + * and skip the firewall checks + */ + if (iph->protocol == IPPROTO_ICMP) { + __u32 maddr; #ifdef CONFIG_IP_MASQUERADE_ICMP -#define icmph ((struct icmphdr *)((char *)iph + (iph->ihl<<2))) - if ((icmph->type==ICMP_DEST_UNREACH)|| - (icmph->type==ICMP_SOURCE_QUENCH)|| - (icmph->type==ICMP_TIME_EXCEEDED)) - { + struct icmphdr *icmph = (struct icmphdr *)((char*)iph + (iph->ihl << 2)); + if ((icmph->type==ICMP_DEST_UNREACH)|| + (icmph->type==ICMP_SOURCE_QUENCH)|| + (icmph->type==ICMP_TIME_EXCEEDED)) + { #endif maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE); - if (fw_res = ip_fw_masq_icmp(&skb, maddr) < 0) { - kfree_skb(skb, FREE_READ); + fw_res = ip_fw_masq_icmp(&skb, maddr); + if (fw_res < 0) { + kfree_skb(skb); return -1; } @@ -195,9 +191,9 @@ int ip_forward(struct sk_buff *skb) #ifdef CONFIG_IP_MASQUERADE_ICMP } #endif - } - if (rt->rt_flags&RTCF_MASQ) - goto skip_call_fw_firewall; + } + if (rt->rt_flags&RTCF_MASQ) + goto skip_call_fw_firewall; #endif /* CONFIG_IP_MASQUERADE */ #ifdef CONFIG_FIREWALL @@ -210,32 +206,32 @@ int ip_forward(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); /* fall thru */ default: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -1; } #endif #ifdef CONFIG_IP_MASQUERADE - } + } skip_call_fw_firewall: - /* - * If this fragment needs masquerading, make it so... - * (Don't masquerade de-masqueraded fragments) - */ - if (!(IPCB(skb)->flags&IPSKB_MASQUERADED) && - (fw_res==FW_MASQUERADE || rt->rt_flags&RTCF_MASQ)) { - u32 maddr; + /* + * If this fragment needs masquerading, make it so... 
+ * (Don't masquerade de-masqueraded fragments) + */ + if (!(IPCB(skb)->flags&IPSKB_MASQUERADED) && + (fw_res==FW_MASQUERADE || rt->rt_flags&RTCF_MASQ)) { + u32 maddr; #ifdef CONFIG_IP_ROUTE_NAT - maddr = (rt->rt_flags&RTCF_MASQ) ? rt->rt_src_map : 0; + maddr = (rt->rt_flags&RTCF_MASQ) ? rt->rt_src_map : 0; - if (maddr == 0) + if (maddr == 0) #endif maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE); if (ip_fw_masquerade(&skb, maddr) < 0) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -1; } else { /* @@ -244,48 +240,55 @@ skip_call_fw_firewall: iph = skb->nh.iph; opt = &(IPCB(skb)->opt); } - } + } #endif - if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) { - struct sk_buff *skb2; - skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15); - kfree_skb(skb, FREE_WRITE); + if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) { + struct sk_buff *skb2; + skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15); + kfree_skb(skb); - if (skb2 == NULL) { - NETDEBUG(printk(KERN_ERR "\nIP: No memory available for IP forward\n")); - return -1; - } - skb = skb2; - iph = skb2->nh.iph; + if (skb2 == NULL) { + NETDEBUG(printk(KERN_ERR "\nIP: No memory available for IP forward\n")); + return -1; } + skb = skb2; + iph = skb2->nh.iph; + } #ifdef CONFIG_FIREWALL - if ((fw_res = call_out_firewall(PF_INET, dev2, iph, NULL,&skb)) < FW_ACCEPT) { - /* FW_ACCEPT and FW_MASQUERADE are treated equal: - masquerading is only supported via forward rules */ - if (fw_res == FW_REJECT) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); - kfree_skb(skb,FREE_WRITE); - return -1; - } + if ((fw_res = call_out_firewall(PF_INET, dev2, iph, NULL,&skb)) < FW_ACCEPT) { + /* FW_ACCEPT and FW_MASQUERADE are treated equal: + masquerading is only supported via forward rules */ + if (fw_res == FW_REJECT) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + kfree_skb(skb); + return -1; + } #endif - 
ip_statistics.IpForwDatagrams++; + ip_statistics.IpForwDatagrams++; - if (opt->optlen == 0) { - ip_send(skb); - return 0; + if (opt->optlen == 0) { +#ifdef CONFIG_NET_FASTROUTE + if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) { + unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK; + /* Time to switch to functional programming :-) */ + dst_release(xchg(&skb->dev->fastpath[h], dst_clone(&rt->u.dst))); } - ip_forward_options(skb); +#endif ip_send(skb); + return 0; } + + ip_forward_options(skb); + ip_send(skb); return 0; #ifdef CONFIG_TRANSPARENT_PROXY local_pkt: -#endif return ip_local_deliver(skb); +#endif frag_needed: ip_statistics.IpFragFails++; @@ -303,6 +306,6 @@ too_many_hops: /* Tell the sender its packet died... */ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); return -1; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 637fe022e..9dccb5324 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.29 1997/11/22 12:31:05 freitag Exp $ + * Version: $Id: ip_fragment.c,v 1.30 1997/12/29 19:52:32 kuznet Exp $ * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> @@ -15,6 +15,7 @@ * David S. Miller : Begin massive cleanup... * Andi Kleen : Add sysctls. * xxxx : Overlapfrag bug. + * Ultima : ip_expire() kernel panic. */ #include <linux/types.h> @@ -32,7 +33,6 @@ #include <linux/inet.h> #include <linux/firewall.h> #include <linux/ip_fw.h> -#include <net/checksum.h> /* Fragment cache limits. We will commit 256K at one time. Should we * cross that limit we will prune down to 192K. This should cope with @@ -79,10 +79,10 @@ atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */ char *in_ntoa(__u32 in); /* Memory Tracking Functions. 
*/ -extern __inline__ void frag_kfree_skb(struct sk_buff *skb, int type) +extern __inline__ void frag_kfree_skb(struct sk_buff *skb) { atomic_sub(skb->truesize, &ip_frag_mem); - kfree_skb(skb,type); + kfree_skb(skb); } extern __inline__ void frag_kfree_s(void *ptr, int len) @@ -176,7 +176,7 @@ static void ip_free(struct ipq *qp) while (fp) { struct ipfrag *xp = fp->next; - frag_kfree_skb(fp->skb,FREE_READ); + frag_kfree_skb(fp->skb); frag_kfree_s(fp, sizeof(struct ipfrag)); fp = xp; } @@ -193,6 +193,15 @@ static void ip_expire(unsigned long arg) { struct ipq *qp = (struct ipq *) arg; + if(!qp->fragments) + { +#ifdef IP_EXPIRE_DEBUG + printk("warning: possible ip-expire attack\n"); +#endif + ip_free(qp); + return; + } + /* Send an ICMP "Fragment Reassembly Timeout" message. */ ip_statistics.IpReasmTimeout++; ip_statistics.IpReasmFails++; @@ -254,6 +263,7 @@ static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph) qp->dev = skb->dev; /* Start a timer for this entry. */ + init_timer(&qp->timer); qp->timer.expires = jiffies + sysctl_ipfrag_time; /* about 30 seconds */ qp->timer.data = (unsigned long) qp; /* pointer to queue */ qp->timer.function = ip_expire; /* expire function */ @@ -345,7 +355,7 @@ static struct sk_buff *ip_glue(struct ipq *qp) NETDEBUG(printk(KERN_ERR "Invalid fragment list: " "Fragment over size.\n")); ip_free(qp); - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); ip_statistics.IpReasmFails++; return NULL; } @@ -428,7 +438,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) } else { /* If we failed to create it, then discard the frame. 
*/ if ((qp = ip_create(skb, iph)) == NULL) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); ip_statistics.IpReasmFails++; return NULL; } @@ -438,7 +448,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) if(ntohs(iph->tot_len)+(int)offset>65535) { if (net_ratelimit()) printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n", NIPQUAD(iph->saddr)); - frag_kfree_skb(skb, FREE_READ); + frag_kfree_skb(skb); ip_statistics.IpReasmFails++; return NULL; } @@ -502,7 +512,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) /* We have killed the original next frame. */ next = tfp; - frag_kfree_skb(tmp->skb,FREE_READ); + frag_kfree_skb(tmp->skb); frag_kfree_s(tmp, sizeof(struct ipfrag)); } } @@ -513,7 +523,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) /* No memory to save the fragment - so throw the lot. */ if (!tfp) { - frag_kfree_skb(skb, FREE_READ); + frag_kfree_skb(skb); return NULL; } tfp->prev = prev; diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index 9f8123afd..d78aa0f66 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -6,7 +6,7 @@ * license in recognition of the original copyright. * -- Alan Cox. * - * $Id: ip_fw.c,v 1.29 1997/10/10 22:41:01 davem Exp $ + * $Id: ip_fw.c,v 1.3 1997/12/16 05:37:37 ralf Exp $ * * Ported from BSD to Linux, * Alan Cox 22/Nov/1994. 
@@ -90,7 +90,6 @@ #include <linux/sched.h> #include <linux/string.h> #include <linux/errno.h> -#include <linux/config.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -152,9 +151,12 @@ struct ip_fw *ip_fw_fwd_chain; struct ip_fw *ip_fw_in_chain; struct ip_fw *ip_fw_out_chain; struct ip_fw *ip_acct_chain; +struct ip_fw *ip_masq_chain; static struct ip_fw **chains[] = - {&ip_fw_fwd_chain, &ip_fw_in_chain, &ip_fw_out_chain, &ip_acct_chain}; + {&ip_fw_fwd_chain, &ip_fw_in_chain, &ip_fw_out_chain, &ip_acct_chain, + &ip_masq_chain + }; #endif /* CONFIG_IP_ACCT || CONFIG_IP_FIREWALL */ #ifdef CONFIG_IP_FIREWALL @@ -578,7 +580,7 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, __u16 *redirport, struct ip_ skb_put(skb,len); memcpy(skb->data,ip,len); if(netlink_post(NETLINK_FIREWALL, skb)) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } } #endif diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index dbd62e27e..04fde6120 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -452,7 +452,7 @@ void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len) /* Try to guess incoming interface */ if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; } skb2->dev = rt->u.dst.dev; @@ -464,14 +464,14 @@ void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len) if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; } } else { ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || skb2->dst->dev->type != ARPHRD_IPGRE) { - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; } } @@ -479,7 +479,7 @@ void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len) /* change mtu on this route */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { if (rel_info > skb2->dst->pmtu) { - kfree_skb(skb2, FREE_WRITE); + 
kfree_skb(skb2); return; } skb2->dst->pmtu = rel_info; @@ -493,7 +493,7 @@ void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len) } icmp_send(skb2, rel_type, rel_code, rel_info); - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); #endif } @@ -554,7 +554,7 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len) } if (tunnel->parms.i_flags&GRE_SEQ) { if (!(flags&GRE_SEQ) || - (tunnel->i_seqno && seqno - tunnel->i_seqno < 0)) { + (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { tunnel->stat.rx_fifo_errors++; tunnel->stat.rx_errors++; goto drop; @@ -572,7 +572,7 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); drop: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return(0); } @@ -622,12 +622,12 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { struct in6_addr *addr6; int addr_type; - struct nd_neigh *neigh = (struct nd_neigh *) skb->dst->neighbour; + struct neighbour *neigh = skb->dst->neighbour; if (neigh == NULL) goto tx_error; - addr6 = &neigh->ndn_addr; + addr6 = (struct in6_addr*)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { @@ -704,12 +704,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { tunnel->err_count--; - if (skb->protocol == __constant_htons(ETH_P_IP)) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); -#ifdef CONFIG_IPV6 - else if (skb->protocol == __constant_htons(ETH_P_IPV6)) - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, dev); -#endif + dst_link_failure(skb); } else tunnel->err_count = 0; } @@ -723,11 +718,11 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) if (!new_skb) { ip_rt_put(rt); stats->tx_dropped++; - dev_kfree_skb(skb, FREE_WRITE); + dev_kfree_skb(skb); tunnel->recursion--; return 0; } - dev_kfree_skb(skb, 
FREE_WRITE); + dev_kfree_skb(skb); skb = new_skb; } @@ -792,16 +787,11 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) return 0; tx_error_icmp: - if (skb->protocol == __constant_htons(ETH_P_IP)) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); -#ifdef CONFIG_IPV6 - else if (skb->protocol == __constant_htons(ETH_P_IPV6)) - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, dev); -#endif + dst_link_failure(skb); tx_error: stats->tx_errors++; - dev_kfree_skb(skb, FREE_WRITE); + dev_kfree_skb(skb); tunnel->recursion--; return 0; } @@ -962,28 +952,6 @@ static int ipgre_header(struct sk_buff *skb, struct device *dev, unsigned short return -t->hlen; } -static int ipgre_rebuild_header(struct sk_buff *skb) -{ - struct device *dev = skb->dev; - struct iphdr *iph = (struct iphdr *)skb->data; - u16 *p = (u16*)(iph + 1); - struct neighbour *neigh = NULL; - - if (skb->dst) - neigh = skb->dst->neighbour; - - if (neigh) - return neigh->ops->resolve((void*)&iph->daddr, skb); - - if (p[1] == __constant_htons(ETH_P_IP)) - return arp_find((void*)&iph->daddr, skb); - - if (net_ratelimit()) - printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n", - dev->name, (int)p[1]); - return 0; -} - static int ipgre_open(struct device *dev) { struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; @@ -1076,7 +1044,6 @@ static int ipgre_tunnel_init(struct device *dev) return -EINVAL; dev->flags = IFF_BROADCAST; dev->hard_header = ipgre_header; - dev->rebuild_header = ipgre_rebuild_header; dev->open = ipgre_open; dev->stop = ipgre_close; } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1c3c2da7a..61c364542 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.24 1997/10/24 17:15:58 kuznet Exp $ + * Version: $Id: ip_input.c,v 1.2 1997/12/16 05:37:38 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -97,6 +97,7 @@ * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support + * Mike McLagan : Routing by source * * * @@ -257,7 +258,7 @@ int ip_local_deliver(struct sk_buff *skb) { int ret = ip_fw_demasquerade(&skb); if (ret < 0) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -267,7 +268,7 @@ int ip_local_deliver(struct sk_buff *skb) dst_release(skb->dst); skb->dst = NULL; if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } return skb->dst->input(skb); @@ -312,7 +313,7 @@ int ip_local_deliver(struct sk_buff *skb) if(ipsec_sk_policy(raw_sk,skb1)) raw_rcv(raw_sk, skb1); else - kfree_skb(skb1, FREE_WRITE); + kfree_skb(skb1); } } raw_sk = sknext; @@ -375,12 +376,12 @@ int ip_local_deliver(struct sk_buff *skb) if(ipsec_sk_policy(raw_sk, skb)) raw_rcv(raw_sk, skb); else - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } else if (!flag) /* Free and report errors */ { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } return(0); @@ -422,7 +423,9 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) */ if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4 +#ifndef CONFIG_IP_ROUTER || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0 +#endif || skb->len < ntohs(iph->tot_len)) goto inhdr_error; @@ -462,18 +465,18 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) opt = &(IPCB(skb)->opt); if (opt->srr) { - if (!ipv4_config.source_route) { - if (ipv4_config.log_martians && net_ratelimit()) + struct in_device *in_dev = dev->ip_ptr; + if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) { + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_INFO "source route option %08lx -> %08lx\n", ntohl(iph->saddr), ntohl(iph->daddr)); goto drop; } - 
if (((struct rtable*)skb->dst)->rt_type == RTN_LOCAL && - ip_options_rcv_srr(skb)) + if (ip_options_rcv_srr(skb)) goto drop; } } - + /* * See if the firewall wants to dispose of the packet. */ @@ -501,7 +504,7 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) inhdr_error: ip_statistics.IpInHdrErrors++; drop: - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return(0); } diff --git a/net/ipv4/ip_masq_app.c b/net/ipv4/ip_masq_app.c index 814da2aa8..8772bd58c 100644 --- a/net/ipv4/ip_masq_app.c +++ b/net/ipv4/ip_masq_app.c @@ -569,7 +569,7 @@ static struct sk_buff * skb_replace(struct sk_buff *skb, int pri, char *o_buf, i * preferably inplace */ - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } return n_skb; } diff --git a/net/ipv4/ip_masq_ftp.c b/net/ipv4/ip_masq_ftp.c index 5313f4429..1d8edb253 100644 --- a/net/ipv4/ip_masq_ftp.c +++ b/net/ipv4/ip_masq_ftp.c @@ -37,7 +37,6 @@ * */ -#include <linux/config.h> #include <linux/module.h> #include <asm/system.h> #include <linux/types.h> diff --git a/net/ipv4/ip_masq_irc.c b/net/ipv4/ip_masq_irc.c index 6668efdaf..c13ca6e9a 100644 --- a/net/ipv4/ip_masq_irc.c +++ b/net/ipv4/ip_masq_irc.c @@ -40,7 +40,6 @@ * */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> diff --git a/net/ipv4/ip_masq_quake.c b/net/ipv4/ip_masq_quake.c index fb0978175..165dd6bd5 100644 --- a/net/ipv4/ip_masq_quake.c +++ b/net/ipv4/ip_masq_quake.c @@ -21,7 +21,6 @@ * */ -#include <linux/config.h> #include <linux/module.h> #include <asm/system.h> #include <linux/types.h> diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c index d68be7555..f7e28f21a 100644 --- a/net/ipv4/ip_masq_raudio.c +++ b/net/ipv4/ip_masq_raudio.c @@ -62,7 +62,6 @@ * */ -#include <linux/config.h> #include <linux/module.h> #include <asm/system.h> #include <linux/types.h> diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 14b423f2f..53c680eed 100644 --- a/net/ipv4/ip_options.c +++ 
b/net/ipv4/ip_options.c @@ -5,7 +5,7 @@ * * The options processing module for ip.c * - * Version: $Id: ip_options.c,v 1.12 1997/10/10 22:41:08 davem Exp $ + * Version: $Id: ip_options.c,v 1.2 1997/12/16 05:37:40 ralf Exp $ * * Authors: A.N.Kuznetsov * @@ -452,7 +452,7 @@ eol: error: if (skb) { icmp_send(skb, ICMP_PARAMETERPROB, 0, pp_ptr-iph); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } return -EINVAL; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4ed7f7638..ac4ac22ae 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.3 1997/12/16 05:37:41 ralf Exp $ + * Version: $Id: ip_output.c,v 1.4 1998/03/03 01:23:41 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -26,9 +26,11 @@ * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit * (in case if packet not accepted by * output firewall rules) + * Mike McLagan : Routing by source * Alexey Kuznetsov: use new route cache * Andi Kleen: Fix broken PMTU recovery and remove * some redundant tests. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. 
*/ #include <asm/uaccess.h> @@ -76,13 +78,6 @@ int sysctl_ip_dynaddr = 0; -static void __inline__ ip_ll_header_reserve(struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable*)skb->dst; - skb_reserve(skb, (rt->u.dst.dev->hard_header_len+15)&~15); - ip_ll_header(skb); -} - int ip_id_count = 0; @@ -98,26 +93,22 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, daddr = opt->faddr; err = ip_route_output(&rt, daddr, saddr, RT_TOS(sk->ip_tos) | - (sk->localroute||0), sk->bound_dev_if); + RTO_CONN | sk->localroute, sk->bound_dev_if); if (err) { ip_statistics.IpOutNoRoutes++; return err; } - if (opt && opt->is_strictroute && rt->rt_flags&RTF_GATEWAY) { + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { ip_rt_put(rt); ip_statistics.IpOutNoRoutes++; return -ENETUNREACH; } skb->dst = dst_clone(&rt->u.dst); + skb_reserve(skb, (rt->u.dst.dev->hard_header_len+15)&~15); - skb->dev = rt->u.dst.dev; - skb->arp = 0; - - ip_ll_header_reserve(skb); - /* * Now build the IP header. 
*/ @@ -136,7 +127,7 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, iph->tos = sk->ip_tos; iph->frag_off = 0; if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->rt_flags & RTCF_NOPMTUDISC)) + !(rt->u.dst.mxlock&(1<<RTAX_MTU))) iph->frag_off |= htons(IP_DF); iph->ttl = sk->ip_ttl; iph->daddr = rt->rt_dst; @@ -178,13 +169,13 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) sk->dst_cache = NULL; ip_rt_put(rt); err = ip_route_output(&rt, daddr, sk->saddr, RT_TOS(sk->ip_tos) | - (sk->localroute||0), sk->bound_dev_if); + RTO_CONN | sk->localroute, sk->bound_dev_if); if (err) return err; sk->dst_cache = &rt->u.dst; } - if (opt && opt->is_strictroute && rt->rt_flags&RTF_GATEWAY) { + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { sk->dst_cache = NULL; ip_rt_put(rt); ip_statistics.IpOutNoRoutes++; @@ -192,11 +183,7 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) } skb->dst = dst_clone(sk->dst_cache); - - skb->dev = rt->u.dst.dev; - skb->arp = 0; skb_reserve(skb, MAX_HEADER); - skb->mac.raw = skb->data; /* * Now build the IP header. 
@@ -216,7 +203,7 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) iph->tos = sk->ip_tos; iph->frag_off = 0; if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->rt_flags & RTCF_NOPMTUDISC)) + !(rt->u.dst.mxlock&(1<<RTAX_MTU))) iph->frag_off |= htons(IP_DF); iph->ttl = sk->ip_ttl; iph->daddr = rt->rt_dst; @@ -234,6 +221,11 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) return 0; } +int __ip_finish_output(struct sk_buff *skb) +{ + return ip_finish_output(skb); +} + int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; @@ -245,14 +237,14 @@ int ip_mc_output(struct sk_buff *skb) */ ip_statistics.IpOutRequests++; -#ifdef CONFIG_IP_ACCT - ip_fw_chk(skb->nh.iph, skb->dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); -#endif #ifdef CONFIG_IP_ROUTE_NAT if (rt->rt_flags & RTCF_NAT) ip_do_nat(skb); #endif + skb->dev = dev; + skb->protocol = __constant_htons(ETH_P_IP); + /* * Multicasts are looped back for other local users */ @@ -279,9 +271,9 @@ int ip_mc_output(struct sk_buff *skb) dev_loopback_xmit(skb); /* Multicasts with ttl 0 must not go beyond the host */ - + if (skb->nh.iph->ttl == 0) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } } @@ -296,44 +288,23 @@ int ip_mc_output(struct sk_buff *skb) dev_loopback_xmit(skb); } - if (dev->flags & IFF_UP) { - dev_queue_xmit(skb); - return 0; - } - ip_statistics.IpOutDiscards++; - - kfree_skb(skb, FREE_WRITE); - return -ENETDOWN; + return ip_finish_output(skb); } int ip_output(struct sk_buff *skb) { +#ifdef CONFIG_IP_ROUTE_NAT struct rtable *rt = (struct rtable*)skb->dst; - struct device *dev = rt->u.dst.dev; +#endif - /* - * If the indicated interface is up and running, send the packet. 
- */ - ip_statistics.IpOutRequests++; -#ifdef CONFIG_IP_ACCT - ip_fw_chk(skb->nh.iph, skb->dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); -#endif - #ifdef CONFIG_IP_ROUTE_NAT if (rt->rt_flags&RTCF_NAT) ip_do_nat(skb); #endif - if (dev->flags & IFF_UP) { - dev_queue_xmit(skb); - return 0; - } - ip_statistics.IpOutDiscards++; - - kfree_skb(skb, FREE_WRITE); - return -ENETDOWN; + return ip_finish_output(skb); } #ifdef CONFIG_IP_ACCT @@ -349,7 +320,7 @@ int ip_acct_output(struct sk_buff *skb) return 0; } -#endif +#endif /* * Generate a checksum for an outgoing IP datagram. @@ -364,12 +335,9 @@ void ip_send_check(struct iphdr *iph) /* - * Queues a packet to be sent, and starts the transmitter - * if necessary. if free = 1 then we free the block after - * transmit, otherwise we don't. If free==2 we not only - * free the block but also don't assign a new ip seq number. - * This routine also needs to put in the total length, - * and compute the checksum + * Queues a packet to be sent, and starts the transmitter if necessary. + * This routine also needs to put in the total length and compute the + * checksum */ void ip_queue_xmit(struct sk_buff *skb) @@ -380,26 +348,29 @@ void ip_queue_xmit(struct sk_buff *skb) unsigned int tot_len; struct iphdr *iph = skb->nh.iph; - /* - * Discard the surplus MAC header - */ - - skb_pull(skb, skb->nh.raw - skb->data); tot_len = skb->len; - iph->tot_len = htons(tot_len); iph->id = htons(ip_id_count++); - if (rt->u.dst.obsolete) - goto check_route; -after_check_route: + if (rt->u.dst.obsolete) { + /* Ugly... ugly... but what can I do? + Essentially it is "ip_reroute_output" function. 
--ANK + */ + struct rtable *nrt; + if (ip_route_output(&nrt, rt->key.dst, rt->key.src, + rt->key.tos | RTO_CONN, + sk?sk->bound_dev_if:0)) + goto drop; + skb->dst = &nrt->u.dst; + ip_rt_put(rt); + rt = nrt; + } + dev = rt->u.dst.dev; - if (call_out_firewall(PF_INET, dev, iph, NULL,&skb) < FW_ACCEPT) { - kfree_skb(skb, FREE_WRITE); - return; - } - + if (call_out_firewall(PF_INET, dev, iph, NULL,&skb) < FW_ACCEPT) + goto drop; + #ifdef CONFIG_NET_SECURITY /* * Add an IP checksum (must do this before SECurity because @@ -409,11 +380,8 @@ after_check_route: ip_send_check(iph); if (call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 4, &skb)<FW_ACCEPT) - { - kfree_skb(skb, FREE_WRITE); - return; - } - + goto drop; + iph = skb->nh.iph; /* don't update tot_len, as the dev->mtu is already decreased */ #endif @@ -426,16 +394,13 @@ after_check_route: * and if (uh...) TCP had segments queued on this route... */ skb2 = skb_realloc_headroom(skb, (dev->hard_header_len+15)&~15); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if (skb2 == NULL) return; skb = skb2; iph = skb->nh.iph; } - ip_ll_header(skb); - - /* * Do we need to fragment. Again this is inefficient. * We need to somehow lock the original buffer and use @@ -445,52 +410,35 @@ after_check_route: if (tot_len > rt->u.dst.pmtu) goto fragment; +#ifndef CONFIG_NET_SECURITY /* * Add an IP checksum */ ip_send_check(iph); +#endif if (sk) skb->priority = sk->priority; skb->dst->output(skb); return; -check_route: - /* Ugly... ugly... but what can I do? - - Essentially it is "ip_reroute_output" function. 
--ANK - */ - { - struct rtable *nrt; - if (ip_route_output(&nrt, rt->key.dst, rt->key.src, rt->key.tos, sk?sk->bound_dev_if:0)) { - kfree_skb(skb, 0); - return; - } - skb->dst = &nrt->u.dst; - ip_rt_put(rt); - rt = nrt; - } - goto after_check_route; - fragment: if ((iph->frag_off & htons(IP_DF))) { printk(KERN_DEBUG "sending pkt_too_big to self\n"); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(rt->u.dst.pmtu)); - - kfree_skb(skb, FREE_WRITE); - return; + goto drop; } - ip_fragment(skb, 1, skb->dst->output); - + ip_fragment(skb, skb->dst->output); + return; +drop: + kfree_skb(skb); } - - /* * Build and send a packet, with as little as one copy * @@ -509,7 +457,6 @@ fragment: * the source IP address (may depend on the routing table), the * destination address (char *), the offset to copy from, and the * length to be copied. - * */ int ip_build_xmit(struct sock *sk, @@ -518,7 +465,7 @@ int ip_build_xmit(struct sock *sk, unsigned int, unsigned int), const void *frag, - unsigned short length, + unsigned length, struct ipcm_cookie *ipc, struct rtable *rt, int flags) @@ -528,7 +475,7 @@ int ip_build_xmit(struct sock *sk, int offset, mf; unsigned short id; struct iphdr *iph; - int hh_len = rt->u.dst.dev->hard_header_len; + int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; int nfrags=0; struct ip_options *opt = ipc->opt; int df = htons(IP_DF); @@ -537,7 +484,7 @@ int ip_build_xmit(struct sock *sk, #endif if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || - rt->rt_flags&RTCF_NOPMTUDISC) + (rt->u.dst.mxlock&(1<<RTAX_MTU))) df = 0; @@ -551,7 +498,7 @@ int ip_build_xmit(struct sock *sk, if (length <= rt->u.dst.pmtu && opt == NULL) { int error; - struct sk_buff *skb=sock_alloc_send_skb(sk, length+15+hh_len, + struct sk_buff *skb=sock_alloc_send_skb(sk, length+hh_len+15, 0, flags&MSG_DONTWAIT, &error); if(skb==NULL) { ip_statistics.IpOutDiscards++; @@ -561,8 +508,7 @@ int ip_build_xmit(struct sock *sk, skb->when=jiffies; skb->priority = sk->priority; skb->dst = 
dst_clone(&rt->u.dst); - - ip_ll_header_reserve(skb); + skb_reserve(skb, hh_len); skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); @@ -592,12 +538,12 @@ int ip_build_xmit(struct sock *sk, if (err) err = -EFAULT; - if(!err && call_out_firewall(PF_INET, skb->dev, iph, NULL, &skb) < FW_ACCEPT) + if(!err && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) err = -EPERM; #ifdef CONFIG_NET_SECURITY if ((fw_res=call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 5, &skb))<FW_ACCEPT) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if (fw_res != FW_QUEUE) return -EPERM; else @@ -607,7 +553,7 @@ int ip_build_xmit(struct sock *sk, if (err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return err; } @@ -618,12 +564,10 @@ int ip_build_xmit(struct sock *sk, length -= sizeof(struct iphdr); if (opt) { - fragheaderlen = hh_len + sizeof(struct iphdr) + opt->optlen; + fragheaderlen = sizeof(struct iphdr) + opt->optlen; maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen; } else { - fragheaderlen = hh_len; - if(!sk->ip_hdrincl) - fragheaderlen += sizeof(struct iphdr); + fragheaderlen = sk->ip_hdrincl ? 0 : sizeof(struct iphdr); /* * Fragheaderlen is the size of 'overhead' on each buffer. Now work @@ -633,6 +577,9 @@ int ip_build_xmit(struct sock *sk, maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)) & ~7) + fragheaderlen; } + if (length + fragheaderlen > 0xFFFF) + return -EMSGSIZE; + /* * Start at the end of the frame by handling the remainder. */ @@ -658,11 +605,12 @@ int ip_build_xmit(struct sock *sk, mf = 0; /* - * Can't fragment raw packets + * Don't fragment packets for path mtu discovery. */ - if (offset > 0 && df) + if (offset > 0 && df) { return(-EMSGSIZE); + } /* * Lock the device lists. @@ -689,7 +637,7 @@ int ip_build_xmit(struct sock *sk, * Get the memory we require with some space left for alignment. 
*/ - skb = sock_alloc_send_skb(sk, fraglen+15, 0, flags&MSG_DONTWAIT, &error); + skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &error); if (skb == NULL) { ip_statistics.IpOutDiscards++; if(nfrags>1) @@ -705,14 +653,13 @@ int ip_build_xmit(struct sock *sk, skb->when = jiffies; skb->priority = sk->priority; skb->dst = dst_clone(&rt->u.dst); - - ip_ll_header_reserve(skb); + skb_reserve(skb, hh_len); /* * Find where to start putting bytes. */ - data = skb_put(skb, fraglen-hh_len); + data = skb_put(skb, fraglen); skb->nh.iph = iph = (struct iphdr *)data; /* @@ -762,7 +709,7 @@ int ip_build_xmit(struct sock *sk, * Account for the fragment. */ - if(!err && !offset && call_out_firewall(PF_INET, skb->dev, iph, NULL, &skb) < FW_ACCEPT) + if(!err && !offset && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) err = -EPERM; #ifdef CONFIG_NET_SECURITY if ((fw_res=call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 6, &skb))<FW_ACCEPT) @@ -773,7 +720,7 @@ int ip_build_xmit(struct sock *sk, #endif if (err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); dev_unlock_list(); return err; } @@ -800,17 +747,14 @@ int ip_build_xmit(struct sock *sk, /* * This IP datagram is too large to be sent in one piece. Break it up into - * smaller pieces (each of size equal to the MAC header plus IP header plus + * smaller pieces (each of size equal to IP header plus * a block of the data of the original IP data part) that will yet fit in a * single device frame, and queue such a frame for sending. * - * Assumption: packet was ready for transmission, link layer header - * is already in. - * * Yes this is inefficient, feel free to submit a quicker one. 
*/ - -void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*)) + +void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) { struct iphdr *iph; unsigned char *raw; @@ -823,14 +767,14 @@ void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*)) u16 dont_fragment; struct rtable *rt = (struct rtable*)skb->dst; - dev = skb->dev; + dev = rt->u.dst.dev; /* * Point into the IP datagram header. */ - raw = skb->data; - iph = skb->nh.iph; + raw = skb->nh.raw; + iph = (struct iphdr*)raw; /* * Setup starting values. @@ -838,11 +782,7 @@ void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*)) hlen = iph->ihl * 4; left = ntohs(iph->tot_len) - hlen; /* Space per frame */ - hlen += skb->nh.raw - raw; - if (local) - mtu = rt->u.dst.pmtu - hlen; /* Size of data space */ - else - mtu = dev->mtu - hlen; + mtu = rt->u.dst.pmtu - hlen; /* Size of data space */ ptr = raw + hlen; /* Where to start from */ /* @@ -853,7 +793,7 @@ void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*)) if (mtu<8) { ip_statistics.IpFragFails++; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } @@ -891,10 +831,10 @@ void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*)) * Allocate buffer. 
*/ - if ((skb2 = alloc_skb(len+hlen+15,GFP_ATOMIC)) == NULL) { + if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) { NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); ip_statistics.IpFragFails++; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } @@ -902,15 +842,13 @@ void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*)) * Set up data on packet */ - skb2->arp = skb->arp; - skb2->dev = skb->dev; skb2->when = skb->when; skb2->pkt_type = skb->pkt_type; skb2->priority = skb->priority; + skb_reserve(skb2, (dev->hard_header_len+15)&~15); skb_put(skb2, len + hlen); - skb2->mac.raw = (char *) skb2->data; - skb2->nh.raw = skb2->mac.raw + dev->hard_header_len; - skb2->h.raw = skb2->mac.raw + hlen; + skb2->nh.raw = skb2->data; + skb2->h.raw = skb2->data + hlen; /* * Charge the memory for the fragment to any owner @@ -925,7 +863,7 @@ void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*)) * Copy the packet header into the new buffer. */ - memcpy(skb2->mac.raw, raw, hlen); + memcpy(skb2->nh.raw, raw, hlen); /* * Copy a block of the IP datagram. @@ -963,13 +901,13 @@ void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*)) ip_statistics.IpFragCreates++; - iph->tot_len = htons(len + hlen - dev->hard_header_len); + iph->tot_len = htons(len + hlen); ip_send_check(iph); output(skb2); } - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); ip_statistics.IpFragOKs++; } @@ -1008,8 +946,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) reply->priority = skb->priority; reply->dst = &rt->u.dst; - - ip_ll_header_reserve(reply); + skb_reserve(reply, (rt->u.dst.dev->hard_header_len+15)&~15); /* * Now build the IP header. diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2fd2b16ab..a500a72e5 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. 
* - * Version: $Id: ip_sockglue.c,v 1.3 1997/12/16 05:37:41 ralf Exp $ + * Version: $Id: ip_sockglue.c,v 1.4 1998/03/03 01:23:41 ralf Exp $ * * Authors: see ip.c * @@ -14,6 +14,7 @@ * Martin Mares : TOS setting fixed. * Alan Cox : Fixed a couple of oopses in Martin's * TOS tweaks. + * Mike McLagan : Routing by source */ #include <linux/config.h> @@ -32,7 +33,6 @@ #include <linux/igmp.h> #include <linux/firewall.h> #include <linux/ip_fw.h> -#include <net/checksum.h> #include <linux/route.h> #include <linux/mroute.h> #include <net/route.h> @@ -314,14 +314,9 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && !suser()) return -EPERM; if (sk->ip_tos != val) { - start_bh_atomic(); sk->ip_tos=val; sk->priority = rt_tos2priority(val); - if (sk->dst_cache) { - dst_release(sk->dst_cache); - sk->dst_cache = NULL; - } - end_bh_atomic(); + dst_release(xchg(&sk->dst_cache, NULL)); } sk->priority = rt_tos2priority(val); return 0; @@ -352,7 +347,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt struct sk_buff *skb; /* Drain queued errors */ while((skb=skb_dequeue(&sk->error_queue))!=NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } sk->ip_recverr = val?1:0; release_sock(sk); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 30df2360d..20521e643 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,5 +1,5 @@ /* - * $Id: ipconfig.c,v 1.5 1997/10/27 16:08:02 mj Exp $ + * $Id: ipconfig.c,v 1.6 1998/01/09 17:19:46 mj Exp $ * * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied * information to configure own IP address and routes. @@ -350,7 +350,7 @@ ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)) /* And throw the packet out... 
*/ drop: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -868,6 +868,9 @@ __initfunc(static void ic_bootp_recv(void)) } } } + + if (ic_gateway == INADDR_NONE && b->relay_ip) + ic_gateway = b->relay_ip; } #endif diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 565116ffc..949661f41 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,7 +1,7 @@ /* * Linux NET3: IP/IP protocol decoder. * - * Version: $Id: ipip.c,v 1.19 1997/11/08 17:50:21 kuznet Exp $ + * Version: $Id: ipip.c,v 1.4 1997/12/16 05:37:42 ralf Exp $ * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 @@ -93,7 +93,6 @@ */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> @@ -384,7 +383,7 @@ void ipip_err(struct sk_buff *skb, unsigned char *dp, int len) /* Try to guess incoming interface */ if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; } skb2->dev = rt->u.dst.dev; @@ -396,14 +395,14 @@ void ipip_err(struct sk_buff *skb, unsigned char *dp, int len) if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; } } else { ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || skb2->dst->dev->type != ARPHRD_IPGRE) { - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; } } @@ -411,7 +410,7 @@ void ipip_err(struct sk_buff *skb, unsigned char *dp, int len) /* change mtu on this route */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { if (rel_info > skb2->dst->pmtu) { - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; } skb2->dst->pmtu = rel_info; @@ -425,7 +424,7 @@ void ipip_err(struct sk_buff *skb, unsigned char *dp, int len) } icmp_send(skb2, rel_type, rel_code, rel_info); - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; #endif } @@ -454,7 +453,7 @@ int 
ipip_rcv(struct sk_buff *skb, unsigned short len) } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -531,7 +530,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev) if (tunnel->err_count > 0) { if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { tunnel->err_count--; - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + dst_link_failure(skb); } else tunnel->err_count = 0; } @@ -548,11 +547,11 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev) if (!new_skb) { ip_rt_put(rt); stats->tx_dropped++; - dev_kfree_skb(skb, FREE_WRITE); + dev_kfree_skb(skb); tunnel->recursion--; return 0; } - dev_kfree_skb(skb, FREE_WRITE); + dev_kfree_skb(skb); skb = new_skb; } @@ -588,10 +587,10 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev) return 0; tx_error_icmp: - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + dst_link_failure(skb); tx_error: stats->tx_errors++; - dev_kfree_skb(skb, FREE_WRITE); + dev_kfree_skb(skb); tunnel->recursion--; return 0; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9909f32b0..d3c07dca3 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,12 +9,13 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.28 1997/10/30 00:43:16 davem Exp $ + * Version: $Id: ipmr.c,v 1.29 1997/12/13 21:52:55 kuznet Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. * Alan Cox : Added the cache manager code * Alan Cox : Fixed the clone/copy bug and device race. + * Mike McLagan : Routing by source * Malcolm Beattie : Buffer handling fixes. * Alexey Kuznetsov : Double buffer free and other fixes. * SVR Anand : Fixed several multicast bugs and problems. 
@@ -113,6 +114,7 @@ struct device *ipmr_new_tunnel(struct vifctl *v) in_dev = dev->ip_ptr; if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL) goto failure; + in_dev->cnf.rp_filter = 0; if (dev_open(dev)) goto failure; @@ -135,7 +137,7 @@ static struct device * reg_dev; static int reg_vif_xmit(struct sk_buff *skb, struct device *dev) { ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -176,10 +178,13 @@ struct device *ipmr_reg_vif(struct vifctl *v) kfree(dev); return NULL; } + dev->iflink = 0; if ((in_dev = inetdev_init(dev)) == NULL) goto failure; + in_dev->cnf.rp_filter = 0; + if (dev_open(dev)) goto failure; @@ -215,7 +220,7 @@ static int vif_delete(int vifi) vifc_map &= ~(1<<vifi); if ((in_dev = dev->ip_ptr) != NULL) - in_dev->flags &= ~IFF_IP_MFORWARD; + in_dev->cnf.mc_forwarding = 0; dev_set_allmulti(dev, -1); ip_rt_multicast_event(in_dev); @@ -319,7 +324,7 @@ static void ipmr_cache_delete(struct mfc_cache *cache) netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); } else #endif - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } } kfree_s(cache,sizeof(cache)); @@ -503,7 +508,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) { if (net_ratelimit()) printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } return ret; @@ -522,7 +527,7 @@ static int ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk */ if(cache_resolve_queue_len>=10 || (cache=ipmr_cache_alloc(GFP_ATOMIC))==NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -ENOBUFS; } /* @@ -555,7 +560,7 @@ static int ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk */ if (ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE)<0) { ipmr_cache_delete(cache); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -ENOBUFS; } } @@ -565,7 +570,7 @@ 
static int ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk */ if(cache->mfc_queuelen>3) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -ENOBUFS; } cache->mfc_queuelen++; @@ -651,7 +656,7 @@ int ipmr_mfc_modify(int action, struct mfcctl *mfc) static void mrtsock_destruct(struct sock *sk) { if (sk == mroute_socket) { - ipv4_config.multicast_route = 0; + ipv4_devconf.mc_forwarding = 0; mroute_socket=NULL; mroute_close(sk); } @@ -692,7 +697,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) if(mroute_socket) return -EADDRINUSE; mroute_socket=sk; - ipv4_config.multicast_route = 1; + ipv4_devconf.mc_forwarding = 1; if (ip_ra_control(sk, 1, mrtsock_destruct) == 0) return 0; mrtsock_destruct(sk); @@ -753,9 +758,9 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) if ((in_dev = dev->ip_ptr) == NULL) return -EADDRNOTAVAIL; - if (in_dev->flags & IFF_IP_MFORWARD) + if (in_dev->cnf.mc_forwarding) return -EADDRINUSE; - in_dev->flags |= IFF_IP_MFORWARD; + in_dev->cnf.mc_forwarding = 1; dev_set_allmulti(dev, +1); ip_rt_multicast_event(in_dev); @@ -924,7 +929,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) } return -EADDRNOTAVAIL; default: - return -EINVAL; + return -ENOIOCTLCMD; } } @@ -1095,7 +1100,6 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. 
*/ - ip_ll_header(skb2); skb2->dst->output(skb2); } @@ -1176,7 +1180,7 @@ int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) dont_forward: if (!local) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -1234,7 +1238,7 @@ int ip_mr_input(struct sk_buff *skb) ipmr_cache_unresolved(cache, vif, skb); return -EAGAIN; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -1247,7 +1251,7 @@ int ip_mr_input(struct sk_buff *skb) dont_forward: if (local) return ip_local_deliver(skb); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -1265,7 +1269,7 @@ int pim_rcv_v1(struct sk_buff * skb, unsigned short len) len < sizeof(*pim) + sizeof(*encap) || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER || reg_dev == NULL) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -EINVAL; } @@ -1279,9 +1283,10 @@ int pim_rcv_v1(struct sk_buff * skb, unsigned short len) if (!MULTICAST(encap->daddr) || ntohs(encap->tot_len) == 0 || ntohs(encap->tot_len) + sizeof(*pim) > len) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -EINVAL; } + skb->mac.raw = skb->nh.raw; skb_pull(skb, (u8*)encap - skb->data); skb->nh.iph = (struct iphdr *)skb->data; skb->dev = reg_dev; @@ -1309,7 +1314,7 @@ int pim_rcv(struct sk_buff * skb, unsigned short len) (pim->flags&PIM_NULL_REGISTER) || reg_dev == NULL || ip_compute_csum((void *)pim, len)) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -EINVAL; } @@ -1318,9 +1323,10 @@ int pim_rcv(struct sk_buff * skb, unsigned short len) if (!MULTICAST(encap->daddr) || ntohs(encap->tot_len) == 0 || ntohs(encap->tot_len) + sizeof(*pim) > len) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -EINVAL; } + skb->mac.raw = skb->nh.raw; skb_pull(skb, (u8*)encap - skb->data); skb->nh.iph = (struct iphdr *)skb->data; skb->dev = reg_dev; @@ -1346,11 +1352,20 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) struct rtnexthop *nhp; struct device *dev = 
vif_table[c->mfc_parent].dev; +#ifdef CONFIG_RTNL_OLD_IFINFO if (dev) { u8 *o = skb->tail; RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); rtm->rtm_optlen += skb->tail - o; } +#else + struct rtattr *mp_head; + + if (dev) + RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + + mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0)); +#endif for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) { if (c->mfc_ttls[ct] < 255) { @@ -1361,9 +1376,15 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) nhp->rtnh_hops = c->mfc_ttls[ct]; nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); +#ifdef CONFIG_RTNL_OLD_IFINFO rtm->rtm_nhs++; +#endif } } +#ifndef CONFIG_RTNL_OLD_IFINFO + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; +#endif rtm->rtm_type = RTN_MULTICAST; return 1; diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c index f7ab4ddc3..9e944495f 100644 --- a/net/ipv4/rarp.c +++ b/net/ipv4/rarp.c @@ -3,7 +3,7 @@ * Copyright (C) 1994 by Ross Martin * Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche * - * $Id: rarp.c,v 1.21 1997/10/27 09:13:16 geert Exp $ + * $Id: rarp.c,v 1.3 1997/12/16 05:37:44 ralf Exp $ * * This module implements the Reverse Address Resolution Protocol * (RARP, RFC 903), which is used to convert low level addresses such @@ -30,6 +30,7 @@ * Fixes * Alan Cox : Rarp delete on device down needed as * reported by Walter Wolfgang. + * Mike McLagan : Routing by source * */ @@ -190,6 +191,8 @@ static void rarp_init_pkt (void) rarp_pkt_inited=1; } +#ifdef MODULE + static void rarp_end_pkt(void) { if(!rarp_pkt_inited) @@ -199,6 +202,7 @@ static void rarp_end_pkt(void) rarp_pkt_inited=0; } +#endif /* * Receive an arp request by the device layer. 
Maybe it should be @@ -225,7 +229,7 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd) || dev->flags&IFF_NOARP || !in_dev || !in_dev->ifa_list) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -234,7 +238,7 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type */ if (rarp->ar_op != htons(ARPOP_RREQUEST)) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -252,7 +256,7 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type /* * This packet is not for us. Remove it. */ - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -284,7 +288,7 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type dev->dev_addr, sha); } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2f4de9fbd..b3644f10d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.32 1997/10/24 17:16:00 kuznet Exp $ + * Version: $Id: raw.c,v 1.3 1997/12/16 05:37:44 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -155,7 +155,7 @@ void raw_err (struct sock *sk, struct sk_buff *skb) if (sk->ip_recverr && !sk->sock_readers) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 && sock_queue_err_skb(sk, skb2)) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { @@ -173,7 +173,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) if (__sock_queue_rcv_skb(sk,skb)<0) { ip_statistics.IpInDiscards++; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -1; } @@ -255,13 +255,24 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) { struct ipcm_cookie ipc; struct rawfakehdr rfh; - struct rtable *rt; + struct rtable *rt = NULL; int free = 0; u32 daddr; u8 tos; int err; - if (len>65535) + /* This check is ONLY to check for arithmetic overflow + on integer(!) len. Not more! Real check will be made + in ip_build_xmit --ANK + + BTW socket.c -> af_*.c -> ... make multiple + invalid conversions size_t -> int. We MUST repair it f.e. + by replacing all of them with size_t and revise all + the places sort of len += sizeof(struct iphdr) + If len was ULONG_MAX-10 it would be cathastrophe --ANK + */ + + if (len < 0 || len > 0xFFFF) return -EMSGSIZE; /* @@ -308,10 +319,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) int tmp = ip_cmsg_send(msg, &ipc); if (tmp) return tmp; - if (ipc.opt && sk->ip_hdrincl) { - kfree(ipc.opt); - return -EINVAL; - } if (ipc.opt) free=1; } @@ -321,12 +328,23 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) if (!ipc.opt) ipc.opt = sk->opt; - if (ipc.opt && ipc.opt->srr) { - if (!daddr) - return -EINVAL; - daddr = ipc.opt->faddr; + + if (ipc.opt) { + err = -EINVAL; + /* Linux does not mangle headers on raw sockets, + * so that IP options + IP_HDRINCL is non-sense. 
+ */ + if (sk->ip_hdrincl) + goto done; + if (ipc.opt->srr) { + if (!daddr) + goto done; + daddr = ipc.opt->faddr; + } } - tos = RT_TOS(sk->ip_tos) | (sk->localroute || (msg->msg_flags&MSG_DONTROUTE)); + tos = RT_TOS(sk->ip_tos) | sk->localroute; + if (msg->msg_flags&MSG_DONTROUTE) + tos |= RTO_ONLINK; if (MULTICAST(daddr)) { if (!ipc.oif) @@ -337,30 +355,21 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif); - if (err) { - if (free) kfree(ipc.opt); - return err; - } + if (err) + goto done; - if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) { - if (free) kfree(ipc.opt); - ip_rt_put(rt); - return -EACCES; - } + err = -EACCES; + if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) + goto done; rfh.iov = msg->msg_iov; rfh.saddr = rt->rt_src; if (!ipc.addr) ipc.addr = rt->rt_dst; - if(sk->ip_hdrincl) - err=ip_build_xmit(sk, raw_getrawfrag, &rfh, len, &ipc, rt, msg->msg_flags); - else { - if (len>65535-sizeof(struct iphdr)) - err = -EMSGSIZE; - else - err=ip_build_xmit(sk, raw_getfrag, &rfh, len, &ipc, rt, msg->msg_flags); - } + err=ip_build_xmit(sk, sk->ip_hdrincl ? raw_getrawfrag : raw_getfrag, + &rfh, len, &ipc, rt, msg->msg_flags); +done: if (free) kfree(ipc.opt); ip_rt_put(rt); @@ -396,8 +405,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) sk->saddr = 0; /* Use device */ - dst_release(sk->dst_cache); - sk->dst_cache = NULL; + dst_release(xchg(&sk->dst_cache, NULL)); return 0; } @@ -446,6 +454,9 @@ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, } err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + sk->stamp=skb->stamp; /* Copy the address. 
*/ @@ -455,8 +466,9 @@ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, } if (sk->ip_cmsg_flags) ip_cmsg_recv(msg, skb); +done: skb_free_datagram(sk, skb); - return err ? err : (copied); + return (err ? : copied); } static int raw_init(struct sock *sk) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 552b83664..b73c3ed11 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.3 1997/12/16 05:37:45 ralf Exp $ + * Version: $Id: route.c,v 1.4 1998/03/03 01:23:42 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -43,9 +43,11 @@ * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) * Pavel Krauz : Limited broadcast fixed + * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Splitted to fib.c and * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -84,28 +86,60 @@ #include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#define RT_GC_TIMEOUT (300*HZ) + +int ip_rt_min_delay = 2*HZ; +int ip_rt_max_delay = 10*HZ; +int ip_rt_gc_thresh = RT_HASH_DIVISOR; +int ip_rt_max_size = RT_HASH_DIVISOR*16; +int ip_rt_gc_timeout = RT_GC_TIMEOUT; +int ip_rt_gc_interval = 60*HZ; +int ip_rt_gc_min_interval = 5*HZ; +int ip_rt_redirect_number = 9; +int ip_rt_redirect_load = HZ/50; +int ip_rt_redirect_silence = ((HZ/50) << (9+1)); +int ip_rt_error_cost = HZ; +int ip_rt_error_burst = 5*HZ; + +static unsigned long rt_deadline = 0; #define RTprint(a...) 
printk(KERN_DEBUG a) +static void rt_run_flush(unsigned long dummy); + static struct timer_list rt_flush_timer = - { NULL, NULL, RT_FLUSH_DELAY, 0L, NULL }; + { NULL, NULL, 0, 0L, rt_run_flush }; +static struct timer_list rt_periodic_timer = + { NULL, NULL, 0, 0L, NULL }; /* * Interface to generic destination cache. */ -static void ipv4_dst_destroy(struct dst_entry * dst); static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32); static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, struct sk_buff *); +static struct dst_entry * ipv4_negative_advice(struct dst_entry *); +static void ipv4_link_failure(struct sk_buff *skb); +static int rt_garbage_collect(void); struct dst_ops ipv4_dst_ops = { AF_INET, + __constant_htons(ETH_P_IP), + RT_HASH_DIVISOR, + + rt_garbage_collect, ipv4_dst_check, ipv4_dst_reroute, - ipv4_dst_destroy + NULL, + ipv4_negative_advice, + ipv4_link_failure, }; __u8 ip_tos2prio[16] = { @@ -131,7 +165,6 @@ __u8 ip_tos2prio[16] = { * Route cache. */ -static atomic_t rt_cache_size = ATOMIC_INIT(0); static struct rtable *rt_hash_table[RT_HASH_DIVISOR]; static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth, u16 protocol); @@ -157,7 +190,7 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt pos = 128; if (offset<128) { - sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst\tHash"); + sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst"); len = 128; } @@ -175,8 +208,7 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt len = 0; continue; } - - sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X\t%02X", + sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", r->u.dst.dev ? 
r->u.dst.dev->name : "*", (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, @@ -188,9 +220,8 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt r->u.dst.window, (int)r->u.dst.rtt, r->key.tos, r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, - r->u.dst.hh ? r->u.dst.hh->hh_uptodate : 0, - r->rt_spec_dst, - i); + r->u.dst.hh ? (r->u.dst.hh->hh_output == ip_acct_output) : 0, + r->rt_spec_dst); sprintf(buffer+len,"%-127s\n",temp); len += 128; if (pos >= offset+length) @@ -209,13 +240,13 @@ done: } #endif -static void __inline__ rt_free(struct rtable *rt) +static __inline__ void rt_free(struct rtable *rt) { dst_free(&rt->u.dst); } -void ip_rt_check_expire() +static void rt_check_expire(unsigned long dummy) { int i; static int rover; @@ -234,9 +265,8 @@ void ip_rt_check_expire() */ if (!atomic_read(&rth->u.dst.use) && - (now - rth->u.dst.lastuse > RT_CACHE_TIMEOUT)) { + (now - rth->u.dst.lastuse > ip_rt_gc_timeout)) { *rthp = rth_next; - atomic_dec(&rt_cache_size); #if RT_CACHE_DEBUG >= 2 printk("rt_check_expire clean %02x@%08x\n", rover, rth->rt_dst); #endif @@ -247,8 +277,8 @@ void ip_rt_check_expire() if (!rth_next) break; - if ( rth_next->u.dst.lastuse - rth->u.dst.lastuse > RT_CACHE_BUBBLE_THRESHOLD || - (rth->u.dst.lastuse - rth_next->u.dst.lastuse < 0 && + if ( (long)(rth_next->u.dst.lastuse - rth->u.dst.lastuse) > RT_CACHE_BUBBLE_THRESHOLD || + ((long)(rth->u.dst.lastuse - rth_next->u.dst.lastuse) < 0 && atomic_read(&rth->u.dst.refcnt) < atomic_read(&rth_next->u.dst.refcnt))) { #if RT_CACHE_DEBUG >= 2 printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst); @@ -262,6 +292,8 @@ void ip_rt_check_expire() rthp = &rth->u.rt_next; } } + rt_periodic_timer.expires = now + ip_rt_gc_interval; + add_timer(&rt_periodic_timer); } static void rt_run_flush(unsigned long dummy) @@ -272,18 +304,11 @@ static void rt_run_flush(unsigned long dummy) for (i=0; i<RT_HASH_DIVISOR; i++) { int nr=0; - 
cli(); - if (!(rth = rt_hash_table[i])) { - sti(); + if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL) continue; - } - - rt_hash_table[i] = NULL; - sti(); for (; rth; rth=next) { next = rth->u.rt_next; - atomic_dec(&rt_cache_size); nr++; rth->u.rt_next = NULL; rt_free(rth); @@ -297,48 +322,57 @@ static void rt_run_flush(unsigned long dummy) void rt_cache_flush(int delay) { + if (delay < 0) + delay = ip_rt_min_delay; + start_bh_atomic(); - if (delay && rt_flush_timer.function && - rt_flush_timer.expires - jiffies < delay) { - end_bh_atomic(); - return; - } - if (rt_flush_timer.function) { - del_timer(&rt_flush_timer); - rt_flush_timer.function = NULL; + + if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { + long tmo = (long)(rt_deadline - rt_flush_timer.expires); + + /* If flush timer is already running + and flush request is not immediate (delay > 0): + + if deadline is not achieved, prolongate timer to "dealy", + otherwise fire it at deadline time. + */ + + if (delay > tmo) + delay = tmo; } - if (delay == 0) { + + if (delay <= 0) { + rt_deadline = 0; end_bh_atomic(); + rt_run_flush(0); return; } - rt_flush_timer.function = rt_run_flush; + + if (rt_deadline == 0) + rt_deadline = jiffies + ip_rt_max_delay; + rt_flush_timer.expires = jiffies + delay; add_timer(&rt_flush_timer); end_bh_atomic(); } - -static void rt_garbage_collect(void) +static int rt_garbage_collect(void) { int i; - static unsigned expire = RT_CACHE_TIMEOUT>>1; + static unsigned expire = RT_GC_TIMEOUT>>1; static unsigned long last_gc; struct rtable *rth, **rthp; - unsigned long now; + unsigned long now = jiffies; start_bh_atomic(); - now = jiffies; /* * Garbage collection is pretty expensive, * do not make it too frequently, but just increase expire strength. 
*/ - if (now - last_gc < 1*HZ) { - expire >>= 1; - end_bh_atomic(); - return; - } + if (now - last_gc < ip_rt_gc_min_interval) + goto out; expire++; @@ -349,7 +383,6 @@ static void rt_garbage_collect(void) if (atomic_read(&rth->u.dst.use) || now - rth->u.dst.lastuse < expire) continue; - atomic_dec(&rt_cache_size); *rthp = rth->u.rt_next; rth->u.rt_next = NULL; rt_free(rth); @@ -358,61 +391,15 @@ static void rt_garbage_collect(void) } last_gc = now; - if (atomic_read(&rt_cache_size) < RT_CACHE_MAX_SIZE) - expire = RT_CACHE_TIMEOUT>>1; - else - expire >>= 1; - end_bh_atomic(); -} - -static int rt_ll_bind(struct rtable *rt) -{ - struct neighbour *neigh; - struct hh_cache *hh = NULL; - - if (rt->u.dst.dev && rt->u.dst.dev->hard_header_cache) { - neigh = rt->u.dst.neighbour; - if (!neigh) - neigh = arp_find_neighbour(&rt->u.dst, 1); - - if (neigh) { - rt->u.dst.neighbour = neigh; - for (hh=neigh->hh; hh; hh = hh->hh_next) - if (hh->hh_type == ETH_P_IP) - break; - } + if (atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) + expire = ip_rt_gc_timeout; - if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { -#if RT_CACHE_DEBUG >= 2 - extern atomic_t hh_count; - atomic_inc(&hh_count); -#endif - memset(hh, 0, sizeof(struct hh_cache)); - hh->hh_type = ETH_P_IP; - atomic_set(&hh->hh_refcnt, 0); - hh->hh_next = NULL; - if (rt->u.dst.dev->hard_header_cache(&rt->u.dst, neigh, hh)) { - kfree(hh); -#if RT_CACHE_DEBUG >= 2 - atomic_dec(&hh_count); -#endif - hh = NULL; - } else if (neigh) { - atomic_inc(&hh->hh_refcnt); - hh->hh_next = neigh->hh; - neigh->hh = hh; - } - } - if (hh) { - atomic_inc(&hh->hh_refcnt); - rt->u.dst.hh = hh; - return hh->hh_uptodate; - } - } - return 0; +out: + expire >>= 1; + end_bh_atomic(); + return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size); } - static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 protocol) { struct rtable *rth, **rthp; @@ -444,8 +431,11 @@ static struct rtable *rt_intern_hash(unsigned 
hash, struct rtable * rt, u16 prot rthp = &rth->u.rt_next; } - if (atomic_read(&rt_cache_size) >= RT_CACHE_MAX_SIZE) - rt_garbage_collect(); + /* Try to bind route ro arp only if it is output + route or unicast forwarding path. + */ + if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) + arp_bind_neighbour(&rt->u.dst); rt->u.rt_next = rt_hash_table[hash]; #if RT_CACHE_DEBUG >= 2 @@ -458,10 +448,6 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 prot } #endif rt_hash_table[hash] = rt; - atomic_inc(&rt_cache_size); - - if (protocol == ETH_P_IP) - rt_ll_bind(rt); end_bh_atomic(); return rt; @@ -478,7 +464,10 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, tos &= IPTOS_TOS_MASK; - if (!in_dev || new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) + if (!in_dev) + return; + + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) goto reject_redirect; @@ -534,7 +523,13 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, /* Gateway is different ... */ rt->rt_gateway = new_gw; - if (!rt_ll_bind(rt)) { + /* Redirect received -> path was valid */ + dst_confirm(&rth->u.dst); + + if (!arp_bind_neighbour(&rt->u.dst) || + !(rt->u.dst.neighbour->nud_state&NUD_VALID)) { + if (rt->u.dst.neighbour) + neigh_event_send(rt->u.dst.neighbour, NULL); ip_rt_put(rt); rt_free(rt); break; @@ -552,7 +547,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (ipv4_config.log_martians && net_ratelimit()) + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_INFO "Redirect from %lX/%s to %lX ignored." 
"Path = %lX -> %lX, tos %02x\n", ntohl(old_gw), dev->name, ntohl(new_gw), @@ -560,34 +555,30 @@ reject_redirect: #endif } - -void ip_rt_advice(struct rtable **rp, int advice) +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { - struct rtable *rt; - - if (advice) - return; + struct rtable *rt = (struct rtable*)dst; - start_bh_atomic(); - if ((rt = *rp) != NULL && (rt->rt_flags&RTCF_REDIRECTED)) { + if (rt != NULL) { + if (dst->obsolete || rt->rt_flags&RTCF_REDIRECTED) { #if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos); + printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos); #endif - *rp = NULL; - ip_rt_put(rt); - rt_cache_flush(0); + ip_rt_put(rt); + rt_cache_flush(0); + return NULL; + } } - end_bh_atomic(); - return; + return dst; } /* * Algorithm: - * 1. The first RT_REDIRECT_NUMBER redirects are sent + * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects - * during RT_REDIRECT_SILENCE, we assume that the host + * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting @@ -601,29 +592,30 @@ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = (struct rtable*)skb->dst; - /* No redirected packets during RT_REDIRECT_SILENCE; + /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (jiffies - rt->last_error > RT_REDIRECT_SILENCE) - rt->errors = 0; + if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence) + rt->u.dst.rate_tokens = 0; /* Too many ignored redirects; do not send anything - * set last_error to the last seen redirected packet. 
+ * set u.dst.rate_last to the last seen redirected packet. */ - if (rt->errors >= RT_REDIRECT_NUMBER) { - rt->last_error = jiffies; + if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { + rt->u.dst.rate_last = jiffies; return; } - /* Check for load limit; set last_error to the latest sent + /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (jiffies - rt->last_error > (RT_REDIRECT_LOAD<<rt->errors)) { + if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->last_error = jiffies; - ++rt->errors; + rt->u.dst.rate_last = jiffies; + ++rt->u.dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE - if (ipv4_config.log_martians && rt->errors == RT_REDIRECT_NUMBER && net_ratelimit()) + if (skb->dev->ip_ptr && IN_DEV_LOG_MARTIANS((struct in_device*)skb->dev->ip_ptr) && + rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n", rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway); #endif @@ -633,12 +625,13 @@ void ip_rt_send_redirect(struct sk_buff *skb) static int ip_error(struct sk_buff *skb) { struct rtable *rt = (struct rtable*)skb->dst; + unsigned long now; int code; switch (rt->u.dst.error) { case EINVAL: default: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; case EHOSTUNREACH: code = ICMP_HOST_UNREACH; @@ -650,11 +643,17 @@ static int ip_error(struct sk_buff *skb) code = ICMP_PKT_FILTERED; break; } - if (jiffies - rt->last_error > RT_ERROR_LOAD) { + + now = jiffies; + if ((rt->u.dst.rate_tokens += now - rt->u.dst.rate_last) > ip_rt_error_burst) + rt->u.dst.rate_tokens = ip_rt_error_burst; + if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { + rt->u.dst.rate_tokens -= ip_rt_error_cost; icmp_send(skb, ICMP_DEST_UNREACH, code, 0); - rt->last_error = jiffies; + rt->u.dst.rate_last = now; } - kfree_skb(skb, FREE_READ); + + kfree_skb(skb); return 0; } @@ -699,7 
+698,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) rth->rt_src == iph->saddr && rth->key.tos == tos && rth->key.iif == 0 && - !(rth->rt_flags&RTCF_NOPMTUDISC)) { + !(rth->u.dst.mxlock&(1<<RTAX_MTU))) { unsigned short mtu = new_mtu; if (new_mtu < 68 || new_mtu >= old_mtu) { @@ -712,6 +711,9 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) mtu = guess_mtu(old_mtu); } if (mtu < rth->u.dst.pmtu) { + /* New mtu received -> path was valid */ + dst_confirm(&rth->u.dst); + rth->u.dst.pmtu = mtu; est_mtu = mtu; } @@ -721,23 +723,9 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) return est_mtu; } - -static void ipv4_dst_destroy(struct dst_entry * dst) -{ - struct rtable * rt = (struct rtable*)dst; - struct hh_cache * hh = rt->u.dst.hh; - rt->u.dst.hh = NULL; - if (hh && atomic_dec_and_test(&hh->hh_refcnt)) { -#if RT_CACHE_DEBUG >= 2 - extern atomic_t hh_count; - atomic_dec(&hh_count); -#endif - kfree(hh); - } -} - static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie) { + dst_release(dst); return NULL; } @@ -747,11 +735,16 @@ static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, return NULL; } +static void ipv4_link_failure(struct sk_buff *skb) +{ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); +} + static int ip_rt_bug(struct sk_buff *skb) { printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr, skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?"); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -965,9 +958,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, if (skb->protocol != __constant_htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not make route for invalid - * destination or if it is redirected. + * destination AND it is not translated destination. 
*/ - if (out_dev == in_dev && flags&RTCF_DOREDIRECT) + if (out_dev == in_dev && !(flags&RTCF_DNAT)) return -EINVAL; } @@ -1000,7 +993,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.pmtu = res.fi->fib_mtu ? : out_dev->dev->mtu; rth->u.dst.window=res.fi->fib_window ? : 0; rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; - rth->u.dst.rate_last = rth->u.dst.rate_tokens = 0; +#ifndef CONFIG_RTNL_OLD_IFINFO + rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1]; +#endif if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) rth->rt_gateway = FIB_RES_GW(res); @@ -1008,6 +1003,17 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, rth->rt_flags = flags; rth->rt_type = res.type; +#ifdef CONFIG_NET_FASTROUTE + if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) { + struct device *odev = rth->u.dst.dev; + if (odev != dev && + dev->accept_fastpath && + odev->mtu >= dev->mtu && + dev->accept_fastpath(dev, &rth->u.dst) == 0) + rth->rt_flags |= RTCF_FAST; + } +#endif + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol)); return 0; @@ -1069,14 +1075,14 @@ no_route: */ martian_destination: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (ipv4_config.log_martians && net_ratelimit()) + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name); #endif return -EINVAL; martian_source: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (ipv4_config.log_martians && net_ratelimit()) { + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommenadtion, if source is martian, * the only hint is MAC header. @@ -1147,7 +1153,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, * Major route resolver routine. 
*/ -int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) +int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) { struct rt_key key; struct fib_result res; @@ -1155,14 +1161,17 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int o struct rtable *rth; struct device *dev_out = NULL; unsigned hash; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + u32 nochecksrc = (tos & RTO_TPROXY); +#endif - tos &= IPTOS_TOS_MASK|1; + tos &= IPTOS_TOS_MASK|RTO_ONLINK; key.dst = daddr; key.src = saddr; key.tos = tos&IPTOS_TOS_MASK; key.iif = loopback_dev.ifindex; key.oif = oif; - key.scope = (tos&1) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; res.fi = NULL; if (saddr) { @@ -1171,8 +1180,19 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int o /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(saddr); +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* If address is not local, test for transparent proxy flag; + if address is local --- clear the flag. + */ + if (dev_out == NULL) { + if (nochecksrc == 0) + return -EINVAL; + flags |= RTCF_TPROXY; + } +#else if (dev_out == NULL) return -EINVAL; +#endif /* I removed check for oif == dev_out->oif here. It was wrong by three reasons: @@ -1182,7 +1202,11 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int o of another iface. --ANK */ - if (oif == 0 && (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) { + if (oif == 0 && +#ifdef CONFIG_IP_TRANSPARENT_PROXY + dev_out && +#endif + (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_TXINFO. 
@@ -1309,14 +1333,17 @@ make_route: else if (BADCLASS(key.dst) || ZERONET(key.dst)) return -EINVAL; + if (dev_out->flags&IFF_LOOPBACK) + flags |= RTCF_LOCAL; + if (res.type == RTN_BROADCAST) { flags |= RTCF_BROADCAST; - if (!(dev_out->flags&IFF_LOOPBACK) && dev_out->flags&IFF_BROADCAST) + if (dev_out->flags&IFF_BROADCAST) flags |= RTCF_LOCAL; } else if (res.type == RTN_MULTICAST) { - flags |= RTCF_MULTICAST; - if (ip_check_mc(dev_out, daddr)) - flags |= RTCF_LOCAL; + flags |= RTCF_MULTICAST|RTCF_LOCAL; + if (!ip_check_mc(dev_out, daddr)) + flags &= ~RTCF_LOCAL; } rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); @@ -1367,12 +1394,14 @@ make_route: rth->u.dst.pmtu = res.fi->fib_mtu ? : dev_out->mtu; rth->u.dst.window=res.fi->fib_window ? : 0; rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; +#ifndef CONFIG_RTNL_OLD_IFINFO + rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1]; +#endif } else { rth->u.dst.pmtu = dev_out->mtu; rth->u.dst.window=0; rth->u.dst.rtt = TCP_TIMEOUT_INIT; } - rth->u.dst.rate_last = rth->u.dst.rate_tokens = 0; rth->rt_flags = flags; rth->rt_type = res.type; hash = rt_hash_code(daddr, saddr^(oif<<5), tos); @@ -1380,7 +1409,7 @@ make_route: return 0; } -int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) +int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) { unsigned hash; struct rtable *rth; @@ -1393,7 +1422,13 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) rth->key.src == saddr && rth->key.iif == 0 && rth->key.oif == oif && - rth->key.tos == tos) { +#ifndef CONFIG_IP_TRANSPARENT_PROXY + rth->key.tos == tos +#else + !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) && + ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY)) +#endif + ) { rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); atomic_inc(&rth->u.dst.refcnt); @@ -1411,14 +1446,20 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) int 
inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { - struct kern_rta *rta = arg; + struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); struct rtable *rt = NULL; u32 dst = 0; u32 src = 0; + int iif = 0; int err; struct sk_buff *skb; - u8 *o; + struct rta_cacheinfo ci; +#ifdef CONFIG_RTNL_OLD_IFINFO + unsigned char *o; +#else + struct rtattr *mx; +#endif skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) @@ -1430,14 +1471,16 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) skb->mac.raw = skb->data; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); - if (rta->rta_dst) - memcpy(&dst, rta->rta_dst, 4); - if (rta->rta_src) - memcpy(&src, rta->rta_src, 4); + if (rta[RTA_SRC-1]) + memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4); + if (rta[RTA_DST-1]) + memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4); + if (rta[RTA_IIF-1]) + memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); - if (rta->rta_iif) { + if (iif) { struct device *dev; - dev = dev_get_by_index(*rta->rta_iif); + dev = dev_get_by_index(iif); if (!dev) return -ENODEV; skb->protocol = __constant_htons(ETH_P_IP); @@ -1449,11 +1492,13 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) if (!err && rt->u.dst.error) err = rt->u.dst.error; } else { - err = ip_route_output(&rt, dst, src, rtm->rtm_tos, - rta->rta_oif ? 
*rta->rta_oif : 0); + int oif = 0; + if (rta[RTA_OIF-1]) + memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); + err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif); } if (err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return err; } @@ -1474,23 +1519,47 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = RTPROT_UNSPEC; rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; +#ifdef CONFIG_RTNL_OLD_IFINFO rtm->rtm_nhs = 0; o = skb->tail; +#endif RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src); if (rt->u.dst.dev) RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); if (rt->rt_dst != rt->rt_gateway) RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); +#ifdef CONFIG_RTNL_OLD_IFINFO RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); +#else + mx = (struct rtattr*)skb->tail; + RTA_PUT(skb, RTA_METRICS, 0, NULL); + if (rt->u.dst.mxlock) + RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock); + if (rt->u.dst.pmtu) + RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + if (rt->u.dst.window) + RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window); + if (rt->u.dst.rtt) + RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); + mx->rta_len = skb->tail - (u8*)mx; +#endif RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + ci.rta_lastuse = jiffies - rt->u.dst.lastuse; + ci.rta_used = atomic_read(&rt->u.dst.refcnt); + ci.rta_clntref = atomic_read(&rt->u.dst.use); + ci.rta_expires = 0; + ci.rta_error = rt->u.dst.error; + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); +#ifdef CONFIG_RTNL_OLD_IFINFO rtm->rtm_optlen = skb->tail - o; - if (rta->rta_iif) { +#endif + if (iif) { #ifdef CONFIG_IP_MROUTE - if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_config.multicast_route) { + if (MULTICAST(dst) && 
!LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) { NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid; err = ipmr_get_route(skb, rtm); if (err <= 0) @@ -1498,8 +1567,10 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) } else #endif { - RTA_PUT(skb, RTA_IIF, 4, rta->rta_iif); + RTA_PUT(skb, RTA_IIF, sizeof(int), &iif); +#ifdef CONFIG_RTNL_OLD_IFINFO rtm->rtm_optlen = skb->tail - o; +#endif } } nlh->nlmsg_len = skb->tail - (u8*)nlh; @@ -1510,7 +1581,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) nlmsg_failure: rtattr_failure: - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -EMSGSIZE; } @@ -1518,13 +1589,82 @@ rtattr_failure: void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(1*HZ); + rt_cache_flush(0); } + + +#ifdef CONFIG_SYSCTL + +static int flush_delay; + +static +int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp); + rt_cache_flush(flush_delay); + return 0; + } else + return -EINVAL; +} + +ctl_table ipv4_route_table[] = { + {NET_IPV4_ROUTE_FLUSH, "flush", + &flush_delay, sizeof(int), 0644, NULL, + &ipv4_sysctl_rtcache_flush}, + {NET_IPV4_ROUTE_MIN_DELAY, "min_delay", + &ip_rt_min_delay, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_IPV4_ROUTE_MAX_DELAY, "max_delay", + &ip_rt_max_delay, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh", + &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_MAX_SIZE, "max_size", + &ip_rt_max_size, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval", + &ip_rt_gc_min_interval, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout", + &ip_rt_gc_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval", + 
&ip_rt_gc_interval, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load", + &ip_rt_redirect_load, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number", + &ip_rt_redirect_number, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence", + &ip_rt_redirect_silence, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_ERROR_COST, "error_cost", + &ip_rt_error_cost, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_ERROR_BURST, "error_burst", + &ip_rt_error_burst, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0} +}; +#endif + __initfunc(void ip_rt_init(void)) { devinet_init(); ip_fib_init(); + rt_periodic_timer.function = rt_check_expire; + /* All the timers, started at system startup tend + to synchronize. Perturb it a bit. + */ + rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval + + ip_rt_gc_interval; + add_timer(&rt_periodic_timer); #ifdef CONFIG_PROC_FS proc_net_register(&(struct proc_dir_entry) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d3e018be8..7d119716e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -92,7 +92,7 @@ found: return isn; } -/* This value should be dependant on TCP_TIMEOUT_INIT and +/* This value should be dependent on TCP_TIMEOUT_INIT and * sysctl_tcp_retries1. It's a rather complicated formula * (exponential backoff) to compute at runtime so it's currently hardcoded * here. @@ -203,7 +203,7 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) opt && opt->srr ? 
opt->faddr : req->af.v4_req.rmt_addr, req->af.v4_req.loc_addr, - sk->ip_tos, + sk->ip_tos | RTO_CONN, 0)) { tcp_openreq_free(req); return NULL; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 637f2f933..3a8a7efb4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.5 1997/12/16 05:37:46 ralf Exp $ + * $Id: sysctl_net_ipv4.c,v 1.6 1998/03/03 01:23:42 ralf Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] @@ -28,16 +28,6 @@ static int boolean_min = 0; static int boolean_max = 1; #endif -/* From arp.c */ -extern int sysctl_arp_res_time; -extern int sysctl_arp_dead_res_time; -extern int sysctl_arp_max_tries; -extern int sysctl_arp_timeout; -extern int sysctl_arp_check_interval; -extern int sysctl_arp_confirm_interval; -extern int sysctl_arp_confirm_timeout; -extern int sysctl_arp_max_pings; - /* From icmp.c */ extern int sysctl_icmp_echo_ignore_all; extern int sysctl_icmp_echo_ignore_broadcasts; @@ -64,7 +54,6 @@ extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_max_ka_probes; extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; -extern int sysctl_tcp_max_delay_acks; extern int sysctl_tcp_fin_timeout; extern int sysctl_tcp_syncookies; extern int sysctl_tcp_syn_retries; @@ -84,60 +73,29 @@ int tcp_retr1_max = 255; extern int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, void *buffer, size_t *lenp); -struct ipv4_config ipv4_config = { 1, 1, 1, 0, }; +struct ipv4_config ipv4_config; -#ifdef CONFIG_SYSCTL +extern ctl_table ipv4_route_table[]; -struct ipv4_config ipv4_def_router_config = { 0, 1, 1, 1, 1, 1, 1, }; -struct ipv4_config ipv4_def_host_config = { 1, 1, 1, 0, }; +#ifdef CONFIG_SYSCTL static -int ipv4_sysctl_forwarding(ctl_table *ctl, int write, struct file * filp, - void *buffer, size_t *lenp) +int 
ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) { - int val = IS_ROUTER; + int val = ipv4_devconf.forwarding; int ret; ret = proc_dointvec(ctl, write, filp, buffer, lenp); - if (write && IS_ROUTER != val) { - if (IS_ROUTER) - ipv4_config = ipv4_def_router_config; - else - ipv4_config = ipv4_def_host_config; - rt_cache_flush(0); - } + if (write && ipv4_devconf.forwarding != val) + inet_forward_change(); + return ret; } -static -int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, - void *buffer, size_t *lenp) -{ - if (write) - rt_cache_flush(0); - return 0; -} ctl_table ipv4_table[] = { - {NET_IPV4_ARP_RES_TIME, "arp_res_time", - &sysctl_arp_res_time, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ARP_DEAD_RES_TIME, "arp_dead_res_time", - &sysctl_arp_dead_res_time, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ARP_MAX_TRIES, "arp_max_tries", - &sysctl_arp_max_tries, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ARP_MAX_PINGS, "arp_max_pings", - &sysctl_arp_max_pings, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ARP_TIMEOUT, "arp_timeout", - &sysctl_arp_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ARP_CHECK_INTERVAL, "arp_check_interval", - &sysctl_arp_check_interval, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ARP_CONFIRM_INTERVAL, "arp_confirm_interval", - &sysctl_arp_confirm_interval, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_ARP_CONFIRM_TIMEOUT, "arp_confirm_timeout", - &sysctl_arp_confirm_timeout, sizeof(int), 0644, NULL, - &proc_dointvec}, {NET_IPV4_TCP_HOE_RETRANSMITS, "tcp_hoe_retransmits", &sysctl_tcp_hoe_retransmits, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -156,55 +114,25 @@ ctl_table ipv4_table[] = { {NET_IPV4_TCP_VEGAS_CONG_AVOID, "tcp_vegas_cong_avoid", &sysctl_tcp_cong_avoidance, sizeof(int), 0644, NULL, &tcp_sysctl_congavoid }, - {NET_IPV4_FORWARDING, "ip_forwarding", - &ip_statistics.IpForwarding, 
sizeof(int), 0644, NULL, - &ipv4_sysctl_forwarding}, + {NET_IPV4_FORWARD, "ip_forward", + &ipv4_devconf.forwarding, sizeof(int), 0644, NULL, + &ipv4_sysctl_forward}, {NET_IPV4_DEFAULT_TTL, "ip_default_ttl", &ip_statistics.IpDefaultTTL, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_RFC1812_FILTER, "ip_rfc1812_filter", - &ipv4_config.rfc1812_filter, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_LOG_MARTIANS, "ip_log_martians", - &ipv4_config.log_martians, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_SOURCE_ROUTE, "ip_source_route", - &ipv4_config.source_route, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_SEND_REDIRECTS, "ip_send_redirects", - &ipv4_config.send_redirects, sizeof(int), 0644, NULL, - &proc_dointvec}, {NET_IPV4_AUTOCONFIG, "ip_autoconfig", &ipv4_config.autoconfig, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_BOOTP_RELAY, "ip_bootp_relay", - &ipv4_config.bootp_relay, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_PROXY_ARP, "ip_proxy_arp", - &ipv4_config.proxy_arp, sizeof(int), 0644, NULL, - &proc_dointvec}, {NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc", &ipv4_config.no_pmtu_disc, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ACCEPT_REDIRECTS, "ip_accept_redirects", - &ipv4_config.accept_redirects, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_SECURE_REDIRECTS, "ip_secure_redirects", - &ipv4_config.secure_redirects, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_RFC1620_REDIRECTS, "ip_rfc1620_redirects", - &ipv4_config.rfc1620_redirects, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_RTCACHE_FLUSH, "ip_rtcache_flush", - NULL, sizeof(int), 0644, NULL, - &ipv4_sysctl_rtcache_flush}, {NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries", &sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh", &sysctl_ipfrag_high_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh", 
&sysctl_ipfrag_low_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_IP_DYNADDR, "ip_dynaddr", + {NET_IPV4_DYNADDR, "ip_dynaddr", &sysctl_ip_dynaddr, sizeof(int), 0644, NULL, &proc_dointvec}, #ifdef CONFIG_IP_MASQUERADE {NET_IPV4_IP_MASQ_DEBUG, "ip_masq_debug", @@ -225,8 +153,6 @@ ctl_table ipv4_table[] = { &sysctl_intvec, NULL, NULL, &tcp_retr1_max}, {NET_IPV4_TCP_RETRIES2, "tcp_retries2", &sysctl_tcp_retries2, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_TCP_MAX_DELAY_ACKS, "tcp_max_delay_acks", - &sysctl_tcp_max_delay_acks, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout", &sysctl_tcp_fin_timeout, sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, @@ -259,6 +185,7 @@ ctl_table ipv4_table[] = { &sysctl_icmp_paramprob_time, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ICMP_ECHOREPLY_RATE, "icmp_echoreply_rate", &sysctl_icmp_echoreply_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ROUTE, "route", NULL, 0, 0555, ipv4_route_table}, {0} }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eff309bcf..17ec6def9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.75 1997/10/16 02:57:34 davem Exp $ + * Version: $Id: tcp.c,v 1.77 1998/01/15 22:40:18 freitag Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -196,6 +196,7 @@ * improvement. * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source * Keith Owens : Do proper meging with partial SKB's in * tcp_do_sendmsg to avoid burstiness. 
* Eric Schenk : Fix fast close down bug with @@ -491,9 +492,9 @@ void tcp_time_wait(struct sock *sk) /* - * Walk down the receive queue counting readable data until we hit the - * end or we find a gap in the received data queue (ie a frame missing - * that needs sending to us). + * Walk down the receive queue counting readable data. + * + * Must be called with the socket lock held. */ static int tcp_readable(struct sock *sk) @@ -502,14 +503,11 @@ static int tcp_readable(struct sock *sk) unsigned long amount; struct sk_buff *skb; int sum; - unsigned long flags; SOCK_DEBUG(sk, "tcp_readable: %p - ",sk); - save_flags(flags); - cli(); - if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL) { - restore_flags(flags); + skb = skb_peek(&sk->receive_queue); + if (skb == NULL) { SOCK_DEBUG(sk, "empty\n"); return(0); } @@ -520,7 +518,7 @@ static int tcp_readable(struct sock *sk) /* Do until a push or until we are out of data. */ do { /* Found a hole so stops here. */ - if (before(counted, skb->seq)) + if (before(counted, skb->seq)) /* should not happen */ break; /* Length - header but start from where we are up to @@ -562,7 +560,6 @@ static int tcp_readable(struct sock *sk) skb = skb->next; } while(skb != (struct sk_buff *)&sk->receive_queue); - restore_flags(flags); SOCK_DEBUG(sk, "got %lu bytes.\n",amount); return(amount); } @@ -589,13 +586,13 @@ static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. 
*/ -unsigned int tcp_poll(struct socket *sock, poll_table *wait) +unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) { unsigned int mask; struct sock *sk = sock->sk; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - poll_wait(sk->sleep, wait); + poll_wait(file, sk->sleep, wait); if (sk->state == TCP_LISTEN) return tcp_listen_poll(sk, wait); @@ -604,24 +601,30 @@ unsigned int tcp_poll(struct socket *sock, poll_table *wait) mask = POLLERR; /* Connected? */ if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + int space; + if (sk->shutdown & RCV_SHUTDOWN) mask |= POLLHUP; - + if ((tp->rcv_nxt != sk->copied_seq) && (sk->urg_seq != sk->copied_seq || tp->rcv_nxt != sk->copied_seq+1 || sk->urginline || !sk->urg_data)) mask |= POLLIN | POLLRDNORM; - /* FIXME: this assumed sk->mtu is correctly maintained. - * I see no evidence this is the case. -- erics - */ - if (!(sk->shutdown & SEND_SHUTDOWN) && - (sock_wspace(sk) >= sk->mtu+128+sk->prot->max_header)) +#if 1 /* This needs benchmarking and real world tests */ + space = (sk->dst_cache ? sk->dst_cache->pmtu : sk->mss) + 128; + if (space < 2048) /* XXX */ + space = 2048; +#else /* 2.0 way */ + /* More than half of the socket queue free? */ + space = atomic_read(&sk->wmem_alloc) / 2; +#endif + /* Always wake the user up when an error occured */ + if (sock_wspace(sk) >= space) mask |= POLLOUT | POLLWRNORM; - if (sk->urg_data) - mask |= POLLPRI; + mask |= POLLPRI; } return mask; } @@ -659,53 +662,27 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(amount, (int *)arg); } default: - return(-EINVAL); + return(-ENOIOCTLCMD); }; } - -/* - * This routine builds a generic TCP header. - * It also builds in the RFC1323 Timestamp. - * It can't (unfortunately) do SACK as well. 
- */ - -extern __inline void tcp_build_header(struct tcphdr *th, struct sock *sk, int push) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(sk->write_seq); - th->psh =(push == 0) ? 1 : 0; - th->ack_seq = htonl(tp->rcv_nxt); - th->window = htons(tcp_select_window(sk)); - - /* FIXME: could use the inline found in tcp_output.c as well. - * Probably that means we should move these up to an include file. --erics - */ - if (tp->tstamp_ok) { - __u32 *ptr = (__u32 *)(th+1); - *ptr++ = ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - /* FIXME: Not sure it's worth setting these here already, but I'm - * also not sure we replace them on all paths later. --erics - */ - *ptr++ = jiffies; - *ptr++ = tp->ts_recent; - } -} - /* * Wait for a socket to get into the connected state */ static void wait_for_tcp_connect(struct sock * sk) { + struct task_struct *tsk = current; + struct wait_queue wait = { tsk, NULL }; + + tsk->state = TASK_INTERRUPTIBLE; + add_wait_queue(sk->sleep, &wait); release_sock(sk); - cli(); - if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && - sk->err == 0) - interruptible_sleep_on(sk->sleep); - sti(); + + if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && sk->err == 0) + schedule(); + + tsk->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); lock_sock(sk); } @@ -814,7 +791,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) struct sk_buff *skb; if (err) - return (err); + return -EFAULT; /* Stop on errors. 
*/ if (sk->err) { @@ -932,7 +909,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) */ tmp = tp->af_specific->build_net_header(sk, skb); if (tmp < 0) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if (copied) return(copied); return(tmp); @@ -942,7 +919,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) skb_put(skb,tp->tcp_header_len); seglen -= copy; - tcp_build_header(skb->h.th, sk, seglen || iovlen); + tcp_build_header_data(skb->h.th, sk, seglen || iovlen); /* FIXME: still need to think about SACK options here. */ if (flags & MSG_OOB) { @@ -950,7 +927,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) skb->h.th->urg_ptr = ntohs(copy); } - skb->csum = csum_partial_copy_from_user(from, + skb->csum = csum_and_copy_from_user(from, skb_put(skb, copy), copy, 0, &err); from += copy; @@ -968,7 +945,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) sk->err = 0; if (err) - return (err); + return -EFAULT; return copied; } @@ -1070,14 +1047,15 @@ static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) sk->tp_pinfo.af_tcp.delayed_acks++; __skb_unlink(skb, &sk->receive_queue); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } static void cleanup_rbuf(struct sock *sk) { struct sk_buff *skb; - + struct tcp_opt *tp; + /* NOTE! The socket must be locked, so that we don't get * a messed-up receive queue. */ @@ -1089,11 +1067,12 @@ static void cleanup_rbuf(struct sock *sk) SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk)); + tp = &(sk->tp_pinfo.af_tcp); + /* We send a ACK if the sender is blocked * else let tcp_data deal with the acking policy. 
*/ - if (sk->tp_pinfo.af_tcp.delayed_acks) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (tp->delayed_acks) { __u32 rcv_wnd; /* FIXME: double check this rule, then check against @@ -1457,7 +1436,7 @@ void tcp_close(struct sock *sk, unsigned long timeout) * reader process may not have drained the data yet! */ while((skb=skb_dequeue(&sk->receive_queue))!=NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); /* Timeout is not the same thing - however the code likes * to send both the same way (sigh). @@ -1466,17 +1445,25 @@ void tcp_close(struct sock *sk, unsigned long timeout) tcp_send_fin(sk); if (timeout) { - cli(); + struct task_struct *tsk = current; + struct wait_queue wait = { tsk, NULL }; + + tsk->state = TASK_INTERRUPTIBLE; + tsk->timeout = timeout; + add_wait_queue(sk->sleep, &wait); release_sock(sk); - current->timeout = timeout; - while(closing(sk) && current->timeout) { - interruptible_sleep_on(sk->sleep); - if (signal_pending(current)) + + while (closing(sk)) { + schedule(); + if (signal_pending(tsk) || !tsk->timeout) break; } - current->timeout=0; + + tsk->timeout=0; + tsk->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + lock_sock(sk); - sti(); } /* Now that the socket is dead, if we are in the FIN_WAIT2 state @@ -1536,43 +1523,45 @@ struct sock *tcp_accept(struct sock *sk, int flags) struct sock *newsk = NULL; int error; + lock_sock(sk); + /* We need to make sure that this socket is listening, * and that it has something pending. */ error = EINVAL; if (sk->state != TCP_LISTEN) - goto no_listen; - - lock_sock(sk); + goto out; + /* Find already established connection */ req = tcp_find_established(tp, &prev); - if (req) { -got_new_connect: - tcp_synq_unlink(tp, req, prev); - newsk = req->sk; - tcp_openreq_free(req); - sk->ack_backlog--; - /* FIXME: need to check here if socket has already - * an soft_err or err set. 
- * We have two options here then: reply (this behaviour matches - * Solaris) or return the error to the application (old Linux) - */ - error = 0; -out: - release_sock(sk); -no_listen: - sk->err = error; - return newsk; + if (!req) { + /* If this is a non blocking socket don't sleep */ + error = EAGAIN; + if (flags & O_NONBLOCK) + goto out; + + error = ERESTARTSYS; + req = wait_for_connect(sk, &prev); + if (!req) + goto out; + error = 0; } - error = EAGAIN; - if (flags & O_NONBLOCK) - goto out; - req = wait_for_connect(sk, &prev); - if (req) - goto got_new_connect; - error = ERESTARTSYS; - goto out; + tcp_synq_unlink(tp, req, prev); + newsk = req->sk; + tcp_openreq_free(req); + sk->ack_backlog--; /* XXX */ + + /* FIXME: need to check here if newsk has already + * an soft_err or err set. + * We have two options here then: reply (this behaviour matches + * Solaris) or return the error to the application (old Linux) + */ + error = 0; + out: + release_sock(sk); + sk->err = error; + return newsk; } /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e9f936f82..841359739 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.64 1997/10/30 23:52:24 davem Exp $ + * Version: $Id: tcp_input.c,v 1.66 1998/01/15 22:40:29 freitag Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -41,6 +41,7 @@ * next packet on ack of previous packet. * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. + * Andi Kleen : Better prune_queue, and other fixes. 
*/ #include <linux/config.h> @@ -73,7 +74,6 @@ int sysctl_tcp_tsack; int sysctl_tcp_timestamps; int sysctl_tcp_window_scaling; int sysctl_tcp_syncookies = SYNC_INIT; -int sysctl_tcp_max_delay_acks = MAX_DELAY_ACK; int sysctl_tcp_stdurg; static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; @@ -214,7 +214,7 @@ extern __inline__ int tcp_paws_discard(struct tcp_opt *tp) /* FIXME: must check that ts_recent is not * more than 24 days old here. Yuck. */ - return (tp->rcv_tsval-tp->ts_recent < 0); + return ((s32)(tp->rcv_tsval-tp->ts_recent) < 0); } @@ -379,6 +379,7 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) */ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp) { + /* If we didn't send out any options ignore them all */ if (tp->tcp_header_len == sizeof(struct tcphdr)) return 0; if (th->doff == sizeof(struct tcphdr)>>2) { @@ -744,8 +745,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, if (after(skb->end_seq, ack)) break; +#if 0 SOCK_DEBUG(sk, "removing seg %x-%x from retransmit queue\n", skb->seq, skb->end_seq); +#endif acked = FLAG_DATA_ACKED; @@ -760,7 +763,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, skb_unlink(skb); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } if (acked) { @@ -819,6 +822,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) goto uninteresting_ack; + dst_confirm(sk->dst_cache); + /* If there is data set flag 1 */ if (len != th->doff*4) { flag |= FLAG_DATA; @@ -1055,15 +1060,14 @@ static void tcp_ofo_queue(struct sock *sk) struct sk_buff *skb; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* FIXME: out_of_order_queue is a strong tcp_opt candidate... 
-DaveM */ while ((skb = skb_peek(&sk->out_of_order_queue))) { if (after(skb->seq, tp->rcv_nxt)) break; if (!after(skb->end_seq, tp->rcv_nxt)) { - SOCK_DEBUG(sk, "ofo packet was allready received \n"); + SOCK_DEBUG(sk, "ofo packet was already received \n"); skb_unlink(skb); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", @@ -1086,7 +1090,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (skb->seq == tp->rcv_nxt) { /* Ok. In sequence. */ -queue_and_out: + queue_and_out: + dst_confirm(sk->dst_cache); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; tcp_ofo_queue(sk); @@ -1095,13 +1100,13 @@ queue_and_out: return; } - /* Not in sequence, either a retransmit or some packet got lost. */ + /* An old packet, either a retransmit or some packet got lost. */ if (!after(skb->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an imediate ack. */ SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq); - tp->delayed_acks = sysctl_tcp_max_delay_acks; - kfree_skb(skb, FREE_READ); + tp->delayed_acks = MAX_DELAY_ACK; + kfree_skb(skb); return; } @@ -1114,7 +1119,7 @@ queue_and_out: } /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks = sysctl_tcp_max_delay_acks; + tp->delayed_acks = MAX_DELAY_ACK; /* Disable header predition. */ tp->pred_flags = 0; @@ -1130,7 +1135,7 @@ queue_and_out: if (skb->seq == skb1->seq && skb->len >= skb1->len) { skb_append(skb1, skb); skb_unlink(skb1); - kfree_skb(skb1, FREE_READ); + kfree_skb(skb1); break; } @@ -1221,7 +1226,10 @@ static void tcp_data_snd_check(struct sock *sk) } } -static __inline__ void tcp_ack_snd_check(struct sock *sk) +/* + * Check if sending an ack is needed. 
+ */ +static __inline__ void __tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1233,17 +1241,24 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) * - we don't have a window update to send * - must send at least every 2 full sized packets */ - if (tp->delayed_acks == 0) { - /* We sent a data segment already. */ - return; - } - if (tp->delayed_acks >= sysctl_tcp_max_delay_acks || tcp_raise_window(sk)) + if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) tcp_send_ack(sk); else tcp_send_delayed_ack(sk, HZ/2); } +static __inline__ void tcp_ack_snd_check(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (tp->delayed_acks == 0) { + /* We sent a data segment already. */ + return; + } + __tcp_ack_snd_check(sk); +} + + /* * This routine is only called when we have urgent data * signalled. Its the 'slow' part of tcp_urg. It could be @@ -1314,13 +1329,43 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len } } +/* + * Clean first the out_of_order queue, then the receive queue until + * the socket is in its memory limits again. + */ static void prune_queue(struct sock *sk) { + struct tcp_opt *tp; struct sk_buff * skb; - /* Clean the out_of_order queue. */ - while ((skb = skb_dequeue(&sk->out_of_order_queue))) - kfree_skb(skb, FREE_READ); + SOCK_DEBUG(sk, "prune_queue: c=%x\n", sk->copied_seq); + + /* First Clean the out_of_order queue. */ + /* Start with the end because there are probably the least + * useful packets (crossing fingers). 
+ */ + while ((skb = skb_dequeue_tail(&sk->out_of_order_queue))) { + kfree_skb(skb); + if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) + return; + } + + tp = &sk->tp_pinfo.af_tcp; + + /* Now continue with the receive queue if it wasn't enough */ + while ((skb = skb_peek_tail(&sk->receive_queue))) { + /* Never remove packets that have been already acked */ + if (before(skb->end_seq, tp->last_ack_sent+1)) { + printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n", + sk->copied_seq, skb->end_seq, tp->last_ack_sent); + break; + } + skb_unlink(skb); + tp->rcv_nxt = skb->seq; + kfree_skb(skb); + if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) + break; + } } int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, @@ -1353,8 +1398,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tcp_paws_discard(tp)) { if (!th->rst) { tcp_send_ack(sk); - kfree_skb(skb, FREE_READ); - return 0; + goto discard; } } tcp_replace_ts_recent(tp,skb->end_seq); @@ -1375,28 +1419,40 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (len <= th->doff*4) { /* Bulk data transfer: sender */ if (len == th->doff*4) { - tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + kfree_skb(skb); tcp_data_snd_check(sk); + return 0; + } else { /* Header too small */ + tcp_statistics.TcpInErrs++; + goto discard; } - - tcp_statistics.TcpInErrs++; - kfree_skb(skb, FREE_READ); - return 0; } else if (skb->ack_seq == tp->snd_una) { /* Bulk data transfer: receiver */ - skb_pull(skb,th->doff*4); + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) + goto discard; + skb_pull(skb,th->doff*4); + + /* DO NOT notify forward progress here. + * It saves dozen of CPU instructions in fast path. --ANK + */ skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; sk->data_ready(sk, 0); tcp_delack_estimator(tp); +#if 1 /* This checks for required window updates too. 
*/ + tp->delayed_acks++; + __tcp_ack_snd_check(sk); +#else if (tp->delayed_acks++ == 0) tcp_send_delayed_ack(sk, HZ/2); else tcp_send_ack(sk); +#endif return 0; } } @@ -1409,8 +1465,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_wup, tp->rcv_wnd); } tcp_send_ack(sk); - kfree_skb(skb, FREE_READ); - return 0; + goto discard; } } @@ -1423,10 +1478,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if(th->rst) { tcp_reset(sk,skb); - kfree_skb(skb, FREE_READ); - return 0; + goto discard; } - + if(th->ack) tcp_ack(sk, th, skb->seq, skb->ack_seq, len); @@ -1441,16 +1495,17 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, (void) tcp_fin(skb, sk, th); tcp_data_snd_check(sk); - tcp_ack_snd_check(sk); - /* If our receive queue has grown past its limits, - * try to prune away duplicates etc.. - */ + /* If our receive queue has grown past its limits shrink it */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) prune_queue(sk); - if (!queued) - kfree_skb(skb, FREE_READ); + tcp_ack_snd_check(sk); + + if (!queued) { + discard: + kfree_skb(skb); + } return 0; } @@ -1854,8 +1909,12 @@ step6: } } - case TCP_ESTABLISHED: + case TCP_ESTABLISHED: queued = tcp_data(skb, sk, len); + + /* This can only happen when MTU+skbheader > rcvbuf */ + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) + prune_queue(sk); break; } @@ -1870,7 +1929,7 @@ step6: if (!queued) { discard: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } return 0; } @@ -1880,22 +1939,20 @@ int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, { int val = sysctl_tcp_cong_avoidance; int retv; + static tcp_sys_cong_ctl_t tab[] = { + tcp_cong_avoid_vanj, + tcp_cong_avoid_vegas + }; retv = proc_dointvec(ctl, write, filp, buffer, lenp); if (write) { - switch (sysctl_tcp_cong_avoidance) { - case 0: - tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; - break; - case 1: - tcp_sys_cong_ctl_f = &tcp_cong_avoid_vegas; - break; - default: + if 
((unsigned)sysctl_tcp_cong_avoidance > 1) { retv = -EINVAL; sysctl_tcp_cong_avoidance = val; - }; + } else { + tcp_sys_cong_ctl_f = tab[sysctl_tcp_cong_avoidance]; + } } - return retv; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8c75bce3e..e4f8981ac 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.76 1997/12/07 04:44:19 freitag Exp $ + * Version: $Id: tcp_ipv4.c,v 1.79 1998/01/15 22:40:47 freitag Exp $ * * IPv4 specific functions * @@ -40,7 +40,10 @@ * Added tail drop and some other bugfixes. * Added new listen sematics (ifdefed by * NEW_LISTEN for now) + * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits + * Andi Kleen: various fixes. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. */ #include <linux/config.h> @@ -48,7 +51,6 @@ #include <linux/fcntl.h> #include <linux/random.h> #include <linux/ipsec.h> -#include <linux/inet.h> #include <net/icmp.h> #include <net/tcp.h> @@ -56,6 +58,8 @@ #include <asm/segment.h> +#include <linux/inet.h> + extern int sysctl_tcp_sack; extern int sysctl_tcp_tsack; extern int sysctl_tcp_timestamps; @@ -171,7 +175,7 @@ static __inline__ int tcp_lport_inuse(int num) return 0; } -/* Find a "good" local port, this is family independant. +/* Find a "good" local port, this is family independent. * There are several strategies working in unison here to * get the best possible performance. 
The current socket * load is kept track of, if it is zero there is a strong @@ -562,13 +566,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm); } - if (sk->dst_cache) { - dst_release(sk->dst_cache); - sk->dst_cache = NULL; - } + dst_release(xchg(&sk->dst_cache, NULL)); tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, - RT_TOS(sk->ip_tos)|(sk->localroute || 0), sk->bound_dev_if); + RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if); if (tmp < 0) return tmp; @@ -627,7 +628,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) */ sk->daddr = 0; sk->saddr = sk->rcv_saddr = 0; - kfree_skb(buff, FREE_WRITE); + kfree_skb(buff); release_sock(sk); return(-ENETUNREACH); } @@ -648,7 +649,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->mtu = rt->u.dst.pmtu; if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT || (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - rt->rt_flags&RTCF_NOPMTUDISC)) && + (rt->u.dst.mxlock&(1<<RTAX_MTU)))) && rt->u.dst.pmtu > 576) sk->mtu = 576; @@ -808,8 +809,11 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip) * dropped. This is the new "fast" path mtu * discovery. */ - if (!sk->sock_readers) + if (!sk->sock_readers) { + lock_sock(sk); tcp_simple_retransmit(sk); + release_sock(sk); + } /* else let the usual retransmit timer handle it */ } } } @@ -821,6 +825,12 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip) * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the tcp header. We need * to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When + * someone else accesses the socket the ICMP is just dropped + * and for some paths there is no check at all. + * A more general error queue to queue errors for later handling + * is probably better. 
*/ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) @@ -864,13 +874,15 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) switch (type) { case ICMP_SOURCE_QUENCH: +#ifndef OLD_SOURCE_QUENCH /* This is deprecated */ tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); tp->snd_cwnd = tp->snd_ssthresh; tp->high_seq = tp->snd_nxt; +#endif return; case ICMP_PARAMETERPROB: sk->err=EPROTO; - sk->error_report(sk); + sk->error_report(sk); /* This isn't serialized on SMP! */ break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ @@ -900,7 +912,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) */ return; } - + if (!th->syn && !th->ack) return; req = tcp_v4_search_req(tp, iph, th, &prev); @@ -930,6 +942,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) } if(icmp_err_convert[code].fatal || opening) { + /* This code isn't serialized with the socket code */ sk->err = icmp_err_convert[code].errno; if (opening) { tcp_statistics.TcpAttemptFails++; @@ -1043,7 +1056,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) if(ip_build_pkt(skb, sk, req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, req->af.v4_req.opt) < 0) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } @@ -1068,7 +1081,12 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; + th->source = +#ifdef CONFIG_IP_TRANSPARENT_PROXY + req->lcl_port; /* LVE */ +#else th->source = sk->dummy_th.source; +#endif th->dest = req->rmt_port; skb->seq = req->snt_isn; skb->end_seq = skb->seq + 1; @@ -1110,8 +1128,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) static void tcp_v4_or_free(struct open_request *req) { if(!req->sk && req->af.v4_req.opt) - kfree_s(req->af.v4_req.opt, - sizeof(struct ip_options) + req->af.v4_req.opt->optlen); + kfree_s(req->af.v4_req.opt, 
optlength(req->af.v4_req.opt)); } static inline void syn_flood_warning(struct sk_buff *skb) @@ -1126,6 +1143,28 @@ static inline void syn_flood_warning(struct sk_buff *skb) } } +/* + * Save and compile IPv4 options into the open_request if needed. + */ +static inline struct ip_options * +tcp_v4_save_options(struct sock *sk, struct sk_buff *skb, + struct ip_options *opt) +{ + struct ip_options *dopt = NULL; + + if (opt && opt->optlen) { + int opt_size = optlength(opt); + dopt = kmalloc(opt_size, GFP_ATOMIC); + if (dopt) { + if (ip_options_echo(dopt, skb)) { + kfree_s(dopt, opt_size); + dopt = NULL; + } + } + } + return dopt; +} + int sysctl_max_syn_backlog = 1024; int sysctl_tcp_syn_taildrop = 1; @@ -1146,7 +1185,6 @@ struct or_calltable or_ipv4 = { int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, __u32 isn) { - struct ip_options *opt = (struct ip_options *) ptr; struct tcp_opt tp; struct open_request *req; struct tcphdr *th = skb->h.th; @@ -1205,6 +1243,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->snd_wscale = tp.snd_wscale; req->wscale_ok = tp.wscale_ok; req->rmt_port = th->source; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + req->lcl_port = th->dest ; /* LVE */ +#endif req->af.v4_req.loc_addr = daddr; req->af.v4_req.rmt_addr = saddr; @@ -1216,20 +1257,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->snt_isn = isn; - /* IPv4 options */ - req->af.v4_req.opt = NULL; + req->af.v4_req.opt = tcp_v4_save_options(sk, skb, ptr); - if (opt && opt->optlen) { - int opt_size = sizeof(struct ip_options) + opt->optlen; - - req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC); - if (req->af.v4_req.opt) { - if (ip_options_echo(req->af.v4_req.opt, skb)) { - kfree_s(req->af.v4_req.opt, opt_size); - req->af.v4_req.opt = NULL; - } - } - } req->class = &or_ipv4; req->retrans = 0; req->sk = NULL; @@ -1237,26 +1266,27 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, 
tcp_v4_send_synack(sk, req); if (want_cookie) { - if (req->af.v4_req.opt) - kfree(req->af.v4_req.opt); + if (req->af.v4_req.opt) + kfree(req->af.v4_req.opt); + tcp_v4_or_free(req); tcp_openreq_free(req); - } else { + } else { req->expires = jiffies + TCP_TIMEOUT_INIT; tcp_inc_slow_timer(TCP_SLT_SYNACK); tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); } sk->data_ready(sk, 0); -exit: return 0; dead: SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk); tcp_statistics.TcpAttemptFails++; - return -ENOTCONN; + return -ENOTCONN; /* send reset */ + error: tcp_statistics.TcpAttemptFails++; - goto exit; + return 0; } struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, @@ -1282,7 +1312,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Or else we die! -DaveM */ newsk->sklist_next = NULL; - newsk->opt = req->af.v4_req.opt; + newsk->opt = req->af.v4_req.opt; skb_queue_head_init(&newsk->write_queue); skb_queue_head_init(&newsk->receive_queue); @@ -1338,7 +1368,12 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_init_xmit_timers(newsk); - newsk->dummy_th.source = sk->dummy_th.source; + newsk->dummy_th.source = +#ifdef CONFIG_IP_TRANSPARENT_PROXY + req->lcl_port; /* LVE */ +#else + sk->dummy_th.source; +#endif newsk->dummy_th.dest = req->rmt_port; newsk->sock_readers=0; @@ -1348,6 +1383,13 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->socket = NULL; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* + * Deal with possibly redirected traffic by setting num to + * the intended destination port of the received packet. + */ + newsk->num = ntohs(skb->h.th->dest); +#endif newsk->daddr = req->af.v4_req.rmt_addr; newsk->saddr = req->af.v4_req.loc_addr; newsk->rcv_saddr = req->af.v4_req.loc_addr; @@ -1359,7 +1401,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (ip_route_output(&rt, newsk->opt && newsk->opt->srr ? 
newsk->opt->faddr : newsk->daddr, - newsk->saddr, newsk->ip_tos, 0)) { + newsk->saddr, newsk->ip_tos|RTO_CONN, 0)) { sk_free(newsk); return NULL; } @@ -1467,7 +1509,13 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { - skb_set_owner_r(skb, sk); +#ifdef CONFIG_FILTER + if (sk->filter) + { + if (sk_filter(skb, sk->filter_data, sk->filter)) + goto discard; + } +#endif /* CONFIG_FILTER */ /* * socket locking is here for SMP purposes as backlog rcv @@ -1475,6 +1523,13 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) */ lock_sock(sk); + /* + * This doesn't check if the socket has enough room for the packet. + * Either process the packet _without_ queueing it and then free it, + * or do the check later. + */ + skb_set_owner_r(skb, sk); + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; @@ -1494,8 +1549,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk = nsk; } - if (tcp_rcv_state_process(sk, skb, skb->h.th, - &(IPCB(skb)->opt), skb->len)) + if (tcp_rcv_state_process(sk, skb, skb->h.th, &(IPCB(skb)->opt), skb->len)) goto reset; release_sock(sk); return 0; @@ -1503,7 +1557,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: tcp_v4_send_reset(skb); discard: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, @@ -1580,7 +1634,7 @@ no_tcp_socket: discard_it: /* Discard frame. */ - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -1602,13 +1656,17 @@ int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) rt = (struct rtable*)skb->dst; /* Force route checking if want_rewrite */ + /* The idea is good, the implementation is disguisting. 
+ Well, if I made bind on this socket, you cannot randomly ovewrite + its source address. --ANK + */ if (want_rewrite) { int tmp; __u32 old_saddr = rt->rt_src; /* Query new route */ tmp = ip_route_connect(&rt, rt->rt_dst, 0, - RT_TOS(sk->ip_tos)|(sk->localroute||0), + RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if); /* Only useful if different source addrs */ @@ -1622,7 +1680,7 @@ int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) } else if (rt->u.dst.obsolete) { int err; - err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos, rt->key.oif); + err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif); if (err) { sk->err_soft=-err; sk->error_report(skb->sk); @@ -1632,9 +1690,6 @@ int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) skb->dst = &rt->u.dst; } - /* Discard the surplus MAC header. */ - skb_pull(skb, skb->nh.raw-skb->data); - iph = skb->nh.iph; th = skb->h.th; size = skb->tail - skb->h.raw; @@ -1778,11 +1833,11 @@ static int tcp_v4_destroy_sock(struct sock *sk) /* Cleanup up the write buffer. */ while((skb = skb_dequeue(&sk->write_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); /* Cleans up our, hopefuly empty, out_of_order_queue. */ while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9ffb1517..fbae5cfa6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.50 1997/10/15 19:13:02 freitag Exp $ + * Version: $Id: tcp_output.c,v 1.51 1998/01/15 22:40:39 freitag Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -87,40 +87,12 @@ static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb) tp->retransmits == 0); } -static __inline__ void tcp_build_options(__u32 *ptr, struct tcp_opt *tp) -{ - /* FIXME: We will still need to do SACK here. */ - if (tp->tstamp_ok) { - *ptr++ = ntohl((TCPOPT_NOP << 24) - | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP); - /* WARNING: If HZ is ever larger than 1000 on some system, - * then we will be violating RFC1323 here because our timestamps - * will be moving too fast. - * FIXME: code TCP so it uses at most ~ 1000 ticks a second? - * (I notice alpha is 1024 ticks now). -- erics - */ - *ptr++ = htonl(jiffies); - *ptr = htonl(tp->ts_recent); - } -} - -static __inline__ void tcp_update_options(__u32 *ptr, struct tcp_opt *tp) -{ - /* FIXME: We will still need to do SACK here. */ - if (tp->tstamp_ok) { - *++ptr = htonl(jiffies); - *++ptr = htonl(tp->ts_recent); - } -} - /* * This is the main buffer sending routine. We queue the buffer * having checked it is sane seeming. */ -int tcp_send_skb(struct sock *sk, struct sk_buff *skb) +void tcp_send_skb(struct sock *sk, struct sk_buff *skb) { struct tcphdr * th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -134,8 +106,8 @@ int tcp_send_skb(struct sock *sk, struct sk_buff *skb) printk(KERN_DEBUG "tcp_send_skb: bad skb " "(skb = %p, data = %p, th = %p, len = %u)\n", skb, skb->data, th, skb->len); - kfree_skb(skb, FREE_WRITE); - return 0; + kfree_skb(skb); + return; } /* If we have queued a header size packet.. (these crash a few @@ -146,8 +118,8 @@ int tcp_send_skb(struct sock *sk, struct sk_buff *skb) /* If it's got a syn or fin discard. 
*/ if(!th->syn && !th->fin) { printk(KERN_DEBUG "tcp_send_skb: attempt to queue a bogon.\n"); - kfree_skb(skb,FREE_WRITE); - return 0; + kfree_skb(skb); + return; } } @@ -161,7 +133,8 @@ int tcp_send_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff * buff; /* This is going straight out. */ - tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt); + tp->last_ack_sent = tp->rcv_nxt; + th->ack_seq = htonl(tp->rcv_nxt); th->window = htons(tcp_select_window(sk)); tcp_update_options((__u32 *)(th+1),tp); @@ -185,7 +158,7 @@ int tcp_send_skb(struct sock *sk, struct sk_buff *skb) if (!tcp_timer_is_set(sk, TIME_RETRANS)) tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - return 0; + return; } queue: @@ -196,7 +169,7 @@ queue: tp->pending = TIME_PROBE0; tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); } - return 0; + return; } /* @@ -232,7 +205,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Put headers on the new packet. */ tmp = tp->af_specific->build_net_header(sk, buff); if (tmp < 0) { - kfree_skb(buff, FREE_WRITE); + kfree_skb(buff); return -1; } @@ -290,7 +263,7 @@ static void tcp_wrxmit_prob(struct sock *sk, struct sk_buff *skb) update_send_head(sk); skb_unlink(skb); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if (!sk->dead) sk->write_space(sk); @@ -468,7 +441,7 @@ unsigned short tcp_select_window(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; int mss = sk->mss; - long free_space = sock_rspace(sk)/2; + long free_space = sock_rspace(sk) / 2; long window, cur_win; if (tp->window_clamp) { @@ -624,7 +597,7 @@ static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb) th1->fin = 1; /* ... and off you go. 
*/ - kfree_skb(buff, FREE_WRITE); + kfree_skb(buff); tp->packets_out--; /* Header checksum will be set by the retransmit procedure @@ -714,7 +687,7 @@ void tcp_do_retransmit(struct sock *sk, int all) break; } - SOCK_DEBUG(sk, "retransmit sending\n"); + SOCK_DEBUG(sk, "retransmit sending seq=%x\n", skb->seq); /* Update ack and window. */ tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt); @@ -786,7 +759,7 @@ void tcp_send_fin(struct sock *sk) /* FIXME: We must not throw this out. Eventually we must * put a FIN into the queue, otherwise it never gets queued. */ - kfree_skb(buff, FREE_WRITE); + kfree_skb(buff); sk->write_seq++; t = del_timer(&sk->timer); if (t) @@ -817,6 +790,9 @@ void tcp_send_fin(struct sock *sk) /* The fin can only be transmited after the data. */ skb_queue_tail(&sk->write_queue, buff); if (tp->send_head == NULL) { + /* FIXME: BUG! we need to check if the fin fits into the window + * here. If not we need to do window probing (sick, but true) + */ struct sk_buff *skb1; tp->packets_out++; @@ -853,7 +829,7 @@ int tcp_send_synack(struct sock *sk) tmp = tp->af_specific->build_net_header(sk, skb); if (tmp < 0) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return tmp; } @@ -974,7 +950,7 @@ void tcp_send_ack(struct sock *sk) /* Put in the IP header and routing stuff. */ tmp = tp->af_specific->build_net_header(sk, buff); if (tmp < 0) { - kfree_skb(buff, FREE_WRITE); + kfree_skb(buff); return; } @@ -985,13 +961,16 @@ void tcp_send_ack(struct sock *sk) /* Swap the send and the receive. */ th->window = ntohs(tcp_select_window(sk)); th->seq = ntohl(tp->snd_nxt); - tp->last_ack_sent = th->ack_seq = ntohl(tp->rcv_nxt); + tp->last_ack_sent = tp->rcv_nxt; + th->ack_seq = htonl(tp->rcv_nxt); /* Fill in the packet and send it. 
*/ tp->af_specific->send_check(sk, th, tp->tcp_header_len, buff); +#if 0 SOCK_DEBUG(sk, "\rtcp_send_ack: seq %x ack %x\n", tp->snd_nxt, tp->rcv_nxt); +#endif tp->af_specific->queue_xmit(buff); tcp_statistics.TcpOutSegs++; @@ -1064,7 +1043,7 @@ void tcp_write_wakeup(struct sock *sk) /* Put in the IP header and routing stuff. */ tmp = tp->af_specific->build_net_header(sk, buff); if (tmp < 0) { - kfree_skb(buff, FREE_WRITE); + kfree_skb(buff); return; } @@ -1104,9 +1083,6 @@ void tcp_send_probe0(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (sk->zapped) - return; /* After a valid reset we can send no more. */ - tcp_write_wakeup(sk); tp->pending = TIME_PROBE0; tp->backoff++; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1d804a864..76ccedab2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.4 1997/12/16 05:37:48 ralf Exp $ + * Version: $Id: tcp_timer.c,v 1.5 1998/03/03 01:23:44 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -156,10 +156,7 @@ static int tcp_write_timeout(struct sock *sk) if ((sk->state == TCP_ESTABLISHED && tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) || (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) { - /* Attempt to recover if arp has changed (unlikely!) or - * a route has shifted (not supported prior to 1.3). - */ - ip_rt_advice((struct rtable**)&sk->dst_cache, 0); + dst_negative_advice(&sk->dst_cache); } /* Have we tried to SYN too many times (repent repent 8)) */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 84586867f..f355caa85 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). 
* - * Version: $Id: udp.c,v 1.2 1997/12/16 05:37:48 ralf Exp $ + * Version: $Id: udp.c,v 1.3 1998/03/03 01:23:44 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -49,12 +49,14 @@ * Mike Shaver : RFC1122 checks. * Alan Cox : Nonblocking error fix. * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source * David S. Miller : New socket lookup architecture. * Last socket cache retained as it * does have a high hit rate. * Olaf Kirch : Don't linearise iovec on sendmsg. * Andi Kleen : Some cleanups, cache destination entry * for connect. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. * * * This program is free software; you can redistribute it and/or @@ -360,14 +362,14 @@ __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport #ifdef CONFIG_IP_TRANSPARENT_PROXY #define secondlist(hpnum, sk, fpass) \ ({ struct sock *s1; if(!(sk) && (fpass)--) \ - s1 = udp_hash[(hpnum) & (TCP_HTABLE_SIZE - 1)]; \ + s1 = udp_hash[(hpnum) & (UDP_HTABLE_SIZE - 1)]; \ else \ s1 = (sk); \ s1; \ }) #define udp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \ - secondlist((hpnum), udp_hash[(hnum)&(TCP_HTABLE_SIZE-1)],(fpass)) + secondlist((hpnum), udp_hash[(hnum)&(UDP_HTABLE_SIZE-1)],(fpass)) #define udp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \ secondlist((hpnum),(sk)->next,(fpass)) @@ -492,7 +494,7 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len) if (sk->ip_recverr && !sk->sock_readers) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 && sock_queue_err_skb(sk, skb2)) - kfree_skb(skb2, FREE_READ); + kfree_skb(skb2); } switch (type) { @@ -620,7 +622,18 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) u8 tos; int err; - if (len>65535) + /* This check is ONLY to check for arithmetic overflow + on integer(!) len. Not more! Real check will be made + in ip_build_xmit --ANK + + BTW socket.c -> af_*.c -> ... 
make multiple + invalid conversions size_t -> int. We MUST repair it f.e. + by replacing all of them with size_t and revise all + the places sort of len += sizeof(struct iphdr) + If len was ULONG_MAX-10 it would be cathastrophe --ANK + */ + + if (len < 0 || len > 0xFFFF) return -EMSGSIZE; /* @@ -630,9 +643,15 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_PROXY)) + return -EINVAL; + if ((msg->msg_flags&MSG_PROXY) && !suser() ) + return -EPERM; +#else if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT)) return -EINVAL; - +#endif /* * Get and verify the address. @@ -653,16 +672,49 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ufh.uh.dest = usin->sin_port; if (ufh.uh.dest == 0) return -EINVAL; - /* XXX: is a one-behind cache for the dst_entry worth it? */ + /* XXX: is a one-behind cache for the dst_entry worth it? + + Nope. ip_route_output is slower than nothing, but it + is enough fast to forget about caching its results. + Really, checking route validity in general case + is not much faster complete lookup. + It was main reason why I removed it from 2.1. + The second reason was that idle sockets held + a lot of stray destinations. --ANK + */ } else { if (sk->state != TCP_ESTABLISHED) return -EINVAL; ufh.daddr = sk->daddr; ufh.uh.dest = sk->dummy_th.dest; - rt = (struct rtable *)sk->dst_cache; + + /* + BUGGG Khm... And who will validate it? Fixing it fastly... + --ANK + */ + rt = (struct rtable *)dst_check(&sk->dst_cache, 0); } +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (msg->msg_flags&MSG_PROXY) { + /* + * We map the first 8 bytes of a second sockaddr_in + * into the last 8 (unused) bytes of a sockaddr_in. 
+ */ + struct sockaddr_in *from = (struct sockaddr_in *)msg->msg_name; + from = (struct sockaddr_in *)&from->sin_zero; + if (from->sin_family != AF_INET) + return -EINVAL; + ipc.addr = from->sin_addr.s_addr; + ufh.uh.source = from->sin_port; + if (ipc.addr == 0) + ipc.addr = sk->saddr; + } else +#endif + { + ipc.addr = sk->saddr; + ufh.uh.source = sk->dummy_th.source; + } - ipc.addr = sk->saddr; ipc.opt = NULL; ipc.oif = sk->bound_dev_if; if (msg->msg_controllen) { @@ -686,10 +738,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) tos = RT_TOS(sk->ip_tos); if (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) || (ipc.opt && ipc.opt->is_strictroute)) { - tos |= 1; + tos |= RTO_ONLINK; rt = NULL; /* sorry */ } - + if (MULTICAST(daddr)) { if (!ipc.oif) ipc.oif = sk->ip_mc_index; @@ -698,7 +750,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) } if (rt == NULL) { - err = ip_route_output(&rt, daddr, ufh.saddr, tos, ipc.oif); + err = ip_route_output(&rt, daddr, ufh.saddr, +#ifdef CONFIG_IP_TRANSPARENT_PROXY + (msg->msg_flags&MSG_PROXY ? RTO_TPROXY : 0) | +#endif + tos, ipc.oif); if (err) goto out; localroute = 1; @@ -711,7 +767,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ufh.saddr = rt->rt_src; if (!ipc.addr) ufh.daddr = ipc.addr = rt->rt_dst; - ufh.uh.source = sk->dummy_th.source; ufh.uh.len = htons(ulen); ufh.uh.check = 0; ufh.other = (htons(ulen) << 16) + IPPROTO_UDP*256; @@ -762,8 +817,10 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) struct sk_buff *skb; unsigned long amount; - if (sk->state == TCP_LISTEN) return(-EINVAL); + if (sk->state == TCP_LISTEN) + return(-EINVAL); amount = 0; + /* N.B. Is this interrupt safe?? 
*/ skb = skb_peek(&sk->receive_queue); if (skb != NULL) { /* @@ -777,7 +834,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) } default: - return(-EINVAL); + return(-ENOIOCTLCMD); } return(0); } @@ -789,13 +846,11 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) */ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, - int noblock, int flags,int *addr_len) + int noblock, int flags, int *addr_len) { - int copied = 0; - int truesize; + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; - int er; - struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name; + int copied, err; /* * Check any passed addresses @@ -805,14 +860,12 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, *addr_len=sizeof(*sin); if (sk->ip_recverr && (skb = skb_dequeue(&sk->error_queue)) != NULL) { - er = sock_error(sk); - if (msg->msg_controllen == 0) { - skb_free_datagram(sk, skb); - return er; + err = sock_error(sk); + if (msg->msg_controllen != 0) { + put_cmsg(msg, SOL_IP, IP_RECVERR, skb->len, skb->data); + err = 0; } - put_cmsg(msg, SOL_IP, IP_RECVERR, skb->len, skb->data); - skb_free_datagram(sk, skb); - return 0; + goto out_free; } /* @@ -820,25 +873,25 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, * the finished NET3, it will do _ALL_ the work! 
*/ - skb=skb_recv_datagram(sk,flags,noblock,&er); - if(skb==NULL) - return er; + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; - truesize = skb->len - sizeof(struct udphdr); - copied = truesize; - if (len < truesize) + copied = skb->len - sizeof(struct udphdr); + if (copied > len) { - msg->msg_flags |= MSG_TRUNC; copied = len; + msg->msg_flags |= MSG_TRUNC; } /* * FIXME : should use udp header size info value */ - er = skb_copy_datagram_iovec(skb,sizeof(struct udphdr),msg->msg_iov,copied); - if (er) - return er; + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + if (err) + goto out_free; sk->stamp=skb->stamp; /* Copy the address. */ @@ -867,9 +920,12 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, } if (sk->ip_cmsg_flags) ip_cmsg_recv(msg, skb); + err = copied; +out_free: skb_free_datagram(sk, skb); - return(copied); +out: + return err; } int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -900,8 +956,7 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (usin->sin_family && usin->sin_family != AF_INET) return(-EAFNOSUPPORT); - dst_release(sk->dst_cache); - sk->dst_cache = NULL; + dst_release(xchg(&sk->dst_cache, NULL)); err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, sk->ip_tos|sk->localroute, sk->bound_dev_if); @@ -947,7 +1002,7 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if(!ipsec_sk_policy(sk,skb)) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return(0); } @@ -959,7 +1014,7 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) udp_statistics.UdpInErrors++; ip_statistics.IpInDiscards++; ip_statistics.IpInDelivers--; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -1; } udp_statistics.UdpInDatagrams++; @@ -1007,7 +1062,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, } SOCKHASH_UNLOCK(); if(!given) - kfree_skb(skb, FREE_READ); + 
kfree_skb(skb); return 0; } @@ -1070,7 +1125,7 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) if (ulen > len || len < sizeof(*uh) || ulen < sizeof(*uh)) { NETDEBUG(printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len)); udp_statistics.UdpInErrors++; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return(0); } @@ -1089,7 +1144,7 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) ntohl(daddr),ntohs(uh->dest), ulen)); udp_statistics.UdpInErrors++; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return(0); } @@ -1125,7 +1180,7 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) * Hmm. We got an UDP broadcast to a port to which we * don't wanna listen. Ignore it. */ - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return(0); } udp_deliver(sk, skb); diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c index 0f463d0ee..3e638d6c8 100644 --- a/net/ipv4/utils.c +++ b/net/ipv4/utils.c @@ -6,7 +6,7 @@ * Various kernel-resident INET utility functions; mainly * for format conversion and debugging output. * - * Version: $Id: utils.c,v 1.5 1997/09/17 18:50:31 freitag Exp $ + * Version: $Id: utils.c,v 1.3 1997/12/16 05:37:49 ralf Exp $ * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * @@ -89,24 +89,3 @@ __u32 in_aton(const char *str) return(htonl(l)); } -/* - * This enforces a rate limit: not more than one kernel message - * every 5secs to make a denial-of-service attack impossible. - * - * All warning printk()s should be guarded by this function. - */ -int net_ratelimit(void) -{ - static unsigned long last_msg; - static int missed; - - if ((jiffies - last_msg) >= 5*HZ) { - if (missed) - printk(KERN_WARNING "ipv4: (%d messages suppressed. 
Flood?)\n", missed); - missed = 0; - last_msg = jiffies; - return 1; - } - missed++; - return 0; -} diff --git a/net/ipv6/.cvsignore b/net/ipv6/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/ipv6/.cvsignore +++ b/net/ipv6/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/ipv6/Config.in b/net/ipv6/Config.in index f4c84e640..3372817c3 100644 --- a/net/ipv6/Config.in +++ b/net/ipv6/Config.in @@ -2,6 +2,13 @@ # IPv6 configuration # bool 'IPv6: enable EUI-64 token format' CONFIG_IPV6_EUI64 -bool 'IPv6: disable provided based addresses' CONFIG_IPV6_NO_PB +if [ "$CONFIG_IPV6_EUI64" = "y" ]; then + bool 'IPv6: disable provider based addresses' CONFIG_IPV6_NO_PB +fi +if [ "$CONFIG_NETLINK" = "y" ]; then + if [ "$CONFIG_RTNETLINK" = "n" ]; then + bool 'IPv6: routing messages via old netlink' CONFIG_IPV6_NETLINK + fi +fi #bool 'IPv6: flow policy support' CONFIG_RT6_POLICY #bool 'IPv6: firewall support' CONFIG_IPV6_FIREWALL diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c66902f13..c4faba4b7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: addrconf.c,v 1.30 1997/12/09 17:12:47 freitag Exp $ + * $Id: addrconf.c,v 1.32 1997/12/27 20:41:18 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -35,6 +35,9 @@ #include <linux/route.h> #include <linux/inetdevice.h> #include <linux/init.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif #include <linux/proc_fs.h> #include <net/sock.h> @@ -47,6 +50,7 @@ #include <net/addrconf.h> #include <net/ip.h> #include <linux/if_tunnel.h> +#include <linux/rtnetlink.h> #include <asm/uaccess.h> @@ -59,20 +63,20 @@ #define ADBG(x) #endif -/* - * Configured unicast address list - */ -struct inet6_ifaddr *inet6_addr_lst[IN6_ADDR_HSIZE]; +#ifdef CONFIG_SYSCTL +static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p); 
+static void addrconf_sysctl_unregister(struct ipv6_devconf *p); +#endif /* - * Hash list of configured multicast addresses + * Configured unicast address list */ -struct ifmcaddr6 *inet6_mcast_lst[IN6_ADDR_HSIZE]; +static struct inet6_ifaddr *inet6_addr_lst[IN6_ADDR_HSIZE]; /* * AF_INET6 device list */ -struct inet6_dev *inet6_dev_lst[IN6_ADDR_HSIZE]; +static struct inet6_dev *inet6_dev_lst[IN6_ADDR_HSIZE]; static atomic_t addr_list_lock = ATOMIC_INIT(0); @@ -83,12 +87,41 @@ static struct timer_list addr_chk_timer = { 0, 0, addrconf_verify }; -static int addrconf_ifdown(struct device *dev); +static int addrconf_ifdown(struct device *dev, int how); static void addrconf_dad_start(struct inet6_ifaddr *ifp); static void addrconf_dad_timer(unsigned long data); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); static void addrconf_rs_timer(unsigned long data); +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); + +struct ipv6_devconf ipv6_devconf = +{ + 0, /* forwarding */ + IPV6_DEFAULT_HOPLIMIT, /* hop limit */ + 576, /* mtu */ + 1, /* accept RAs */ + 1, /* accept redirects */ + 1, /* autoconfiguration */ + 1, /* dad transmits */ + MAX_RTR_SOLICITATIONS, /* router solicits */ + RTR_SOLICITATION_INTERVAL, /* rtr solicit interval */ + MAX_RTR_SOLICITATION_DELAY, /* rtr solicit delay */ +}; + +static struct ipv6_devconf ipv6_devconf_dflt = +{ + 0, /* forwarding */ + IPV6_DEFAULT_HOPLIMIT, /* hop limit */ + 576, /* mtu */ + 1, /* accept RAs */ + 1, /* accept redirects */ + 1, /* autoconfiguration */ + 1, /* dad transmits */ + MAX_RTR_SOLICITATIONS, /* router solicits */ + RTR_SOLICITATION_INTERVAL, /* rtr solicit interval */ + MAX_RTR_SOLICITATION_DELAY, /* rtr solicit delay */ +}; int ipv6_addr_type(struct in6_addr *addr) { @@ -151,12 +184,27 @@ static struct inet6_dev * ipv6_add_dev(struct device *dev) struct inet6_dev *ndev, **bptr, *iter; int hash; + if (dev->mtu < 576) + return NULL; + ndev = kmalloc(sizeof(struct inet6_dev), gfp_any()); if 
(ndev) { memset(ndev, 0, sizeof(struct inet6_dev)); ndev->dev = dev; + memcpy(&ndev->cnf, &ipv6_devconf_dflt, sizeof(ndev->cnf)); + ndev->cnf.mtu6 = dev->mtu; + ndev->cnf.sysctl = NULL; + ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); + if (ndev->nd_parms == NULL) { + kfree(ndev); + return NULL; + } +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(dev, ndev->nd_parms, NET_IPV6, NET_IPV6_NEIGH, "ipv6"); + addrconf_sysctl_register(ndev, &ndev->cnf); +#endif hash = ipv6_devindex_hash(dev->ifindex); bptr = &inet6_dev_lst[hash]; iter = *bptr; @@ -165,34 +213,35 @@ static struct inet6_dev * ipv6_add_dev(struct device *dev) bptr = &iter->next; *bptr = ndev; + } return ndev; } -void addrconf_forwarding_on(void) +static struct inet6_dev * ipv6_find_idev(struct device *dev) { struct inet6_dev *idev; - int i; - for (i = 0; i < IN6_ADDR_HSIZE; i++) { - for (idev = inet6_dev_lst[i]; idev; idev = idev->next) { -#if ACONF_DEBUG >= 2 - printk(KERN_DEBUG "dev %s\n", idev->dev->name); -#endif + if ((idev = ipv6_get_idev(dev)) == NULL) { + idev = ipv6_add_dev(dev); + if (idev == NULL) + return NULL; + } + if (dev->flags&IFF_UP) + ipv6_mc_up(idev); + return idev; +} - if (idev->dev->type == ARPHRD_ETHER) { - struct in6_addr maddr; +static void addrconf_forward_change(struct inet6_dev *idev) +{ + int i; -#if ACONF_DEBUG >= 2 - printk(KERN_DEBUG "joining all-routers\n"); -#endif - idev->router = 1; + if (idev) + return; - /* Wrong. It is user level function. 
*/ - ipv6_addr_all_routers(&maddr); - ipv6_dev_mc_inc(idev->dev, &maddr); - } - } + for (i = 0; i < IN6_ADDR_HSIZE; i++) { + for (idev = inet6_dev_lst[i]; idev; idev = idev->next) + idev->cnf.forwarding = ipv6_devconf.forwarding; } } @@ -244,11 +293,13 @@ struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, return ifa; } -void ipv6_del_addr(struct inet6_ifaddr *ifp) +static void ipv6_del_addr(struct inet6_ifaddr *ifp) { struct inet6_ifaddr *iter, **back; int hash; + ipv6_ifa_notify(RTM_DELADDR, ifp); + if (atomic_read(&addr_list_lock)) { ifp->flags |= ADDR_INVALID; return; @@ -399,33 +450,75 @@ struct inet6_ifaddr * ipv6_get_lladdr(struct device *dev) * to the host. */ -struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr) +struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr, struct device *dev, int nd) { struct inet6_ifaddr * ifp; u8 hash; + unsigned flags = 0; + + if (!nd) + flags |= DAD_STATUS|ADDR_INVALID; atomic_inc(&addr_list_lock); hash = ipv6_addr_hash(addr); for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { - if (ipv6_addr_cmp(&ifp->addr, addr) == 0) - break; + if (ipv6_addr_cmp(&ifp->addr, addr) == 0 && !(ifp->flags&flags)) { + if (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST))) + break; + } } atomic_dec(&addr_list_lock); - return ifp; + return ifp; +} + +void addrconf_dad_failure(struct inet6_ifaddr *ifp) +{ + printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); + del_timer(&ifp->timer); + ipv6_del_addr(ifp); } + /* Join to solicited addr multicast group. 
*/ static void addrconf_join_solict(struct device *dev, struct in6_addr *addr) { struct in6_addr maddr; - addrconf_addr_solict_mult(addr, &maddr); + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + +#ifndef CONFIG_IPV6_NO_PB + addrconf_addr_solict_mult_old(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); +#endif +#ifdef CONFIG_IPV6_EUI64 + addrconf_addr_solict_mult_new(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); +#endif } +static void addrconf_leave_solict(struct device *dev, struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + +#ifndef CONFIG_IPV6_NO_PB + addrconf_addr_solict_mult_old(addr, &maddr); + ipv6_dev_mc_dec(dev, &maddr); +#endif +#ifdef CONFIG_IPV6_EUI64 + addrconf_addr_solict_mult_new(addr, &maddr); + ipv6_dev_mc_dec(dev, &maddr); +#endif +} + + #ifdef CONFIG_IPV6_EUI64 static int ipv6_generate_eui64(u8 *eui, struct device *dev) { @@ -462,6 +555,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, rtmsg.rtmsg_ifindex = dev->ifindex; rtmsg.rtmsg_info = info; rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; /* Prevent useless cloning on PtP SIT. 
This thing is done here expecting that the whole @@ -469,12 +563,8 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, */ if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) rtmsg.rtmsg_flags |= RTF_NONEXTHOP; - rtmsg.rtmsg_type = RTMSG_NEWROUTE; ip6_route_add(&rtmsg, &err); - - if (err) - printk(KERN_DEBUG "IPv6: error %d adding prefix route\n", err); } /* Create "default" multicast route to the interface */ @@ -482,7 +572,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, static void addrconf_add_mroute(struct device *dev) { struct in6_rtmsg rtmsg; - struct rt6_info *rt; int err; memset(&rtmsg, 0, sizeof(rtmsg)); @@ -493,25 +582,12 @@ static void addrconf_add_mroute(struct device *dev) rtmsg.rtmsg_ifindex = dev->ifindex; rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; rtmsg.rtmsg_type = RTMSG_NEWROUTE; - - rt = ip6_route_add(&rtmsg, &err); - - /* - * Pedro makes interesting thing here, he attached - * fake nexthop to multicast route. - * It is trick to avoid cloning, ugly, but efficient. --ANK - */ - - if (err) - printk(KERN_DEBUG "IPv6: error %d adding mroute\n", err); - else - rt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); + ip6_route_add(&rtmsg, &err); } static void sit_route_add(struct device *dev) { struct in6_rtmsg rtmsg; - struct rt6_info *rt; int err; memset(&rtmsg, 0, sizeof(rtmsg)); @@ -521,19 +597,10 @@ static void sit_route_add(struct device *dev) /* prefix length - 96 bytes "::d.d.d.d" */ rtmsg.rtmsg_dst_len = 96; - rtmsg.rtmsg_flags = RTF_UP; + rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; rtmsg.rtmsg_ifindex = dev->ifindex; - rt = ip6_route_add(&rtmsg, &err); - - /* See comment in addrconf_add_mroute. - * It is the same trick, but to avoid cloning for direct - * sit routes i.e. IPv4 comaptible destinations. 
- */ - if (err) - printk(KERN_DEBUG "sit_route_add: error %d in route_add\n", err); - else - rt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); + ip6_route_add(&rtmsg, &err); } static void addrconf_add_lroute(struct device *dev) @@ -546,24 +613,16 @@ static void addrconf_add_lroute(struct device *dev) static struct inet6_dev *addrconf_add_dev(struct device *dev) { - struct in6_addr maddr; struct inet6_dev *idev; - if ((idev = ipv6_get_idev(dev)) == NULL) { - idev = ipv6_add_dev(dev); - if (idev == NULL) - return NULL; - } + if ((idev = ipv6_find_idev(dev)) == NULL) + return NULL; /* Add default multicast route */ addrconf_add_mroute(dev); /* Add link local route */ addrconf_add_lroute(dev); - - /* Join to all nodes multicast group. */ - ipv6_addr_all_nodes(&maddr); - ipv6_dev_mc_inc(dev, &maddr); return idev; } @@ -575,6 +634,12 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) __u32 prefered_lft; int addr_type; unsigned long rt_expires; + struct inet6_dev *in6_dev = ipv6_get_idev(dev); + + if (in6_dev == NULL) { + printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + return; + } pinfo = (struct prefix_info *) opt; @@ -613,9 +678,15 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) * 2) Configure prefixes with the auto flag set */ - rt_expires = jiffies + valid_lft * HZ; - if (rt_expires < jiffies) - rt_expires = ~0; + /* Avoid arithemtic overflow. Really, we could + save rt_expires in seconds, likely valid_lft, + but it would require division in fib gc, that it + not good. 
+ */ + if (valid_lft >= 0x7FFFFFFF/HZ) + rt_expires = 0; + else + rt_expires = jiffies + valid_lft * HZ; rt = rt6_lookup(&pinfo->prefix, NULL, dev, RTF_LINKRT); @@ -633,7 +704,7 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) /* Try to figure out our local address for this prefix */ - if (pinfo->autoconf && ipv6_config.autoconf) { + if (pinfo->autoconf && in6_dev->cnf.autoconf) { struct inet6_ifaddr * ifp; struct in6_addr addr; int plen; @@ -660,18 +731,12 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) return; ok: - ifp = ipv6_chk_addr(&addr); + ifp = ipv6_chk_addr(&addr, dev, 1); - if (ifp == NULL && valid_lft) { - struct inet6_dev *in6_dev = ipv6_get_idev(dev); + if ((ifp == NULL || (ifp->flags&ADDR_INVALID)) && valid_lft) { - if (in6_dev == NULL) { - printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); - return; - } - - ifp = ipv6_add_addr(in6_dev, &addr, - addr_type & IPV6_ADDR_SCOPE_MASK); + if (ifp == NULL) + ifp = ipv6_add_addr(in6_dev, &addr, addr_type & IPV6_ADDR_SCOPE_MASK); if (ifp == NULL) return; @@ -687,9 +752,14 @@ ok: } if (ifp) { + int event = 0; ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = jiffies; + if (ifp->flags & ADDR_INVALID) + event = RTM_NEWADDR; + ifp->flags &= ~(ADDR_DEPRECATED|ADDR_INVALID); + ipv6_ifa_notify(event, ifp); } } } @@ -705,25 +775,26 @@ int addrconf_set_dstaddr(void *arg) struct device *dev; int err = -EINVAL; - if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) { - err = -EFAULT; + rtnl_lock(); + + err = -EFAULT; + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) goto err_exit; - } dev = dev_get_by_index(ireq.ifr6_ifindex); - if (dev == NULL) { - err = -ENODEV; + err = -ENODEV; + if (dev == NULL) goto err_exit; - } if (dev->type == ARPHRD_SIT) { struct ifreq ifr; mm_segment_t oldfs; struct ip_tunnel_parm p; + err = -EADDRNOTAVAIL; if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) - return -EADDRNOTAVAIL; + goto 
err_exit; memset(&p, 0, sizeof(p)); p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; @@ -747,27 +818,21 @@ int addrconf_set_dstaddr(void *arg) } err_exit: + rtnl_unlock(); return err; } /* * Manual configuration of address on an interface */ -int addrconf_add_ifaddr(void *arg) +static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen) { - struct inet6_dev *idev; - struct in6_ifreq ireq; struct inet6_ifaddr *ifp; + struct inet6_dev *idev; struct device *dev; int scope; - if (!suser()) - return -EPERM; - - if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) - return -EFAULT; - - if ((dev = dev_get_by_index(ireq.ifr6_ifindex)) == NULL) + if ((dev = dev_get_by_index(ifindex)) == NULL) return -ENODEV; if (!(dev->flags&IFF_UP)) @@ -776,49 +841,83 @@ int addrconf_add_ifaddr(void *arg) if ((idev = addrconf_add_dev(dev)) == NULL) return -ENOBUFS; - scope = ipv6_addr_scope(&ireq.ifr6_addr); + scope = ipv6_addr_scope(pfx); - if((ifp = ipv6_add_addr(idev, &ireq.ifr6_addr, scope)) == NULL) + if ((ifp = ipv6_add_addr(idev, pfx, scope)) == NULL) return -ENOMEM; - ifp->prefix_len = ireq.ifr6_prefixlen; + ifp->prefix_len = plen; ifp->flags |= ADDR_PERMANENT; addrconf_dad_start(ifp); return 0; } -int addrconf_del_ifaddr(void *arg) +static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen) { - struct in6_ifreq ireq; struct inet6_ifaddr *ifp; + struct inet6_dev *idev; struct device *dev; int scope; - struct inet6_dev *idev; - - if (!suser()) - return -EPERM; - if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) - return -EFAULT; - - if ((dev = dev_get_by_index(ireq.ifr6_ifindex)) == NULL) + if ((dev = dev_get_by_index(ifindex)) == NULL) return -ENODEV; if ((idev = ipv6_get_idev(dev)) == NULL) return -ENXIO; - scope = ipv6_addr_scope(&ireq.ifr6_addr); + scope = ipv6_addr_scope(pfx); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope && - (!memcmp(&ireq.ifr6_addr, &ifp->addr, sizeof(struct in6_addr)))) { + for (ifp = 
idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == scope && ifp->prefix_len == plen && + (!memcmp(pfx, &ifp->addr, sizeof(struct in6_addr)))) { ipv6_del_addr(ifp); - break; + + /* If the last address is deleted administratively, + disable IPv6 on this interface. + */ + + if (idev->addr_list == NULL) + addrconf_ifdown(idev->dev, 1); + return 0; } } + return -EADDRNOTAVAIL; +} - return 0; + +int addrconf_add_ifaddr(void *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!suser()) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_add(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen); + rtnl_unlock(); + return err; +} + +int addrconf_del_ifaddr(void *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!suser()) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_del(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen); + rtnl_unlock(); + return err; } static void sit_add_v4_addrs(struct inet6_dev *idev) @@ -843,7 +942,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (ifp) { ifp->flags |= ADDR_PERMANENT; ifp->prefix_len = 128; - ip6_rt_addr_add(&ifp->addr, idev->dev); + ipv6_ifa_notify(RTM_NEWADDR, ifp); } return; } @@ -876,7 +975,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) else ifp->prefix_len = 96; ifp->flags |= ADDR_PERMANENT; - ip6_rt_addr_add(&ifp->addr, dev); + ipv6_ifa_notify(RTM_NEWADDR, ifp); } } } @@ -887,16 +986,13 @@ static void init_loopback(struct device *dev) struct in6_addr addr; struct inet6_dev *idev; struct inet6_ifaddr * ifp; - int err; /* ::1 */ memset(&addr, 0, sizeof(struct in6_addr)); addr.s6_addr[15] = 1; - idev = ipv6_add_dev(dev); - - if (idev == NULL) { + if ((idev = ipv6_find_idev(dev)) == NULL) { printk(KERN_DEBUG "init loopback: add_dev failed\n"); return; } @@ -909,10 +1005,9 @@ static void init_loopback(struct device *dev) } 
ifp->flags |= ADDR_PERMANENT; + ifp->prefix_len = 128; - err = ip6_rt_addr_add(&addr, dev); - if (err) - printk(KERN_DEBUG "init_loopback: error in route_add\n"); + ipv6_ifa_notify(RTM_NEWADDR, ifp); } static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) @@ -932,7 +1027,6 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr static void addrconf_dev_config(struct device *dev) { struct in6_addr addr; - struct in6_addr maddr; struct inet6_dev * idev; if (dev->type != ARPHRD_ETHER) { @@ -964,17 +1058,6 @@ static void addrconf_dev_config(struct device *dev) dev->dev_addr, dev->addr_len); addrconf_add_linklocal(idev, &addr); #endif - - if (ipv6_config.forwarding) { - idev->router = 1; - - /* It is wrong. - It is routing daemon or radvd that must make it, - rather than kernel. - */ - ipv6_addr_all_routers(&maddr); - ipv6_dev_mc_inc(dev, &maddr); - } } static void addrconf_sit_config(struct device *dev) @@ -987,8 +1070,7 @@ static void addrconf_sit_config(struct device *dev) * our v4 addrs in the tunnel */ - idev = ipv6_add_dev(dev); - if (idev == NULL) { + if ((idev = ipv6_find_idev(dev)) == NULL) { printk(KERN_DEBUG "init sit: add_dev failed\n"); return; } @@ -1026,78 +1108,99 @@ int addrconf_notify(struct notifier_block *this, unsigned long event, break; }; +#ifdef CONFIG_IPV6_NETLINK rt6_sndmsg(RTMSG_NEWDEVICE, NULL, NULL, NULL, dev, 0, 0, 0, 0); +#endif break; + case NETDEV_CHANGEMTU: + /* BUGGG... Should scan FIB to change pmtu on routes. --ANK */ + if (dev->mtu >= 576) + break; + + /* MTU falled under 576. Stop IPv6 on this interface. */ + case NETDEV_DOWN: + case NETDEV_UNREGISTER: /* - * Remove all addresses from this interface - * and take the interface out of the list. + * Remove all addresses from this interface. 
*/ - if (addrconf_ifdown(dev) == 0) { -#if 0 - rt6_ifdown(dev); -#endif + if (addrconf_ifdown(dev, event != NETDEV_DOWN) == 0) { +#ifdef CONFIG_IPV6_NETLINK rt6_sndmsg(RTMSG_DELDEVICE, NULL, NULL, NULL, dev, 0, 0, 0, 0); +#endif } break; + case NETDEV_CHANGE: + break; }; - + return NOTIFY_OK; } -static int addrconf_ifdown(struct device *dev) +static int addrconf_ifdown(struct device *dev, int how) { struct inet6_dev *idev, **bidev; struct inet6_ifaddr *ifa, **bifa; int i, hash; - start_bh_atomic(); + rt6_ifdown(dev); + neigh_ifdown(&nd_tbl, dev); - hash = ipv6_devindex_hash(dev->ifindex); - bidev = &inet6_dev_lst[hash]; + idev = ipv6_get_idev(dev); + if (idev == NULL) + return -ENODEV; - for (idev = inet6_dev_lst[hash]; idev; idev = idev->next) { - if (idev->dev == dev) { - *bidev = idev->next; - break; - } - bidev = &idev->next; - } + start_bh_atomic(); - if (idev == NULL) { - end_bh_atomic(); + /* Discard multicast list */ - printk(KERN_DEBUG "addrconf_ifdown: invalid device %p\n",dev); - return -ENODEV; - } + if (how == 1) + ipv6_mc_destroy_dev(idev); + else + ipv6_mc_down(idev); - /* - * FIXME: clear multicast group membership - */ + /* Discard address list */ + + idev->addr_list = NULL; /* - * clean addr_list + * Clean addresses hash table */ for (i=0; i<16; i++) { bifa = &inet6_addr_lst[i]; - for (ifa=inet6_addr_lst[i]; ifa; ) { + while ((ifa = *bifa) != NULL) { if (ifa->idev == idev) { *bifa = ifa->lst_next; del_timer(&ifa->timer); + ipv6_ifa_notify(RTM_DELADDR, ifa); kfree(ifa); - ifa = *bifa; continue; } bifa = &ifa->lst_next; - ifa = *bifa; } } - kfree(idev); + /* Delete device from device hash table (if unregistered) */ + + if (how == 1) { + hash = ipv6_devindex_hash(dev->ifindex); + + for (bidev = &inet6_dev_lst[hash]; (idev=*bidev) != NULL; bidev = &idev->next) { + if (idev->dev == dev) { + *bidev = idev->next; + neigh_parms_release(&nd_tbl, idev->nd_parms); +#ifdef CONFIG_SYSCTL + addrconf_sysctl_unregister(&idev->cnf); +#endif + kfree(idev); + break; 
+ } + } + } end_bh_atomic(); return 0; } @@ -1109,7 +1212,7 @@ static void addrconf_rs_timer(unsigned long data) ifp = (struct inet6_ifaddr *) data; - if (ipv6_config.forwarding) + if (ifp->idev->cnf.forwarding) return; if (ifp->idev->if_flags & IF_RA_RCVD) { @@ -1120,19 +1223,16 @@ static void addrconf_rs_timer(unsigned long data) return; } - if (ifp->probes++ <= ipv6_config.rtr_solicits) { + if (ifp->probes++ <= ifp->idev->cnf.rtr_solicits) { struct in6_addr all_routers; - ipv6_addr_set(&all_routers, - __constant_htonl(0xff020000U), 0, 0, - __constant_htonl(0x2U)); + ipv6_addr_all_routers(&all_routers); - ndisc_send_rs(ifp->idev->dev, &ifp->addr, - &all_routers); + ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers); ifp->timer.function = addrconf_rs_timer; ifp->timer.expires = (jiffies + - ipv6_config.rtr_solicit_interval); + ifp->idev->cnf.rtr_solicit_interval); add_timer(&ifp->timer); } else { struct in6_rtmsg rtmsg; @@ -1158,7 +1258,6 @@ static void addrconf_rs_timer(unsigned long data) */ static void addrconf_dad_start(struct inet6_ifaddr *ifp) { - static int rand_seed = 1; struct device *dev; unsigned long rand_num; @@ -1177,15 +1276,12 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp) return; } - if (rand_seed) { - rand_seed = 0; - nd_rand_seed = ifp->addr.s6_addr32[3]; - } + net_srandom(ifp->addr.s6_addr32[3]); - ifp->probes = ipv6_config.dad_transmits; + ifp->probes = ifp->idev->cnf.dad_transmits; ifp->flags |= DAD_INCOMPLETE; - rand_num = ipv6_random() % ipv6_config.rtr_solicit_delay; + rand_num = net_random() % ifp->idev->cnf.rtr_solicit_delay; ifp->timer.function = addrconf_dad_timer; ifp->timer.expires = jiffies + rand_num; @@ -1215,11 +1311,16 @@ static void addrconf_dad_timer(unsigned long data) /* send a neighbour solicitation for our addr */ memset(&unspec, 0, sizeof(unspec)); - addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - +#ifdef CONFIG_IPV6_EUI64 + addrconf_addr_solict_mult_new(&ifp->addr, &mcaddr); 
ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec); +#endif +#ifndef CONFIG_IPV6_NO_PB + addrconf_addr_solict_mult_old(&ifp->addr, &mcaddr); + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec); +#endif - ifp->timer.expires = jiffies + ipv6_config.rtr_solicit_interval; + ifp->timer.expires = jiffies + ifp->idev->cnf.rtr_solicit_interval; add_timer(&ifp->timer); } @@ -1231,20 +1332,18 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) * Configure the address for reception. Now it is valid. */ - ip6_rt_addr_add(&ifp->addr, dev); + ipv6_ifa_notify(RTM_NEWADDR, ifp); /* If added prefix is link local and forwarding is off, start sending router solicitations. */ - if (ipv6_config.forwarding == 0 && + if (ifp->idev->cnf.forwarding == 0 && (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) == 0 && (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { struct in6_addr all_routers; - ipv6_addr_set(&all_routers, - __constant_htonl(0xff020000U), 0, 0, - __constant_htonl(0x2U)); + ipv6_addr_all_routers(&all_routers); /* * If a host as already performed a random delay @@ -1256,7 +1355,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) ifp->probes = 1; ifp->timer.function = addrconf_rs_timer; ifp->timer.expires = (jiffies + - ipv6_config.rtr_solicit_interval); + ifp->idev->cnf.rtr_solicit_interval); ifp->idev->if_flags |= IF_RS_SENT; add_timer(&ifp->timer); } @@ -1319,52 +1418,365 @@ void addrconf_verify(unsigned long foo) for (i=0; i < IN6_ADDR_HSIZE; i++) { for (ifp=inet6_addr_lst[i]; ifp;) { + if (ifp->flags & ADDR_INVALID) { + struct inet6_ifaddr *bp = ifp; + ifp= ifp->lst_next; + ipv6_del_addr(bp); + continue; + } if (!(ifp->flags & ADDR_PERMANENT)) { struct inet6_ifaddr *bp; unsigned long age; age = (now - ifp->tstamp) / HZ; - if (age > ifp->prefered_lft) - ifp->flags |= ADDR_DEPRECATED; - bp = ifp; - ifp=ifp->lst_next; + ifp= ifp->lst_next; if (age > bp->valid_lft) ipv6_del_addr(bp); + else if (age > bp->prefered_lft) { + 
bp->flags |= ADDR_DEPRECATED; + ipv6_ifa_notify(0, bp); + } continue; } - ifp=ifp->lst_next; + ifp = ifp->lst_next; } } addr_chk_timer.expires = jiffies + ADDR_CHECK_FREQUENCY; - add_timer(&addr_chk_timer); + add_timer(&addr_chk_timer); } -/* - * Init / cleanup code - */ +#ifdef CONFIG_RTNETLINK -__initfunc(void addrconf_init(void)) +static int +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { -#ifdef MODULE - struct device *dev; + struct rtattr **rta = arg; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in6_addr *pfx; + + pfx = NULL; + if (rta[IFA_ADDRESS-1]) { + if (RTA_PAYLOAD(rta[IFA_ADDRESS-1]) < sizeof(*pfx)) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_ADDRESS-1]); + } + if (rta[IFA_LOCAL-1]) { + if (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx))) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_LOCAL-1]); + } + + return inet6_addr_del(ifm->ifa_index, pfx, ifm->ifa_prefixlen); +} + +static int +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in6_addr *pfx; + + pfx = NULL; + if (rta[IFA_ADDRESS-1]) { + if (RTA_PAYLOAD(rta[IFA_ADDRESS-1]) < sizeof(*pfx)) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_ADDRESS-1]); + } + if (rta[IFA_LOCAL-1]) { + if (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx))) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_LOCAL-1]); + } + + return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); +} + +static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, + pid_t pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + struct ifa_cacheinfo ci; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = ifa->prefix_len; + ifm->ifa_flags = ifa->flags & ~ADDR_INVALID; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + if (ifa->scope&IFA_HOST) + 
ifm->ifa_scope = RT_SCOPE_HOST; + else if (ifa->scope&IFA_LINK) + ifm->ifa_scope = RT_SCOPE_LINK; + else if (ifa->scope&IFA_SITE) + ifm->ifa_scope = RT_SCOPE_SITE; + ifm->ifa_index = ifa->idev->dev->ifindex; + RTA_PUT(skb, IFA_ADDRESS, 16, &ifa->addr); + if (!(ifa->flags&IFA_F_PERMANENT)) { + ci.ifa_prefered = ifa->prefered_lft; + ci.ifa_valid = ifa->valid_lft; + if (ci.ifa_prefered != 0xFFFFFFFF) { + long tval = (jiffies - ifa->tstamp)/HZ; + ci.ifa_prefered -= tval; + if (ci.ifa_valid != 0xFFFFFFFF) + ci.ifa_valid -= tval; + } + RTA_PUT(skb, IFA_CACHEINFO, sizeof(ci), &ci); + } + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, ip_idx; + int s_idx, s_ip_idx; + struct inet6_ifaddr *ifa; + + s_idx = cb->args[0]; + s_ip_idx = ip_idx = cb->args[1]; + + for (idx=0; idx < IN6_ADDR_HSIZE; idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_ip_idx = 0; + start_bh_atomic(); + for (ifa=inet6_addr_lst[idx], ip_idx = 0; ifa; + ifa = ifa->lst_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet6_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR) <= 0) { + end_bh_atomic(); + goto done; + } + } + end_bh_atomic(); + } +done: + cb->args[0] = idx; + cb->args[1] = ip_idx; + + return skb->len; +} + +static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS); + return; + } + if (inet6_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC); +} + +static struct 
rtnetlink_link inet6_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + + { inet6_rtm_newaddr, NULL, }, + { inet6_rtm_deladdr, NULL, }, + { NULL, inet6_dump_ifaddr, }, + { NULL, NULL, }, + + { inet6_rtm_newroute, NULL, }, + { inet6_rtm_delroute, NULL, }, + { NULL, inet6_dump_fib, }, + { NULL, NULL, }, +}; #endif - /* - * init address and device hash lists - */ +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ +#ifdef CONFIG_RTNETLINK + inet6_ifa_notify(event ? : RTM_NEWADDR, ifp); +#endif + switch (event) { + case RTM_NEWADDR: + ip6_rt_addr_add(&ifp->addr, ifp->idev->dev); + break; + case RTM_DELADDR: + start_bh_atomic(); + addrconf_leave_solict(ifp->idev->dev, &ifp->addr); + if (ipv6_chk_addr(&ifp->addr, ifp->idev->dev, 0) == NULL) + ip6_rt_addr_del(&ifp->addr, ifp->idev->dev); + end_bh_atomic(); + break; + } +} - memset(inet6_addr_lst, 0, IN6_ADDR_HSIZE * sizeof(struct inet6_ifaddr *)); +#ifdef CONFIG_SYSCTL - memset(inet6_mcast_lst, 0, IN6_ADDR_HSIZE * sizeof(struct ifmcaddr6 *)); +static +int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + int *valp = ctl->data; + int val = *valp; + int ret; - memset(inet6_dev_lst, 0, IN6_ADDR_HSIZE * sizeof(struct inet6_dev *)); + ret = proc_dointvec(ctl, write, filp, buffer, lenp); + if (write && *valp != val && valp != &ipv6_devconf_dflt.forwarding) { + struct inet6_dev *idev = NULL; + + if (valp != &ipv6_devconf.forwarding) { + struct device *dev = dev_get_by_index(ctl->ctl_name); + if (dev) + idev = ipv6_get_idev(dev); + if (idev == NULL) + return ret; + } else + ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding; + + addrconf_forward_change(idev); + + if (*valp) + rt6_purge_dflt_routers(0); + } + + return ret; +} + +static struct addrconf_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table addrconf_vars[11]; + ctl_table addrconf_dev[2]; + ctl_table 
addrconf_conf_dir[2]; + ctl_table addrconf_proto_dir[2]; + ctl_table addrconf_root_dir[2]; +} addrconf_sysctl = { + NULL, + {{NET_IPV6_FORWARDING, "forwarding", + &ipv6_devconf.forwarding, sizeof(int), 0644, NULL, + &addrconf_sysctl_forward}, + + {NET_IPV6_HOP_LIMIT, "hop_limit", + &ipv6_devconf.hop_limit, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_MTU, "mtu", + &ipv6_devconf.mtu6, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_ACCEPT_RA, "accept_ra", + &ipv6_devconf.accept_ra, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects", + &ipv6_devconf.accept_redirects, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_AUTOCONF, "autoconf", + &ipv6_devconf.autoconf, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_DAD_TRANSMITS, "dad_transmits", + &ipv6_devconf.dad_transmits, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_RTR_SOLICITS, "router_solicitations", + &ipv6_devconf.rtr_solicits, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval", + &ipv6_devconf.rtr_solicit_interval, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + + {NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay", + &ipv6_devconf.rtr_solicit_delay, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + + {0}}, + + {{NET_PROTO_CONF_ALL, "all", NULL, 0, 0555, addrconf_sysctl.addrconf_vars},{0}}, + {{NET_IPV6_CONF, "conf", NULL, 0, 0555, addrconf_sysctl.addrconf_dev},{0}}, + {{NET_IPV6, "ipv6", NULL, 0, 0555, addrconf_sysctl.addrconf_conf_dir},{0}}, + {{CTL_NET, "net", NULL, 0, 0555, addrconf_sysctl.addrconf_proto_dir},{0}} +}; + +static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) +{ + int i; + struct device *dev = idev ? 
idev->dev : NULL; + struct addrconf_sysctl_table *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return; + memcpy(t, &addrconf_sysctl, sizeof(*t)); + for (i=0; i<sizeof(t->addrconf_vars)/sizeof(t->addrconf_vars[0])-1; i++) { + t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; + t->addrconf_vars[i].de = NULL; + } + if (dev) { + t->addrconf_dev[0].procname = dev->name; + t->addrconf_dev[0].ctl_name = dev->ifindex; + } else { + t->addrconf_dev[0].procname = "default"; + t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + } + t->addrconf_dev[0].child = t->addrconf_vars; + t->addrconf_dev[0].de = NULL; + t->addrconf_conf_dir[0].child = t->addrconf_dev; + t->addrconf_conf_dir[0].de = NULL; + t->addrconf_proto_dir[0].child = t->addrconf_conf_dir; + t->addrconf_proto_dir[0].de = NULL; + t->addrconf_root_dir[0].child = t->addrconf_proto_dir; + t->addrconf_root_dir[0].de = NULL; + + t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0); + if (t->sysctl_header == NULL) + kfree(t); +} + +static void addrconf_sysctl_unregister(struct ipv6_devconf *p) +{ + if (p->sysctl) { + struct addrconf_sysctl_table *t = p->sysctl; + p->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} + + +#endif + +/* + * Init / cleanup code + */ + +__initfunc(void addrconf_init(void)) +{ #ifdef MODULE + struct device *dev; + /* This takes sense only during module load. 
*/ for (dev = dev_base; dev; dev = dev->next) { @@ -1390,6 +1802,14 @@ __initfunc(void addrconf_init(void)) addr_chk_timer.expires = jiffies + ADDR_CHECK_FREQUENCY; add_timer(&addr_chk_timer); +#ifdef CONFIG_RTNETLINK + rtnetlink_links[AF_INET6] = inet6_rtnetlink_table; +#endif +#ifdef CONFIG_SYSCTL + addrconf_sysctl.sysctl_header = + register_sysctl_table(addrconf_sysctl.addrconf_root_dir, 0); + addrconf_sysctl_register(NULL, &ipv6_devconf_dflt); +#endif } #ifdef MODULE @@ -1399,6 +1819,14 @@ void addrconf_cleanup(void) struct inet6_ifaddr *ifa; int i; +#ifdef CONFIG_RTNETLINK + rtnetlink_links[AF_INET6] = NULL; +#endif +#ifdef CONFIG_SYSCTL + addrconf_sysctl_unregister(&ipv6_devconf_dflt); + addrconf_sysctl_unregister(&ipv6_devconf); +#endif + del_timer(&addr_chk_timer); /* @@ -1409,10 +1837,11 @@ void addrconf_cleanup(void) struct inet6_dev *next; for (idev = inet6_dev_lst[i]; idev; idev = next) { next = idev->next; - addrconf_ifdown(idev->dev); + addrconf_ifdown(idev->dev, 1); } } + start_bh_atomic(); /* * clean addr_list */ @@ -1423,9 +1852,13 @@ void addrconf_cleanup(void) bifa = ifa; ifa = ifa->lst_next; - kfree(bifa); + printk(KERN_DEBUG "bug: IPv6 address leakage detected: ifa=%p\n", bifa); + /* Do not free it; something is wrong. + Now we can investigate it with debugger. 
+ */ } } + end_bh_atomic(); #ifdef CONFIG_PROC_FS proc_net_unregister(iface_proc_entry.low_ino); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9f707272f..b0a0eb702 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.23 1997/10/29 20:27:52 kuznet Exp $ + * $Id: af_inet6.c,v 1.24 1997/12/13 21:53:08 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -116,8 +116,8 @@ static int inet6_create(struct socket *sock, int protocol) sk->timer.data = (unsigned long)sk; sk->timer.function = &net_timer; - sk->net_pinfo.af_inet6.hop_limit = ipv6_config.hop_limit; - sk->net_pinfo.af_inet6.mcast_hops = IPV6_DEFAULT_MCASTHOPS; + sk->net_pinfo.af_inet6.hop_limit = -1; + sk->net_pinfo.af_inet6.mcast_hops = -1; sk->net_pinfo.af_inet6.mc_loop = 1; /* Init the ipv4 part of the socket since we can have sockets @@ -209,7 +209,7 @@ static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (ipv6_chk_addr(&addr->sin6_addr) == NULL) + if (ipv6_chk_addr(&addr->sin6_addr, NULL, 0) == NULL) return(-EADDRNOTAVAIL); } } @@ -282,7 +282,7 @@ static int inet6_getname(struct socket *sock, struct sockaddr *uaddr, static int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; - int err; + int err = -EINVAL; int pid; switch(cmd) @@ -318,47 +318,6 @@ static int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return(ipv6_route_ioctl(cmd,(void *)arg)); - case SIOCGIFCONF: - case SIOCGIFFLAGS: - case SIOCSIFFLAGS: - case SIOCADDMULTI: - case SIOCDELMULTI: -/* - - this ioctls deal with addresses - must process the addr info before - calling dev_ioctl to perform dev specific functions - - case SIOCGIFADDR: - case SIOCSIFADDR: - - - case 
SIOCGIFDSTADDR: - - case SIOCGIFBRDADDR: - case SIOCSIFBRDADDR: - case SIOCGIFNETMASK: - case SIOCSIFNETMASK: - */ - - case SIOCGIFMETRIC: - case SIOCSIFMETRIC: - case SIOCGIFMEM: - case SIOCSIFMEM: - case SIOCGIFMTU: - case SIOCSIFMTU: - case SIOCSIFLINK: - case SIOCGIFHWADDR: - case SIOCSIFHWADDR: - case SIOCSIFMAP: - case SIOCGIFMAP: - case SIOCSIFSLAVE: - case SIOCGIFSLAVE: - case SIOCGIFINDEX: - case SIOCGIFNAME: - case SIOCGIFCOUNT: - return(dev_ioctl(cmd,(void *) arg)); - case SIOCSIFADDR: return addrconf_add_ifaddr((void *) arg); case SIOCDIFADDR: @@ -370,9 +329,9 @@ static int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) (cmd <= (SIOCDEVPRIVATE + 15))) return(dev_ioctl(cmd,(void *) arg)); - if (sk->prot->ioctl==NULL) - return(-EINVAL); - return(sk->prot->ioctl(sk, cmd, arg)); + if(sk->prot->ioctl==0 || (err=sk->prot->ioctl(sk, cmd, arg))==-ENOIOCTLCMD) + return(dev_ioctl(cmd,(void *) arg)); + return err; } /*NOTREACHED*/ return(0); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 90f7b25d9..875e0f2ed 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: datagram.c,v 1.12 1997/05/15 18:55:09 davem Exp $ + * $Id: datagram.c,v 1.13 1997/12/13 21:53:09 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -89,7 +89,7 @@ int datagram_send_ctl(struct msghdr *msg, struct device **src_dev, if (!ipv6_addr_any(&src_info->ipi6_addr)) { struct inet6_ifaddr *ifp; - ifp = ipv6_chk_addr(&src_info->ipi6_addr); + ifp = ipv6_chk_addr(&src_info->ipi6_addr, *src_dev, 0); if (ifp == NULL) { err = -EINVAL; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index b2380fb78..6b7508666 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -77,7 +77,7 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, pos += 1; icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 
0, pos, dev); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -94,7 +94,7 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, pos += 3; icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -107,7 +107,7 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, addr_type = ipv6_addr_type(addr); if (addr_type == IPV6_ADDR_MULTICAST) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 28d9af57e..b84dc9268 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.11 1997/09/20 20:48:26 davem Exp $ + * $Id: icmp.c,v 1.12 1997/12/13 21:53:10 kuznet Exp $ * * Based on net/ipv4/icmp.c * @@ -179,7 +179,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, addr_type = ipv6_addr_type(&hdr->daddr); - if (ipv6_chk_addr(&hdr->daddr)) + if (ipv6_chk_addr(&hdr->daddr, NULL, 0)) saddr = &hdr->daddr; /* @@ -499,7 +499,7 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, }; discard_it: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6c9f24492..15ce420ac 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fib.c,v 1.9 1997/09/20 20:48:27 davem Exp $ + * $Id: ip6_fib.c,v 1.10 1997/12/13 21:53:10 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -45,8 +45,6 @@ struct rt6_statistics rt6_stats; static __u32 rt_sernum = 0; -static void fib6_run_gc(unsigned long); - static struct timer_list ip6_fib_timer = { NULL, NULL, 0, @@ -182,6 +180,16 @@ static __inline__ void node_free(struct fib6_node * fn) kfree(fn); } +extern __inline__ void rt6_release(struct rt6_info *rt) +{ + 
struct dst_entry *dst = (struct dst_entry *) rt; + if (atomic_dec_and_test(&dst->refcnt)) { + rt->rt6i_node = NULL; + dst_free(dst); + } +} + + /* * Routing Table * @@ -409,8 +417,12 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) if ((iter->rt6i_dev == rt->rt6i_dev) && (iter->rt6i_flowr == rt->rt6i_flowr) && (ipv6_addr_cmp(&iter->rt6i_gateway, - &rt->rt6i_gateway) == 0)) + &rt->rt6i_gateway) == 0)) { + if (rt->rt6i_expires == 0 || + (long)(rt->rt6i_expires - iter->rt6i_expires) > 0) + rt->rt6i_expires = iter->rt6i_expires; return -EEXIST; + } } if (iter->rt6i_metric > rt->rt6i_metric) @@ -426,6 +438,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) *ins = rt; rt->u.next = iter; atomic_inc(&rt->rt6i_ref); +#ifdef CONFIG_RTNETLINK + inet6_rt_notify(RTM_NEWROUTE, rt); +#endif rt6_stats.fib_rt_entries++; if ((fn->fn_flags & RTN_RTINFO) == 0) { @@ -440,7 +455,8 @@ static __inline__ void fib6_start_gc(struct rt6_info *rt) { if ((ip6_fib_timer.expires == 0) && (rt->rt6i_flags & (RTF_ADDRCONF | RTF_CACHE))) { - ip6_fib_timer.expires = jiffies + ipv6_config.rt_gc_period; + del_timer(&ip6_fib_timer); + ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval; add_timer(&ip6_fib_timer); } } @@ -513,6 +529,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt) if (err == 0) fib6_start_gc(rt); out: + if (err) + dst_free(&rt->u.dst); return err; } @@ -782,7 +800,11 @@ static struct fib6_node * fib6_del_1(struct rt6_info *rt) */ *back = lf->u.next; +#ifdef CONFIG_RTNETLINK + inet6_rt_notify(RTM_DELROUTE, lf); +#endif rt6_release(lf); + rt6_stats.fib_rt_entries--; return fn; } back = &lf->u.next; @@ -810,14 +832,19 @@ int fib6_del(struct rt6_info *rt) /* * Tree transversal function * + * Wau... It is NOT REENTERABLE!!!!!!! It is cathastrophe. 
--ANK */ +int fib6_walk_count; + void fib6_walk_tree(struct fib6_node *root, f_pnode func, void *arg, int filter) { struct fib6_node *fn; fn = root; + + fib6_walk_count++; do { if (!(fn->fn_flags & RTN_TAG)) { @@ -858,6 +885,8 @@ void fib6_walk_tree(struct fib6_node *root, f_pnode func, void *arg, } while (!(fn->fn_flags & RTN_TAG)); } while (!(fn->fn_flags & RTN_ROOT) || (fn->fn_flags & RTN_TAG)); + + fib6_walk_count--; } /* @@ -874,7 +903,7 @@ static int fib6_gc_node(struct fib6_node *fn, int timeout) for (rt = fn->leaf; rt;) { if ((rt->rt6i_flags & RTF_CACHE) && atomic_read(&rt->rt6i_use) == 0) { - if (now - rt->rt6i_tstamp > timeout) { + if ((long)(now - rt->rt6i_tstamp) >= timeout) { struct rt6_info *old; old = rt; @@ -884,6 +913,10 @@ static int fib6_gc_node(struct fib6_node *fn, int timeout) *back = rt; old->rt6i_node = NULL; +#ifdef CONFIG_RTNETLINK + inet6_rt_notify(RTM_DELROUTE, old); +#endif + old->u.dst.obsolete = 1; rt6_release(old); rt6_stats.fib_rt_entries--; continue; @@ -893,7 +926,28 @@ static int fib6_gc_node(struct fib6_node *fn, int timeout) /* * check addrconf expiration here. + * + * BUGGGG Crossing fingers and ... 
+ * Seems, radix tree walking is absolutely broken, + * but we will try in any case --ANK */ + if (rt->rt6i_expires && (long)(now - rt->rt6i_expires) < 0) { + struct rt6_info *old; + + old = rt; + rt = rt->u.next; + + *back = rt; + + old->rt6i_node = NULL; +#ifdef CONFIG_RTNETLINK + inet6_rt_notify(RTM_DELROUTE, old); +#endif + old->u.dst.obsolete = 1; + rt6_release(old); + rt6_stats.fib_rt_entries--; + continue; + } back = &rt->u.next; rt = rt->u.next; } @@ -987,17 +1041,25 @@ static void fib6_garbage_collect(struct fib6_node *fn, void *p_arg) } } -static void fib6_run_gc(unsigned long dummy) +void fib6_run_gc(unsigned long dummy) { struct fib6_gc_args arg = { - ipv6_config.rt_cache_timeout, + ip6_rt_gc_timeout, 0 }; - fib6_walk_tree(&ip6_routing_table, fib6_garbage_collect, &arg, 0); + del_timer(&ip6_fib_timer); + + if (dummy) + arg.timeout = dummy; + + if (fib6_walk_count == 0) + fib6_walk_tree(&ip6_routing_table, fib6_garbage_collect, &arg, 0); + else + arg.more = 1; if (arg.more) { - ip6_fib_timer.expires = jiffies + ipv6_config.rt_gc_period; + ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval; add_timer(&ip6_fib_timer); } else { ip6_fib_timer.expires = 0; diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c index ddce1ccfa..7316a30f1 100644 --- a/net/ipv6/ip6_fw.c +++ b/net/ipv6/ip6_fw.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fw.c,v 1.7 1997/10/06 23:09:54 davem Exp $ + * $Id: ip6_fw.c,v 1.8 1997/12/13 21:53:11 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -266,14 +266,14 @@ int ip6_fw_reject(struct sk_buff *skb) * send it via netlink, as (rule, skb) */ - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } int ip6_fw_discard(struct sk_buff *skb) { printk(KERN_DEBUG "ip6_fw: BUG fw_reject called\n"); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -302,6 +302,7 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg) 
rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_POLICY; rt = ip6_route_add(&rtmsg, &err); + /* BUGGGG! rt can point to nowhere. */ if (rt == NULL) { ip6_fwrule_free(rl); return -ENOMEM; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 72ce290ae..ead32047a 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -121,7 +121,7 @@ static int ip6_dstopt_unknown(struct sk_buff *skb, struct ipv6_tlvtype *hdr) ICMPV6_UNK_OPTION, pos, skb->dev); }; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -183,7 +183,7 @@ int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) int pkt_len; if (skb->pkt_type == PACKET_OTHERHOST) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -204,7 +204,7 @@ int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) return 0; err: ipv6_statistics.Ip6InHdrErrors++; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -358,7 +358,7 @@ st_loop: offset = nhptr - (u8*) hdr; icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, offset, skb->dev); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } return 0; @@ -407,7 +407,7 @@ int ip6_mc_input(struct sk_buff *skb) } if (discard) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e0b20e066..67b81d041 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.5 1997/09/21 18:33:14 kuznet Exp $ + * $Id: ip6_output.c,v 1.7 1997/12/29 19:52:46 kuznet Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -35,32 +35,49 @@ static u32 ipv6_fragmentation_id = 1; -static void ipv6_build_mac_hdr(struct sk_buff *skb, struct dst_entry *dst, - int len) +int ip6_output(struct sk_buff *skb) { - struct device *dev; - - - dev = dst->dev; + struct dst_entry *dst = skb->dst; + struct device *dev = dst->dev; + struct hh_cache *hh = dst->hh; - skb->arp = 1; - - if 
(dev->hard_header) { - int mac; + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->dev = dev; - /* Maybe when Alexey has done his new magic I'll hack this - it seems to be worth 1-2% on IPv4 */ -#if 0 - if (dst->hh) - hh_copy_header(dst->hh, skb); -#endif - mac = dev->hard_header(skb, dev, ETH_P_IPV6, NULL, NULL, len); + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) { + if (!(dev->flags&IFF_LOOPBACK) && + (skb->sk == NULL || skb->sk->net_pinfo.af_inet6.mc_loop) && + ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr)) { + /* Do not check for IFF_ALLMULTI; multicast routing + is not supported in any case. + */ + dev_loopback_xmit(skb); - if (mac < 0) - skb->arp = 0; + if (skb->nh.ipv6h->hop_limit == 0) { + kfree_skb(skb); + return 0; + } + } } - - skb->mac.raw = skb->data; + + if (hh) { +#ifdef __alpha__ + /* Alpha has disguisting memcpy. Help it. */ + u64 *aligned_hdr = (u64*)(skb->data - 16); + u64 *aligned_hdr0 = hh->hh_data; + aligned_hdr[0] = aligned_hdr0[0]; + aligned_hdr[1] = aligned_hdr0[1]; +#else + memcpy(skb->data - 16, hh->hh_data, 16); +#endif + skb_push(skb, dev->hard_header_len); + return hh->hh_output(skb); + } else if (dst->neighbour) + return dst->neighbour->output(skb); + + printk(KERN_DEBUG "khm\n"); + kfree_skb(skb); + return -EINVAL; } /* @@ -78,14 +95,15 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, hdr = skb->nh.ipv6h; - if (sk) + if (sk) { np = &sk->net_pinfo.af_inet6; - if (np && np->dst) { - /* - * dst_check returns NULL if route is no longer valid - */ - dst = dst_check(&dst, np->dst_cookie); + if (sk->dst_cache) { + /* + * dst_check returns NULL if route is no longer valid + */ + dst = dst_check(&sk->dst_cache, np->dst_cookie); + } } if (dst == NULL) { @@ -95,24 +113,15 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, /* * NETUNREACH usually */ + dst_release(dst); return dst->error; } } skb->dst = dst_clone(dst); - skb->dev = dst->dev; seg_len = skb->tail - ((unsigned char *) hdr); - 
- /* - * Link Layer headers - */ - - skb->protocol = __constant_htons(ETH_P_IPV6); hdr = skb->nh.ipv6h; - ipv6_build_mac_hdr(skb, dst, seg_len); - - /* * Fill in the IPv6 header */ @@ -127,17 +136,21 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, hdr->payload_len = htons(seg_len - sizeof(struct ipv6hdr)); hdr->nexthdr = fl->proto; - hdr->hop_limit = np ? np->hop_limit : ipv6_config.hop_limit; - + if (np == NULL || np->hop_limit < 0) + hdr->hop_limit = ((struct rt6_info*)dst)->rt6i_hoplimit; + else + hdr->hop_limit = np->hop_limit; + ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); ipv6_statistics.Ip6OutRequests++; dst->output(skb); - if (sk) - ip6_dst_store(sk, dst); - else + if (sk) { + if (sk->dst_cache == NULL) + ip6_dst_store(sk, dst); + } else dst_release(dst); return 0; @@ -163,8 +176,6 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct device *dev, totlen = len + sizeof(struct ipv6hdr); - skb->mac.raw = skb->data; - hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); skb->nh.ipv6h = hdr; @@ -211,7 +222,7 @@ static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl, static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, struct dst_entry *dst, struct flowi *fl, struct ipv6_options *opt, - int hlimit, int flags, unsigned short length) + int hlimit, int flags, unsigned length) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6hdr *hdr; @@ -245,8 +256,6 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, payl_len += opt->opt_flen; } - nfrags = payl_len / ((dst->pmtu - unfrag_len) & ~0x7); - /* * Length of fragmented part on every packet but * the last must be an: @@ -255,6 +264,8 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, frag_len = (dst->pmtu - unfrag_len) & ~0x7; + nfrags = payl_len / frag_len; + /* * We must send from end to start because of * UDP/ICMP 
checksums. We do a funny trick: @@ -281,18 +292,9 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, return err; last_skb->dst = dst_clone(dst); - last_skb->dev = dst->dev; - last_skb->protocol = htons(ETH_P_IPV6); last_skb->when = jiffies; - last_skb->arp = 0; - /* - * build the mac header... - */ - if (dst->dev->hard_header_len) { - skb_reserve(last_skb, (dst->dev->hard_header_len + 15) & ~15); - ipv6_build_mac_hdr(last_skb, dst, unfrag_len + frag_len); - } + skb_reserve(last_skb, (dst->dev->hard_header_len + 15) & ~15); hdr = (struct ipv6hdr *) skb_put(last_skb, sizeof(struct ipv6hdr)); last_skb->nh.ipv6h = hdr; @@ -335,7 +337,9 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, struct frag_hdr *fhdr2; +#if 0 printk(KERN_DEBUG "sending frag %d\n", nfrags); +#endif skb = skb_copy(last_skb, sk->allocation); if (skb == NULL) @@ -356,7 +360,7 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, nfrags * frag_len, frag_len); if (err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); break; } @@ -366,11 +370,13 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, } if (err) { - kfree_skb(last_skb, FREE_WRITE); + kfree_skb(last_skb); return -EFAULT; } +#if 0 printk(KERN_DEBUG "sending last frag \n"); +#endif hdr->payload_len = htons(unfrag_len + last_len - sizeof(struct ipv6hdr)); @@ -383,18 +389,6 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, last_skb->tail += last_len; last_skb->len += last_len; - /* - * toss the mac header out and rebuild it. - * needed because of the different frame length. - * ie: not needed for an ethernet. 
- */ - - if (dst->dev->type != ARPHRD_ETHER && last_len != frag_len) { - skb_pull(last_skb, (unsigned char *)last_skb->nh.ipv6h - - last_skb->data); - ipv6_build_mac_hdr(last_skb, dst, unfrag_len + last_len); - } - ipv6_statistics.Ip6OutRequests++; dst->output(last_skb); @@ -402,7 +396,7 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, } int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, - struct flowi *fl, unsigned short length, + struct flowi *fl, unsigned length, struct ipv6_options *opt, int hlimit, int flags) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; @@ -419,8 +413,8 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, dst = NULL; - if (np->dst) - dst = dst_check(&np->dst, np->dst_cookie); + if (sk->dst_cache) + dst = dst_check(&sk->dst_cache, np->dst_cookie); if (dst == NULL) dst = ip6_route_output(sk, fl); @@ -449,13 +443,29 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, pktlength = length; - if (hlimit < 0) - hlimit = np->hop_limit; + if (hlimit < 0) { + if (ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit; + } if (!sk->ip_hdrincl) { pktlength += sizeof(struct ipv6hdr); if (opt) pktlength += opt->opt_flen + opt->opt_nflen; + + /* Due to conservative check made by caller, + pktlength cannot overflow here. 
+ + When (and if) jumbo option will be implemented + we could try soemething sort of: + + if (pktlength < length) return -EMSGSIZE; + + */ } if (pktlength <= dst->pmtu) { @@ -475,19 +485,13 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, dev = dst->dev; skb->dst = dst_clone(dst); - skb->dev = dev; - skb->protocol = htons(ETH_P_IPV6); skb->when = jiffies; - skb->arp = 0; - if (dev && dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - ipv6_build_mac_hdr(skb, dst, pktlength); - } + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); hdr = (struct ipv6hdr *) skb->tail; skb->nh.ipv6h = hdr; - + if (!sk->ip_hdrincl) { ip6_bld_1(sk, skb, fl, hlimit, pktlength); #if 0 @@ -511,14 +515,23 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, dst->output(skb); } else { err = -EFAULT; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } } else { if (sk->ip_hdrincl) return -EMSGSIZE; - + + /* pktlength includes IPv6 header, not included + in IPv6 payload length. + FIXME are non-fragmentable options included + in packet after defragmentation? If not, we + should subtract opt_nflen also. 
--ANK + */ + if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) + return -EMSGSIZE; + err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, hlimit, - flags, pktlength); + flags, length); } /* @@ -526,7 +539,7 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, */ out: - if (np->dst) + if (sk->dst_cache) ip6_dst_store(sk, dst); else dst_release(dst); @@ -540,8 +553,8 @@ int ip6_forward(struct sk_buff *skb) struct ipv6hdr *hdr = skb->nh.ipv6h; int size; - if (ipv6_config.forwarding == 0) { - kfree_skb(skb, FREE_READ); + if (ipv6_devconf.forwarding == 0) { + kfree_skb(skb); return -EINVAL; } @@ -560,7 +573,7 @@ int ip6_forward(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0, skb->dev); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -ETIMEDOUT; } @@ -569,7 +582,7 @@ int ip6_forward(struct sk_buff *skb) if (skb->dev == dst->dev && dst->neighbour) { struct in6_addr *target = NULL; struct rt6_info *rt; - struct nd_neigh *ndn = (struct nd_neigh *) dst->neighbour; + struct neighbour *n = dst->neighbour; /* * incoming and outgoing devices are the same @@ -578,7 +591,7 @@ int ip6_forward(struct sk_buff *skb) rt = (struct rt6_info *) dst; if ((rt->rt6i_flags & RTF_GATEWAY)) - target = &ndn->ndn_addr; + target = (struct in6_addr*)&n->primary_key; else target = &hdr->daddr; @@ -589,46 +602,17 @@ int ip6_forward(struct sk_buff *skb) if (size > dst->pmtu) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return -EMSGSIZE; } - skb->dev = dst->dev; - - /* - * Rebuild the mac header - */ - if (skb_headroom(skb) < dst->dev->hard_header_len) { - struct sk_buff *buff; - - buff = alloc_skb(dst->dev->hard_header_len + skb->len + 15, - GFP_ATOMIC); - - if (buff == NULL) { - kfree_skb(skb, FREE_WRITE); - return -ENOMEM; - } - - skb_reserve(buff, (dst->dev->hard_header_len + 15) & ~15); - - buff->protocol = __constant_htons(ETH_P_IPV6); - buff->h.raw = skb_put(buff, 
size); - buff->dst = dst_clone(dst); - buff->dev = dst->dev; - - memcpy(buff->h.raw, hdr, size); - buff->nh.ipv6h = (struct ipv6hdr *) buff->h.raw; - kfree_skb(skb, FREE_READ); - skb = buff; - } else { - skb_pull(skb, skb->nh.raw - skb->data); + if (skb_headroom(skb) < dst->dev->hard_header_len || skb_cloned(skb)) { + struct sk_buff *skb2; + skb2 = skb_realloc_headroom(skb, (dst->dev->hard_header_len + 15)&~15); + kfree_skb(skb); + skb = skb2; } - ipv6_build_mac_hdr(skb, dst, size); - - if (dst->neighbour) - ndisc_event_send(dst->neighbour, skb); - ipv6_statistics.Ip6ForwDatagrams++; dst->output(skb); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 98d8339b2..f2ef3fd76 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.15 1997/10/29 20:27:54 kuznet Exp $ + * $Id: ipv6_sockglue.c,v 1.16 1997/12/13 21:53:13 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -132,7 +132,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, break; case IPV6_UNICAST_HOPS: - if (val > 255) + if (val > 255 || val < -1) retv = -EINVAL; else { np->hop_limit = val; @@ -141,16 +141,18 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, break; case IPV6_MULTICAST_HOPS: - if (val > 255) + if (val > 255 || val < -1) retv = -EINVAL; else { np->mcast_hops = val; retv = 0; } break; + break; case IPV6_MULTICAST_LOOP: - np->mc_loop = val; + np->mc_loop = (val != 0); + retv = 0; break; case IPV6_MULTICAST_IF: @@ -166,7 +168,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, } else { struct inet6_ifaddr *ifp; - ifp = ipv6_chk_addr(&addr); + ifp = ipv6_chk_addr(&addr, NULL, 0); if (ifp == NULL) { retv = -EADDRNOTAVAIL; @@ -182,39 +184,16 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char 
*optval, case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; - struct device *dev = NULL; int err; err = copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)); if(err) return -EFAULT; - if (mreq.ipv6mr_ifindex == 0) { -#if 0 - struct in6_addr mcast; - struct dest_entry *dc; - - ipv6_addr_set(&mcast, __constant_htonl(0xff000000), - 0, 0, 0); - dc = ipv6_dst_route(&mcast, NULL, 0); - - if (dc) - { - dev = dc->rt.rt_dev; - ipv6_dst_unlock(dc); - } -#endif - } else { - dev = dev_get_by_index(mreq.ipv6mr_ifindex); - } - - if (dev == NULL) - return -ENODEV; - if (optname == IPV6_ADD_MEMBERSHIP) - retv = ipv6_sock_mc_join(sk, dev, &mreq.ipv6mr_multiaddr); + retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else - retv = ipv6_sock_mc_drop(sk, dev, &mreq.ipv6mr_multiaddr); + retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); } }; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index eae3efed6..3f881673c 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: mcast.c,v 1.11 1997/10/29 20:27:50 kuznet Exp $ + * $Id: mcast.c,v 1.13 1998/01/04 15:28:31 mj Exp $ * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c * @@ -16,6 +16,7 @@ */ #define __NO_VERSION__ +#include <linux/config.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> @@ -28,6 +29,7 @@ #include <linux/if_arp.h> #include <linux/route.h> #include <linux/init.h> +#include <linux/proc_fs.h> #include <net/sock.h> #include <net/snmp.h> @@ -37,6 +39,7 @@ #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/addrconf.h> +#include <net/ip6_route.h> #include <net/checksum.h> @@ -59,27 +62,24 @@ void igmp6_timer_handler(unsigned long data); #define IGMP6_UNSOLICITED_IVAL (10*HZ) /* + * Hash list of configured multicast addresses + */ +static struct ifmcaddr6 *inet6_mcast_lst[IN6_ADDR_HSIZE]; + +/* * socket join on multicast group */ -int ipv6_sock_mc_join(struct 
sock *sk, struct device *dev, - struct in6_addr *addr) +int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) { + struct device *dev = NULL; struct ipv6_mc_socklist *mc_lst; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; int err; - MDBG(("ipv6_sock_mc_join(%s) addr[", dev ? dev->name : "[NULL]")); - MDBG(("%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", - addr->s6_addr16[0], addr->s6_addr16[1], addr->s6_addr16[2], - addr->s6_addr16[3], addr->s6_addr16[4], addr->s6_addr16[5], - addr->s6_addr16[6], addr->s6_addr16[7])); if (!(ipv6_addr_type(addr) & IPV6_ADDR_MULTICAST)) return -EINVAL; - if(!(dev->flags & IFF_MULTICAST)) - return -EADDRNOTAVAIL; - mc_lst = kmalloc(sizeof(struct ipv6_mc_socklist), GFP_KERNEL); if (mc_lst == NULL) @@ -87,7 +87,20 @@ int ipv6_sock_mc_join(struct sock *sk, struct device *dev, mc_lst->next = NULL; memcpy(&mc_lst->addr, addr, sizeof(struct in6_addr)); - mc_lst->dev = dev; + mc_lst->ifindex = ifindex; + + if (ifindex == 0) { + struct rt6_info *rt; + rt = rt6_lookup(addr, NULL, NULL, 0); + if (rt) + dev = rt->rt6i_dev; + } else + dev = dev_get_by_index(ifindex); + + if (dev == NULL) { + kfree(mc_lst); + return -ENODEV; + } /* * now add/increase the group membership on the device @@ -109,30 +122,21 @@ int ipv6_sock_mc_join(struct sock *sk, struct device *dev, /* * socket leave on multicast group */ -int ipv6_sock_mc_drop(struct sock *sk, struct device *dev, - struct in6_addr *addr) +int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6_mc_socklist *mc_lst, **lnk; - lnk = &np->ipv6_mc_list; - - MDBG(("ipv6_sock_mc_drop(%s) addr[", dev ? 
dev->name : "[NULL]")); - MDBG(("%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", - addr->s6_addr16[0], addr->s6_addr16[1], addr->s6_addr16[2], - addr->s6_addr16[3], addr->s6_addr16[4], addr->s6_addr16[5], - addr->s6_addr16[6], addr->s6_addr16[7])); - - for (mc_lst = *lnk ; mc_lst; mc_lst = mc_lst->next) { - if (mc_lst->dev == dev && + for (lnk = &np->ipv6_mc_list; (mc_lst = *lnk) !=NULL ; lnk = &mc_lst->next) { + if (mc_lst->ifindex == ifindex && ipv6_addr_cmp(&mc_lst->addr, addr) == 0) { + struct device *dev; *lnk = mc_lst->next; - ipv6_dev_mc_dec(mc_lst->dev, &mc_lst->addr); + if ((dev = dev_get_by_index(ifindex)) != NULL) + ipv6_dev_mc_dec(dev, &mc_lst->addr); kfree(mc_lst); - return 0; } - lnk = &mc_lst->next; } return -ENOENT; @@ -143,21 +147,48 @@ void ipv6_sock_mc_close(struct sock *sk) struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6_mc_socklist *mc_lst; - for (mc_lst = np->ipv6_mc_list; mc_lst; ) { - struct ipv6_mc_socklist *back; + while ((mc_lst = np->ipv6_mc_list) != NULL) { + struct device *dev = dev_get_by_index(mc_lst->ifindex); - /* - * leave group - */ + if (dev) + ipv6_dev_mc_dec(dev, &mc_lst->addr); - ipv6_dev_mc_dec(mc_lst->dev, &mc_lst->addr); + np->ipv6_mc_list = mc_lst->next; + kfree(mc_lst); + } +} - back = mc_lst; - mc_lst = mc_lst->next; - kfree(back); +static int igmp6_group_added(struct ifmcaddr6 *mc) +{ + char buf[MAX_ADDR_LEN]; + + if (!(mc->mca_flags&MAF_LOADED)) { + mc->mca_flags |= MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, mc->dev, 0) == 0) + dev_mc_add(mc->dev, buf, mc->dev->addr_len, 0); } + + if (mc->dev->flags&IFF_UP) + igmp6_join_group(mc); + return 0; } +static int igmp6_group_dropped(struct ifmcaddr6 *mc) +{ + char buf[MAX_ADDR_LEN]; + + if (mc->mca_flags&MAF_LOADED) { + mc->mca_flags &= ~MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, mc->dev, 0) == 0) + dev_mc_delete(mc->dev, buf, mc->dev->addr_len, 0); + } + + if (mc->dev->flags&IFF_UP) + igmp6_leave_group(mc); + return 0; +} + + /* * device multicast 
group inc (add if not found) */ @@ -165,30 +196,17 @@ int ipv6_dev_mc_inc(struct device *dev, struct in6_addr *addr) { struct ifmcaddr6 *mc; struct inet6_dev *idev; - char buf[6]; int hash; - MDBG(("ipv6_dev_mc_inc(%s) addr[", dev ? dev->name : "[NULL]")); - MDBG(("%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", - addr->s6_addr16[0], addr->s6_addr16[1], addr->s6_addr16[2], - addr->s6_addr16[3], addr->s6_addr16[4], addr->s6_addr16[5], - addr->s6_addr16[6], addr->s6_addr16[7])); - hash = ipv6_devindex_hash(dev->ifindex); - - for (idev = inet6_dev_lst[hash]; idev; idev=idev->next) - if (idev->dev == dev) - break; + idev = ipv6_get_idev(dev); - if (idev == NULL) { - printk(KERN_DEBUG "ipv6_dev_mc_inc: device not found\n"); + if (idev == NULL) return -EINVAL; - } hash = ipv6_addr_hash(addr); for (mc = inet6_mcast_lst[hash]; mc; mc = mc->next) { - if ((ipv6_addr_cmp(&mc->mca_addr, addr) == 0) && - (mc->dev->ifindex == dev->ifindex)) { + if (ipv6_addr_cmp(&mc->mca_addr, addr) == 0 && mc->dev == dev) { atomic_inc(&mc->mca_users); return 0; } @@ -203,7 +221,6 @@ int ipv6_dev_mc_inc(struct device *dev, struct in6_addr *addr) if (mc == NULL) return -ENOMEM; - MDBG(("create new ipv6 MC entry, ")); memset(mc, 0, sizeof(struct ifmcaddr6)); mc->mca_timer.function = igmp6_timer_handler; mc->mca_timer.data = (unsigned long) mc; @@ -218,23 +235,7 @@ int ipv6_dev_mc_inc(struct device *dev, struct in6_addr *addr) mc->if_next = idev->mc_list; idev->mc_list = mc; - /* - * multicast mapping is defined in IPv6-over-foo documents - */ - - switch (dev->type) { - case ARPHRD_ETHER: - ipv6_mc_map(addr, buf); - MDBG(("ARPHRD_ETHER[%02x:%02x:%02x:%02x:%02x:%02x] dev_mc_add()\n", - buf[0], buf[1], buf[2], buf[3], buf[4], buf[5])); - dev_mc_add(dev, buf, ETH_ALEN, 0); - break; - - default: - printk(KERN_DEBUG "dev_mc_inc: unkown device type\n"); - }; - - igmp6_join_group(mc); + igmp6_group_added(mc); return 0; } @@ -247,15 +248,12 @@ static void ipv6_mca_remove(struct device *dev, struct ifmcaddr6 
*ma) if (idev) { struct ifmcaddr6 *iter, **lnk; - - lnk = &idev->mc_list; - for (iter = *lnk; iter; iter = iter->if_next) { + for (lnk = &idev->mc_list; (iter = *lnk) != NULL; lnk = &iter->if_next) { if (iter == ma) { *lnk = iter->if_next; - break; + return; } - lnk = &iter->if_next; } } } @@ -270,19 +268,16 @@ int ipv6_dev_mc_dec(struct device *dev, struct in6_addr *addr) hash = ipv6_addr_hash(addr); - lnk = &inet6_mcast_lst[hash]; - - for (ma = inet6_mcast_lst[hash]; ma; ma = ma->next) { - if (ipv6_addr_cmp(&ma->mca_addr, addr) == 0) { + for (lnk = &inet6_mcast_lst[hash]; (ma=*lnk) != NULL; lnk = &ma->next) { + if (ipv6_addr_cmp(&ma->mca_addr, addr) == 0 && ma->dev == dev) { if (atomic_dec_and_test(&ma->mca_users)) { - igmp6_leave_group(ma); + igmp6_group_dropped(ma); *lnk = ma->next; - ipv6_mca_remove(ma->dev, ma); + ipv6_mca_remove(dev, ma); kfree(ma); } return 0; } - lnk = &ma->next; } return -ENOENT; @@ -299,7 +294,7 @@ int ipv6_chk_mcast_addr(struct device *dev, struct in6_addr *addr) hash = ipv6_addr_hash(addr); for (mc = inet6_mcast_lst[hash]; mc; mc=mc->next) { - if ((mc->dev == dev) && ipv6_addr_cmp(&mc->mca_addr, addr) == 0) + if (mc->dev == dev && ipv6_addr_cmp(&mc->mca_addr, addr) == 0) return 1; } @@ -312,11 +307,15 @@ int ipv6_chk_mcast_addr(struct device *dev, struct in6_addr *addr) static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { - unsigned long delay; + unsigned long delay = resptime; - ma->mca_flags |= MAF_TIMER_RUNNING; + if (del_timer(&ma->mca_timer)) + delay = ma->mca_timer.expires - jiffies; + + if (delay >= resptime) + delay = net_random() % resptime; - delay = ipv6_random() % resptime; + ma->mca_flags |= MAF_TIMER_RUNNING; ma->mca_timer.expires = jiffies + delay; add_timer(&ma->mca_timer); } @@ -408,22 +407,16 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) plen = sizeof(struct ipv6hdr) + len; - skb = sock_alloc_send_skb(sk, dev->hard_header_len + plen, 0, 0, &err); + skb = 
sock_alloc_send_skb(sk, dev->hard_header_len + plen + 15, 0, 0, &err); if (skb == NULL) return; - if (dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - unsigned char ha[MAX_ADDR_LEN]; - if (dev->type == ARPHRD_ETHER) - ipv6_mc_map(addr, ha); - else - memcpy(ha, dev->broadcast, dev->addr_len); - dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, plen); - skb->arp = 1; - } + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); + if (dev->hard_header) { + unsigned char ha[MAX_ADDR_LEN]; + ndisc_mc_map(addr, ha, dev, 1); + dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, plen); } ifp = ipv6_get_lladdr(dev); @@ -468,11 +461,16 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT); - delay = ipv6_random() % IGMP6_UNSOLICITED_IVAL; + delay = net_random() % IGMP6_UNSOLICITED_IVAL; + start_bh_atomic(); + if (del_timer(&ma->mca_timer)) + delay = ma->mca_timer.expires - jiffies; + ma->mca_timer.expires = jiffies + delay; add_timer(&ma->mca_timer); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; + end_bh_atomic(); } static void igmp6_leave_group(struct ifmcaddr6 *ma) @@ -500,8 +498,111 @@ void igmp6_timer_handler(unsigned long data) ma->mca_flags &= ~MAF_TIMER_RUNNING; } +/* Device going down */ + +void ipv6_mc_down(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + struct in6_addr maddr; + + /* Withdraw multicast list */ + + for (i = idev->mc_list; i; i=i->if_next) + igmp6_group_dropped(i); + + /* Delete all-nodes address. */ + + ipv6_addr_all_nodes(&maddr); + ipv6_dev_mc_dec(idev->dev, &maddr); +} + +/* Device going up */ + +void ipv6_mc_up(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + struct in6_addr maddr; + + /* Add all-nodes address. 
*/ + + ipv6_addr_all_nodes(&maddr); + ipv6_dev_mc_inc(idev->dev, &maddr); + + /* Install multicast list, except for all-nodes (already installed) */ + + for (i = idev->mc_list; i; i=i->if_next) + igmp6_group_added(i); +} + +/* + * Device is about to be destroyed: clean up. + */ + +void ipv6_mc_destroy_dev(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + while ((i = idev->mc_list) != NULL) { + idev->mc_list = i->if_next; + igmp6_group_dropped(i); + kfree(i); + } +} + +#ifdef CONFIG_PROC_FS +static int igmp6_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0, begin=0; + struct ifmcaddr6 *im; + int len=0; + struct device *dev; + + for (dev = dev_base; dev; dev = dev->next) { + struct inet6_dev *idev; + + if ((idev = ipv6_get_idev(dev)) == NULL) + continue; + + for (im = idev->mc_list; im; im = im->if_next) { + int i; + + len += sprintf(buffer+len,"%-4d %-15s ", dev->ifindex, dev->name); + + for (i=0; i<16; i++) + len += sprintf(buffer+len, "%02x", im->mca_addr.s6_addr[i]); + + len+=sprintf(buffer+len, + " %5d %08X %ld\n", + atomic_read(&im->mca_users), + im->mca_flags, + (im->mca_flags&MAF_TIMER_RUNNING) ? 
im->mca_timer.expires-jiffies : 0); + + pos=begin+len; + if (pos < offset) { + len=0; + begin=pos; + } + if (pos > offset+length) + goto done; + } + } + *eof = 1; + +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} +#endif + __initfunc(void igmp6_init(struct net_proto_family *ops)) { +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; +#endif struct sock *sk; int err; @@ -525,4 +626,9 @@ __initfunc(void igmp6_init(struct net_proto_family *ops)) sk->num = 256; /* Don't receive any data */ sk->net_pinfo.af_inet6.hop_limit = 1; +#ifdef CONFIG_PROC_FS + ent = create_proc_entry("net/igmp6", 0, 0); + ent->read_proc = igmp6_read_proc; +#endif } + diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 04d92b6b9..3fb0680bc 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -19,15 +19,25 @@ * of an RA. * * Janos Farkas : kmalloc failure checks + * Alexey Kuznetsov : state machine reworked + * and moved to net/core. */ /* Set to 3 to get tracing... */ #define ND_DEBUG 1 -#if ND_DEBUG >= 3 -#define NDBG(x) printk x -#else -#define NDBG(x) +#define ND_PRINTK(x...) printk(KERN_DEBUG x) +#define ND_NOPRINTK(x...) 
do { ; } while(0) +#define ND_PRINTK0 ND_PRINTK +#define ND_PRINTK1 ND_NOPRINTK +#define ND_PRINTK2 ND_NOPRINTK +#if ND_DEBUG >= 1 +#undef ND_PRINTK1 +#define ND_PRINTK1 ND_PRINTK +#endif +#if ND_DEBUG >= 2 +#undef ND_PRINTK2 +#define ND_PRINTK2 ND_PRINTK #endif #define __NO_VERSION__ @@ -42,6 +52,9 @@ #include <linux/in6.h> #include <linux/route.h> #include <linux/init.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif #include <linux/if_arp.h> #include <linux/ipv6.h> @@ -61,380 +74,187 @@ #include <net/checksum.h> #include <linux/proc_fs.h> -#define NCACHE_NUM_BUCKETS 32 - static struct inode ndisc_inode; static struct socket *ndisc_socket=&ndisc_inode.u.socket_i; -unsigned long nd_rand_seed = 152L; - -struct ndisc_statistics nd_stats; - -static struct neigh_table nd_tbl; - -unsigned int ndisc_hash(void *primary_key); -int ndisc_eth_resolv(unsigned char *h_dest, struct sk_buff *skb); - -static struct neigh_ops nd_neigh_ops = { - ETH_P_IPV6, - ndisc_hash, - ndisc_eth_resolv, - NULL -}; - -static struct timer_list ndisc_timer; -static struct timer_list ndisc_gc_timer; - -/* - * Protocol variables - */ - -unsigned long nd_reachable_time = RECHABLE_TIME; -int nd_gc_interval = 5 * HZ; - -/* - * garbage collection timeout must be greater than reachable time - * since tstamp is updated by reachable confirmations only. - * gc_staletime actually means the time after last confirmation - * *NOT* after the last time the entry was used. 
- */ - -int nd_gc_staletime = 3 * RECHABLE_TIME; +static int ndisc_constructor(struct neighbour *neigh); +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); +static int pndisc_constructor(struct pneigh_entry *n); +static void pndisc_destructor(struct pneigh_entry *n); +static void pndisc_redo(struct sk_buff *skb); - -static int ndisc_event_timer(struct nd_neigh *ndn); - -unsigned long ipv6_random(void) +static struct neigh_ops ndisc_generic_ops = { - nd_rand_seed=nd_rand_seed*69069L+1; - return nd_rand_seed^jiffies; -} + AF_INET6, + NULL, + ndisc_solicit, + ndisc_error_report, + neigh_resolve_output, + neigh_connected_output, + dev_queue_xmit, + dev_queue_xmit +}; -static __inline__ unsigned long rand_reach_time(void) +static struct neigh_ops ndisc_hh_ops = { - unsigned long val; - - val = ipv6_random() % (MAX_RANDOM_FACTOR * - ipv6_config.nd_base_reachable_time); + AF_INET6, + NULL, + ndisc_solicit, + ndisc_error_report, + neigh_resolve_output, + neigh_resolve_output, + dev_queue_xmit, + dev_queue_xmit +}; - if (val < (MIN_RANDOM_FACTOR * ipv6_config.nd_base_reachable_time)) - val+= (MIN_RANDOM_FACTOR * ipv6_config.nd_base_reachable_time); - return val; -} - -unsigned int ndisc_hash(void *primary_key) +static struct neigh_ops ndisc_direct_ops = { - struct in6_addr *addr = (struct in6_addr *) primary_key; - __u32 hash_val; - - addr = (struct in6_addr *) primary_key; - - hash_val = addr->s6_addr32[2] ^ addr->s6_addr32[3]; - - hash_val ^= hash_val >> 16; - - return (hash_val & (NCACHE_NUM_BUCKETS - 1)); -} - -static int ndisc_gc_func(struct neighbour *neigh, void *arg); + AF_INET6, + NULL, + NULL, + NULL, + dev_queue_xmit, + dev_queue_xmit, + dev_queue_xmit, + dev_queue_xmit +}; -static void ndisc_periodic_timer(unsigned long arg) +struct neigh_table nd_tbl = { - static unsigned long last_rand = 0; - unsigned long now = jiffies; - - /* - * periodicly compute 
ReachableTime from random function - */ - - if ((now - last_rand) > REACH_RANDOM_INTERVAL) { - last_rand = now; - nd_reachable_time = rand_reach_time(); - } + NULL, + AF_INET6, + sizeof(struct neighbour) + sizeof(struct in6_addr), + sizeof(struct in6_addr), + ndisc_constructor, + pndisc_constructor, + pndisc_destructor, + pndisc_redo, + { NULL, NULL, &nd_tbl, 0, NULL, NULL, + 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 0, 64 }, + 30*HZ, 128, 512, 1024, +}; - neigh_table_lock(&nd_tbl); +#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) - start_bh_atomic(); - if (atomic_read(&nd_tbl.tbl_lock) == 1) { - ntbl_walk_table(&nd_tbl, ndisc_gc_func, 0, 0, NULL); - ndisc_gc_timer.expires = now + nd_gc_interval; - } else { -#if ND_DEBUG >= 2 - printk(KERN_DEBUG "ndisc_gc delayed: table locked\n"); -#endif - ndisc_gc_timer.expires = now + HZ; - } - end_bh_atomic(); - - neigh_table_unlock(&nd_tbl); - - add_timer(&ndisc_gc_timer); -} - -static int ndisc_gc_func(struct neighbour *neigh, void *arg) +static u8 *ndisc_fill_option(u8 *opt, int type, void *data, int data_len) { - struct nd_neigh *ndn = (struct nd_neigh *) neigh; - unsigned long now = jiffies; - - if (atomic_read(&ndn->ndn_refcnt) == 0) { - switch (ndn->ndn_nud_state) { - - case NUD_REACHABLE: - case NUD_STALE: - if (now - ndn->ndn_tstamp < nd_gc_staletime) - break; - case NUD_FAILED: - return 1; - default: - }; - } - return 0; + int space = NDISC_OPT_SPACE(data_len); + + opt[0] = type; + opt[1] = space>>3; + memcpy(opt+2, data, data_len); + data_len += 2; + if ((space -= data_len) > 0) + memset(opt + data_len, 0, space); + return opt + space; } -static __inline__ void ndisc_add_timer(struct nd_neigh *ndn, int timer) +int ndisc_mc_map(struct in6_addr *addr, char *buf, struct device *dev, int dir) { - unsigned long now = jiffies; - unsigned long tval = ~0UL; - - ndn->ndn_expires = now + timer; - - if (del_timer(&ndisc_timer)) - tval = ndisc_timer.expires; - - tval = min(tval, ndn->ndn_expires); - - 
ndisc_timer.expires = tval; - add_timer(&ndisc_timer); -} - -static void ndisc_del_timer(struct nd_neigh *ndn) -{ - unsigned long tval = ~0UL; - unsigned long neigh_val; - - if (del_timer(&ndisc_timer)) - tval = ndisc_timer.expires; - - neigh_val = ndn->ndn_expires; - ndn->ndn_expires = 0; - - if (tval == neigh_val) { - int i; - - tval = ~0UL; - - neigh_table_lock(&nd_tbl); - - /* need to search the entire neighbour cache */ - for (i=0; i < nd_tbl.tbl_size; i++) { - struct neighbour *neigh, *head; - head = nd_tbl.hash_buckets[i]; - - if ((neigh = head) == NULL) - continue; - - do { - struct nd_neigh *n; - - n = (struct nd_neigh *) neigh; - - if ((n->ndn_nud_state & NUD_IN_TIMER) && - n->ndn_expires) - tval = min(tval, n->ndn_expires); - - neigh = neigh->next; - - } while (neigh != head); + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */ + case ARPHRD_FDDI: + ipv6_eth_mc_map(addr, buf); + return 0; + default: + if (dir) { + memcpy(buf, dev->broadcast, dev->addr_len); + return 0; } - neigh_table_unlock(&nd_tbl); } - - if (tval == ~(0UL)) - return; - - ndisc_timer.expires = tval; - add_timer(&ndisc_timer); + return -EINVAL; } -static int ndisc_forced_gc(struct neighbour *neigh, void *arg) +static int ndisc_constructor(struct neighbour *neigh) { - struct nd_neigh *ndn = (struct nd_neigh *) neigh; + struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key; + struct device *dev = neigh->dev; + struct inet6_dev *in6_dev = ipv6_get_idev(dev); + int addr_type; - if (atomic_read(&ndn->ndn_refcnt) == 0) { - if (ndn->ndn_nud_state & NUD_IN_TIMER) - ndisc_del_timer(ndn); - - return 1; - } - return 0; -} + if (in6_dev == NULL) + return -EINVAL; -static struct nd_neigh * ndisc_new_neigh(struct device *dev, - struct in6_addr *addr) -{ - struct nd_neigh *ndn; + addr_type = ipv6_addr_type(addr); + if (in6_dev->nd_parms) + neigh->parms = in6_dev->nd_parms; - NDBG(("ndisc_new_neigh(")); - if(dev) - NDBG(("%s,", dev->name)); + 
if (addr_type&IPV6_ADDR_MULTICAST) + neigh->type = RTN_MULTICAST; else - NDBG(("[NULL],")); - NDBG(("[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]): ", - addr->s6_addr16[0], addr->s6_addr16[1], addr->s6_addr16[2], - addr->s6_addr16[3], addr->s6_addr16[4], addr->s6_addr16[5], - addr->s6_addr16[6], addr->s6_addr16[7])); - - ndn = (struct nd_neigh *) neigh_alloc(sizeof(struct nd_neigh), - &nd_neigh_ops); - if (ndn == NULL) { - -#if ND_DEBUG >= 2 - printk(KERN_DEBUG "neigh_alloc: out of memory\n"); -#endif - - start_bh_atomic(); - if (atomic_read(&nd_tbl.tbl_lock) == 1) { -#if ND_DEBUG >= 2 - printk(KERN_DEBUG "ndisc_alloc: forcing gc\n"); -#endif - ntbl_walk_table(&nd_tbl, ndisc_forced_gc, 0, 0, NULL); + neigh->type = RTN_UNICAST; + if (dev->hard_header == NULL) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &ndisc_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + if (addr_type&IPV6_ADDR_MULTICAST) { + neigh->nud_state = NUD_NOARP; + ndisc_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + if (dev->flags&IFF_LOOPBACK) + neigh->type = RTN_LOCAL; + } else if (dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); } - - end_bh_atomic(); -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "ndisc_alloc failed\n"); -#endif - return NULL; - } - - nd_stats.allocs++; - - ipv6_addr_copy(&ndn->ndn_addr, addr); - ndn->ndn_plen = 128; - ndn->ndn_type = ipv6_addr_type(addr); - ndn->ndn_dev = dev; - ndn->ndn_tstamp = jiffies; - - if ((ndn->ndn_type & IPV6_ADDR_MULTICAST)) { - NDBG(("MULTICAST(NCF_NOARP) ")); - ndn->ndn_flags |= NCF_NOARP; - } - - if (dev->type == ARPHRD_LOOPBACK || dev->type == ARPHRD_SIT) { - NDBG(("%s(NCF_NOARP) ", - (dev->type==ARPHRD_LOOPBACK) ? 
"LOOPBACK" : "SIT")); - ndn->ndn_flags |= NCF_NOARP; + if (dev->hard_header_cache) + neigh->ops = &ndisc_hh_ops; + else + neigh->ops = &ndisc_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; } - neigh_insert(&nd_tbl, (struct neighbour *) ndn); - NDBG(("returning ndn(%p)\n", ndn)); - return ndn; + return 0; } -/* - * Called when creating a new dest_cache entry for a given destination - * is likely that an entry for the refered gateway exists in cache - * - */ - -struct neighbour * ndisc_get_neigh(struct device *dev, struct in6_addr *addr) +static int pndisc_constructor(struct pneigh_entry *n) { - struct nd_neigh *neigh; - - /* - * neighbour cache: - * cached information about nexthop and addr resolution - */ - - if (dev == NULL) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "ndisc_get_neigh: NULL device\n"); + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct device *dev = n->dev; + + if (dev == NULL || ipv6_get_idev(dev) == NULL) + return -EINVAL; +#ifndef CONFIG_IPV6_NO_PB + addrconf_addr_solict_mult_old(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); #endif - return NULL; - } - - neigh_table_lock(&nd_tbl); - - neigh = (struct nd_neigh *) neigh_lookup(&nd_tbl, (void *) addr, - sizeof(struct in6_addr), dev); - if (neigh == NULL) { - neigh = ndisc_new_neigh(dev, addr); - - if (neigh == NULL) - return NULL; - } - - neigh_table_unlock(&nd_tbl); - - return neighbour_clone((struct neighbour *) neigh); +#ifdef CONFIG_IPV6_EUI64 + addrconf_addr_solict_mult_new(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); +#endif + return 0; } -/* - * return values - * 0 - Address Resolution succeded, send packet - * 1 - Address Resolution unfinished / packet queued - */ - -int ndisc_eth_resolv(unsigned char *h_dest, struct sk_buff *skb) +static void pndisc_destructor(struct pneigh_entry *n) { - struct nd_neigh *ndn = NULL; - - if (skb->dst) - ndn = (struct nd_neigh *) 
skb->dst->neighbour; + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct device *dev = n->dev; - if (ndn == NULL) { -#if ND_DEBUG >= 2 - printk(KERN_DEBUG "ndisc_eth_resolv: nexthop is NULL\n"); + if (dev == NULL || ipv6_get_idev(dev) == NULL) + return; +#ifndef CONFIG_IPV6_NO_PB + addrconf_addr_solict_mult_old(addr, &maddr); + ipv6_dev_mc_dec(dev, &maddr); #endif - goto discard; - } - - if ((ndn->ndn_type & IPV6_ADDR_MULTICAST)) { - struct in6_addr *daddr; - - daddr = &skb->nh.ipv6h->daddr; - if (skb->dev->type == ARPHRD_ETHER) - ipv6_mc_map(daddr, h_dest); - else - memcpy(h_dest, skb->dev->broadcast, skb->dev->addr_len); - return 0; - } - - switch (ndn->ndn_nud_state) { - case NUD_FAILED: - case NUD_NONE: - ndisc_event_send((struct neighbour *)ndn, skb); - - case NUD_INCOMPLETE: - if (skb_queue_len(&ndn->neigh.arp_queue) >= NDISC_QUEUE_LEN) { - struct sk_buff *buff; - - buff = ndn->neigh.arp_queue.prev; - skb_unlink(buff); - dev_kfree_skb(buff, FREE_WRITE); - } - skb_queue_head(&ndn->neigh.arp_queue, skb); - return 1; - default: - ndisc_event_send((struct neighbour *)ndn, skb); - }; - - if ((ndn->ndn_flags & NTF_COMPLETE) == 0) { -#if ND_DEBUG >=1 - /* This shouldn't happen */ - printk(KERN_DEBUG "ND: using incomplete entry\n"); +#ifdef CONFIG_IPV6_EUI64 + addrconf_addr_solict_mult_new(addr, &maddr); + ipv6_dev_mc_dec(dev, &maddr); #endif - } - memcpy(h_dest, ndn->ndn_ha, skb->dev->addr_len); - return 0; - - discard: - - dev_kfree_skb(skb, FREE_WRITE); - return 1; } + + static int ndisc_build_ll_hdr(struct sk_buff *skb, struct device *dev, struct in6_addr *daddr, struct neighbour *neigh, int len) @@ -442,44 +262,30 @@ ndisc_build_ll_hdr(struct sk_buff *skb, struct device *dev, unsigned char ha[MAX_ADDR_LEN]; unsigned char *h_dest = NULL; - skb->arp = 1; - if (dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - if 
(ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST) { - nd_stats.snt_probes_mcast++; - if (dev->type == ARPHRD_ETHER) - ipv6_mc_map(daddr, ha); - else - memcpy(ha, dev->broadcast, dev->addr_len); - h_dest = ha; - } else if (neigh) { - h_dest = neigh->ha; - nd_stats.snt_probes_ucast++; - } else { - struct nd_neigh *ndn; - - neigh_table_lock(&nd_tbl); - - neigh = neigh_lookup(&nd_tbl, (void *) daddr, - sizeof(struct in6_addr), dev); - if (neigh) { - ndn = (struct nd_neigh*)neigh; - if (ndn->ndn_flags&NTF_COMPLETE) { - memcpy(ha, ndn->ndn_ha, dev->addr_len); - h_dest = ha; - } + if (dev->hard_header) { + if (ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST) { + ndisc_mc_map(daddr, ha, dev, 1); + h_dest = ha; + } else if (neigh) { + h_dest = neigh->ha; + } else { + neigh = neigh_lookup(&nd_tbl, daddr, dev); + if (neigh) { + if (neigh->nud_state&NUD_VALID) { + memcpy(ha, neigh->ha, dev->addr_len); + h_dest = ha; } - neigh_table_unlock(&nd_tbl); + neigh_release(neigh); } - - if (dev->hard_header(skb, dev, ETH_P_IPV6, h_dest, NULL, len) < 0) - skb->arp = 0; } + + if (dev->hard_header(skb, dev, ETH_P_IPV6, h_dest, NULL, len) < 0) + return 0; } - return skb->arp; + return 1; } @@ -487,57 +293,35 @@ ndisc_build_ll_hdr(struct sk_buff *skb, struct device *dev, * Send a Neighbour Advertisement */ -void ndisc_send_na(struct device *dev, struct nd_neigh *ndn, +void ndisc_send_na(struct device *dev, struct neighbour *neigh, struct in6_addr *daddr, struct in6_addr *solicited_addr, int router, int solicited, int override, int inc_opt) { struct sock *sk = ndisc_socket->sk; struct nd_msg *msg; - int len, opt_len; + int len; struct sk_buff *skb; int err; - NDBG(("ndisc_send_na(")); - if(dev) - NDBG(("%s,", dev->name)); - else - NDBG(("[NULL]")); - NDBG(("%p): ", ndn)); - if(daddr) - NDBG(("daddr[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x] ", - daddr->s6_addr16[0], daddr->s6_addr16[1], daddr->s6_addr16[2], - daddr->s6_addr16[3], daddr->s6_addr16[4], daddr->s6_addr16[5], - daddr->s6_addr16[6], 
daddr->s6_addr16[7])); - if(solicited_addr) - NDBG(("solicit_addr[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x] ", - solicited_addr->s6_addr16[0], solicited_addr->s6_addr16[1], - solicited_addr->s6_addr16[2], solicited_addr->s6_addr16[3], - solicited_addr->s6_addr16[4], solicited_addr->s6_addr16[5], - solicited_addr->s6_addr16[6], solicited_addr->s6_addr16[7])); - NDBG(("rtr(%d)sol(%d)ovr(%d)iopt(%d)\n", router, solicited, override, inc_opt)); - - opt_len = ((dev->addr_len + 1) >> 3) + 1; len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); -#if ND_DEBUG >=1 - if (dev == NULL) { - printk(KERN_DEBUG "send_na: null device\n"); - return; + if (inc_opt) { + if (dev->addr_len) + len += NDISC_OPT_SPACE(dev->addr_len); + else + inc_opt = 0; } -#endif - if (inc_opt) - len += opt_len << 3; skb = sock_alloc_send_skb(sk, MAX_HEADER + len + dev->hard_header_len + 15, 0, 0, &err); if (skb == NULL) { - printk(KERN_DEBUG "send_na: alloc skb failed\n"); + ND_PRINTK1("send_na: alloc skb failed\n"); return; } - if (ndisc_build_ll_hdr(skb, dev, daddr, (struct neighbour*)ndn, len) == 0) { - kfree_skb(skb, FREE_WRITE); + if (ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) { + kfree_skb(skb); return; } @@ -557,17 +341,8 @@ void ndisc_send_na(struct device *dev, struct nd_neigh *ndn, /* Set the target address. */ ipv6_addr_copy(&msg->target, solicited_addr); - if (inc_opt) { - /* Set the source link-layer address option. 
*/ - msg->opt.opt_type = ND_OPT_TARGET_LL_ADDR; - msg->opt.opt_len = opt_len; - memcpy(msg->opt.link_addr, dev->dev_addr, dev->addr_len); - - if ((opt_len << 3) - (2 + dev->addr_len)) { - memset(msg->opt.link_addr + dev->addr_len, 0, - (opt_len << 3) - (2 + dev->addr_len)); - } - } + if (inc_opt) + ndisc_fill_option((void*)&msg->opt, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, dev->addr_len); /* checksum */ msg->icmph.icmp6_cksum = csum_ipv6_magic(solicited_addr, daddr, len, @@ -585,48 +360,20 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, struct sock *sk = ndisc_socket->sk; struct sk_buff *skb; struct nd_msg *msg; - int len, opt_len; + int len; int err; - NDBG(("ndisc_send_ns(%s,%p): ", (dev ? dev->name : "[NULL]"), neigh)); - if(daddr) - NDBG(("daddr[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x] ", - daddr->s6_addr16[0], daddr->s6_addr16[1], daddr->s6_addr16[2], - daddr->s6_addr16[3], daddr->s6_addr16[4], daddr->s6_addr16[5], - daddr->s6_addr16[6], daddr->s6_addr16[7])); - if(saddr) - NDBG(("saddr[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x] ", - saddr->s6_addr16[0], saddr->s6_addr16[1], saddr->s6_addr16[2], - saddr->s6_addr16[3], saddr->s6_addr16[4], saddr->s6_addr16[5], - saddr->s6_addr16[6], saddr->s6_addr16[7])); - if(solicit) - NDBG(("solicit[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x] ", - solicit->s6_addr16[0], solicit->s6_addr16[1], - solicit->s6_addr16[2], solicit->s6_addr16[3], - solicit->s6_addr16[4], solicit->s6_addr16[5], - solicit->s6_addr16[6], solicit->s6_addr16[7])); - NDBG(("\n")); - - /* length of addr in 8 octet groups.*/ - opt_len = ((dev->addr_len + 1) >> 3) + 1; - len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr) + - (opt_len << 3); + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + if (dev->addr_len) + len += NDISC_OPT_SPACE(dev->addr_len); skb = sock_alloc_send_skb(sk, MAX_HEADER + len + dev->hard_header_len + 15, 0, 0, &err); if (skb == NULL) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "send_ns: alloc skb failed\n"); 
-#endif + ND_PRINTK1("send_ns: alloc skb failed\n"); return; } -#if 0 - /* Why Pedro did it? Is it remnant of early - attempts to avoid looping back? I have no idea. --ANK */ - skb->pkt_type = PACKET_NDISC; -#endif - if (saddr == NULL) { struct inet6_ifaddr *ifa; @@ -638,12 +385,12 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, } if (ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); - + msg = (struct nd_msg *)skb_put(skb, len); msg->icmph.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION; msg->icmph.icmp6_code = 0; @@ -653,16 +400,8 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, /* Set the target address. */ ipv6_addr_copy(&msg->target, solicit); - /* Set the source link-layer address option. */ - msg->opt.opt_type = ND_OPT_SOURCE_LL_ADDR; - msg->opt.opt_len = opt_len; - - memcpy(msg->opt.link_addr, dev->dev_addr, dev->addr_len); - - if ((opt_len << 3) - (2 + dev->addr_len)) { - memset(msg->opt.link_addr + dev->addr_len, 0, - (opt_len << 3) - (2 + dev->addr_len)); - } + if (dev->addr_len) + ndisc_fill_option((void*)&msg->opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, dev->addr_len); /* checksum */ msg->icmph.icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, @@ -681,40 +420,27 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, struct sk_buff *skb; struct icmp6hdr *hdr; __u8 * opt; - int len, opt_len; + int len; int err; - NDBG(("ndisc_send_rs(%s): ", (dev ? 
dev->name : "[NULL]"))); - if(daddr) - NDBG(("daddr[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x] ", - daddr->s6_addr16[0], daddr->s6_addr16[1], daddr->s6_addr16[2], - daddr->s6_addr16[3], daddr->s6_addr16[4], daddr->s6_addr16[5], - daddr->s6_addr16[6], daddr->s6_addr16[7])); - if(saddr) - NDBG(("saddr[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x] ", - saddr->s6_addr16[0], saddr->s6_addr16[1], saddr->s6_addr16[2], - saddr->s6_addr16[3], saddr->s6_addr16[4], saddr->s6_addr16[5], - saddr->s6_addr16[6], saddr->s6_addr16[7])); - NDBG(("\n")); - - /* length of addr in 8 octet groups.*/ - opt_len = ((dev->addr_len + 1) >> 3) + 1; - len = sizeof(struct icmp6hdr) + (opt_len << 3); + len = sizeof(struct icmp6hdr); + if (dev->addr_len) + len += NDISC_OPT_SPACE(dev->addr_len); skb = sock_alloc_send_skb(sk, MAX_HEADER + len + dev->hard_header_len + 15, 0, 0, &err); if (skb == NULL) { - printk(KERN_DEBUG "send_ns: alloc skb failed\n"); + ND_PRINTK1("send_ns: alloc skb failed\n"); return; } if (ndisc_build_ll_hdr(skb, dev, daddr, NULL, len) == 0) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); - + hdr = (struct icmp6hdr *) skb_put(skb, len); hdr->icmp6_type = NDISC_ROUTER_SOLICITATION; hdr->icmp6_code = 0; @@ -723,16 +449,8 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, opt = (u8*) (hdr + 1); - /* Set the source link-layer address option. 
*/ - opt[0] = ND_OPT_SOURCE_LL_ADDR; - opt[1] = opt_len; - - memcpy(opt + 2, dev->dev_addr, dev->addr_len); - - if ((opt_len << 3) - (2 + dev->addr_len)) { - memset(opt + 2 + dev->addr_len, 0, - (opt_len << 3) - (2 + dev->addr_len)); - } + if (dev->addr_len) + ndisc_fill_option(opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, dev->addr_len); /* checksum */ hdr->icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, daddr, len, @@ -744,330 +462,79 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, } -static int ndisc_store_hwaddr(struct nd_neigh *ndn, __u8 *opt, int opt_len, - int option) +static u8 * ndisc_find_option(u8 *opt, int opt_len, int len, int option) { - while (*opt != option && opt_len) { - int len; + while (opt_len <= len) { + int l = opt[1]<<3; - len = opt[1] << 3; - - if (len == 0) - { - printk(KERN_WARNING "nd: option has 0 len\n"); - return -EINVAL; + if (opt[0] == option && l >= opt_len) + return opt + 2; + + if (l == 0) { + if (net_ratelimit()) + printk(KERN_WARNING "ndisc: option has 0 len\n"); + return NULL; } - opt += len; - opt_len -= len; + opt += l; + len -= l; } - - if (*opt == option) { - memcpy(ndn->neigh.ha, opt + 2, ndn->ndn_dev->addr_len); - return 0; - } - - return -EINVAL; + return NULL; } -/* Called when a timer expires for a neighbour entry. 
*/ -static void ndisc_timer_handler(unsigned long arg) +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) { - unsigned long now = jiffies; - unsigned long ntimer = ~0UL; - int i; - - neigh_table_lock(&nd_tbl); - - for (i=0; i < nd_tbl.tbl_size; i++) { - struct nd_neigh *ndn, *head; - - head = (struct nd_neigh *) nd_tbl.hash_buckets[i]; - - if ((ndn = head) == NULL) - continue; - - do { - if (ndn->ndn_nud_state & NUD_IN_TIMER) { - unsigned long time; - - time = ndn->ndn_expires - now; - - if ((long) time <= 0) - time = ndisc_event_timer(ndn); - - if (time) - ntimer = min(ntimer, time); - } - ndn = (struct nd_neigh *) ndn->neigh.next; - } while (ndn != head); - } - - if (ntimer != (~0UL)) { - unsigned long tval = jiffies + ntimer; - if (del_timer(&ndisc_timer)) { - if (ndisc_timer.expires - tval < 0) - tval = ndisc_timer.expires; - } - ndisc_timer.expires = tval; - add_timer(&ndisc_timer); - } - - neigh_table_unlock(&nd_tbl); + /* + * "The sender MUST return an ICMP + * destination unreachable" + */ + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); + kfree_skb(skb); } - -static int ndisc_event_timer(struct nd_neigh *ndn) +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) { - struct in6_addr *daddr; - struct in6_addr *target; + struct in6_addr *saddr = NULL; struct in6_addr mcaddr; - struct device *dev; - int max_probes; - - if (ndn->ndn_nud_state == NUD_DELAY) - ndn->ndn_nud_state = NUD_PROBE; - - max_probes = (ndn->ndn_nud_state == NUD_PROBE ? 
- ipv6_config.nd_max_ucast_solicit: - ipv6_config.nd_max_mcast_solicit); - - if (ndn->ndn_probes == max_probes) { - struct sk_buff *skb; - - ndn->ndn_nud_state = NUD_FAILED; - ndn->ndn_flags &= ~NTF_COMPLETE; - nd_stats.res_failed++; - - while((skb=skb_dequeue(&ndn->neigh.arp_queue))) { - /* - * "The sender MUST return an ICMP - * destination unreachable" - */ - icmpv6_send(skb, ICMPV6_DEST_UNREACH, - ICMPV6_ADDR_UNREACH, 0, ndn->ndn_dev); - - dev_kfree_skb(skb, FREE_WRITE); - } - return 0; - } - - ndn->ndn_probes++; - - dev = ndn->ndn_dev; - target = &ndn->ndn_addr; - - if (ndn->ndn_nud_state == NUD_INCOMPLETE) { - addrconf_addr_solict_mult(&ndn->ndn_addr, &mcaddr); - daddr = &mcaddr; - ndn = NULL; + struct device *dev = neigh->dev; + struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; + int probes = neigh->probes; + + if (skb && ipv6_chk_addr(&skb->nh.ipv6h->saddr, dev, 0)) + saddr = &skb->nh.ipv6h->saddr; + + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state&NUD_VALID)) + ND_PRINTK1("trying to ucast probe in NUD_INVALID\n"); + ndisc_send_ns(dev, neigh, target, target, saddr); + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif } else { - daddr = &ndn->ndn_addr; - } - - ndisc_send_ns(dev, (struct neighbour *) ndn, target, daddr, NULL); - - return ipv6_config.nd_retrans_time; -} - -void ndisc_event_send(struct neighbour *neigh, struct sk_buff *skb) -{ - struct nd_neigh *ndn = (struct nd_neigh *) neigh; - struct in6_addr daddr; - unsigned long now = jiffies; - struct in6_addr *saddr = NULL; - - if ((ndn->ndn_flags & NCF_NOARP)) - return; - - switch (ndn->ndn_nud_state) { - case NUD_FAILED: - ndn->ndn_probes = 0; - case NUD_NONE: - if (skb && !skb->stamp.tv_sec) { - /* - * skb->stamp allows us to know if we are - * originating the skb or forwarding it. 
- * (it is set on netif_rx) - */ - saddr = &skb->nh.ipv6h->saddr; - } - - ndn->ndn_nud_state = NUD_INCOMPLETE; - addrconf_addr_solict_mult(&ndn->ndn_addr, &daddr); - ndisc_send_ns(ndn->ndn_dev, NULL, &ndn->ndn_addr, &daddr, - saddr); - ndisc_add_timer(ndn, ipv6_config.nd_retrans_time); - - break; - - case NUD_REACHABLE: - if ((now - ndn->ndn_tstamp) < nd_reachable_time) - break; - - case NUD_STALE: - ndn->ndn_nud_state = NUD_DELAY; - ndisc_add_timer(ndn, ipv6_config.nd_delay_probe_time); - } -} - -/* - * Received a neighbour announce - */ -void ndisc_event_na(struct nd_neigh *ndn, unsigned char *opt, int opt_len, - int solicited, int override) -{ - struct sk_buff *skb; - - NDBG(("ndisc_event_na(%p,%p,%d,%d,%d)\n", ndn, opt, opt_len, - solicited, override)); - - if (ndn->ndn_nud_state == NUD_NONE) - ndn->ndn_nud_state = NUD_INCOMPLETE; - - if (ndn->ndn_nud_state == NUD_INCOMPLETE || override) { - if (opt_len == 0) { - printk(KERN_DEBUG "no opt on NA\n"); - } else { - /* Record hardware address. 
*/ - ndn->ndn_flags |= NTF_COMPLETE; - - if (ndisc_store_hwaddr(ndn, opt, opt_len, - ND_OPT_TARGET_LL_ADDR)) { -#if ND_DEBUG >= 2 - printk(KERN_DEBUG - "event_na: invalid TARGET_LL_ADDR\n"); +#ifdef CONFIG_IPV6_EUI64 + addrconf_addr_solict_mult_new(target, &mcaddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); #endif - ndn->ndn_flags &= ~NTF_COMPLETE; - ndn->ndn_nud_state = NUD_NONE; - return; - } - } - } - - if (solicited || override || ndn->ndn_nud_state == NUD_INCOMPLETE) { - ndn->ndn_probes = 0; - ndn->ndn_tstamp = jiffies; - - if (ndn->ndn_nud_state & NUD_IN_TIMER) - ndisc_del_timer(ndn); - - if (solicited) - ndn->ndn_nud_state = NUD_REACHABLE; - else - ndn->ndn_nud_state = NUD_STALE; - } - - while ((skb=skb_dequeue(&ndn->neigh.arp_queue))) - dev_queue_xmit(skb); -} - -static struct nd_neigh * ndisc_event_ns(struct in6_addr *saddr, - struct sk_buff *skb) -{ - struct nd_neigh *ndn; - u8 *opt; - int len; - - NDBG(("ndisc_event_ns: ")); - if(saddr) - NDBG(("saddr[%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x] ", - saddr->s6_addr16[0], saddr->s6_addr16[1], saddr->s6_addr16[2], - saddr->s6_addr16[3], saddr->s6_addr16[4], saddr->s6_addr16[5], - saddr->s6_addr16[6], saddr->s6_addr16[7])); - NDBG(("\n")); - - opt = skb->h.raw; - opt += sizeof(struct icmp6hdr) + sizeof(struct in6_addr); - - len = skb->tail - opt; - - neigh_table_lock(&nd_tbl); - - ndn = (struct nd_neigh *) neigh_lookup(&nd_tbl, saddr, - sizeof(struct in6_addr), - skb->dev); - - if (ndn == NULL) - ndn = ndisc_new_neigh(skb->dev, saddr); - - neigh_table_unlock(&nd_tbl); - - if (ndn == NULL) - return NULL; - - switch(ndn->ndn_nud_state) { - case NUD_REACHABLE: - case NUD_STALE: - case NUD_DELAY: - if (*opt != ND_OPT_SOURCE_LL_ADDR || - len != ndn->ndn_dev->addr_len || - memcmp(ndn->neigh.ha, opt + 2, len)) - break; - - if (ndn->ndn_nud_state & NUD_IN_TIMER) - ndisc_del_timer(ndn); - - /* FALLTHROUGH */ - default: - ndn->ndn_flags |= NTF_COMPLETE; - - if (ndisc_store_hwaddr(ndn, opt, len, 
ND_OPT_SOURCE_LL_ADDR)) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG - "event_ns: invalid SOURCE_LL_ADDR\n"); +#ifndef CONFIG_IPV6_NO_PB + addrconf_addr_solict_mult_old(target, &mcaddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); #endif - - ndn->ndn_flags &= ~NTF_COMPLETE; - ndn->ndn_nud_state = NUD_NONE; - return ndn; - } - - ndn->ndn_nud_state = NUD_STALE; - ndn->ndn_tstamp = jiffies; - ndn->ndn_probes = 0; - }; - - return ndn; + } } -static void ndisc_ll_addr_update(struct nd_neigh *ndn, u8* opt, int len, - int type) +static void ndisc_update(struct neighbour *neigh, u8* opt, int len, int type) { - switch(ndn->ndn_nud_state) { - case NUD_REACHABLE: - case NUD_STALE: - case NUD_DELAY: - if (len == ndn->ndn_dev->addr_len && - memcmp(ndn->neigh.ha, opt + 2, len) == 0) - break; - - if (ndn->ndn_nud_state & NUD_IN_TIMER) - ndisc_del_timer(ndn); - default: - ndn->ndn_flags |= NTF_COMPLETE; - - if (ndisc_store_hwaddr(ndn, opt, len, type)) { -#if ND_DEBUG >=1 - printk(KERN_DEBUG "NDISC: invalid LL_ADDR\n"); -#endif - ndn->ndn_flags &= ~NTF_COMPLETE; - ndn->ndn_nud_state = NUD_NONE; - break; - } - - ndn->ndn_nud_state = NUD_STALE; - ndn->ndn_tstamp = jiffies; - ndn->ndn_probes = 0; - }; + opt = ndisc_find_option(opt, neigh->dev->addr_len+2, len, type); + neigh_update(neigh, opt, NUD_STALE, 1, 1); } static void ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *) skb->h.raw; - struct nd_neigh *ndn; + struct neighbour *neigh; struct inet6_dev *in6_dev; struct rt6_info *rt; int lifetime; @@ -1075,8 +542,6 @@ static void ndisc_router_discovery(struct sk_buff *skb) __u8 * opt = (__u8 *)(ra_msg + 1); - NDBG(("ndisc_router_discovery(%p)\n", skb)); - optlen = (skb->tail - skb->h.raw) - sizeof(struct ra_msg); if (skb->nh.ipv6h->hop_limit != 255) { @@ -1091,10 +556,12 @@ static void ndisc_router_discovery(struct sk_buff *skb) in6_dev = ipv6_get_idev(skb->dev); if (in6_dev == NULL) { - printk(KERN_DEBUG "RA: can't find in6 device\n"); + 
ND_PRINTK1("RA: can't find in6 device\n"); return; } - + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) + return; + if (in6_dev->if_flags & IF_RS_SENT) { /* * flag that an RA was received after an RS was sent @@ -1113,65 +580,61 @@ static void ndisc_router_discovery(struct sk_buff *skb) } if (rt == NULL && lifetime) { -#if ND_DEBUG >= 2 - printk(KERN_DEBUG "ndisc_rdisc: adding default router\n"); -#endif + ND_PRINTK2("ndisc_rdisc: adding default router\n"); rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); - if (rt == NULL) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "route_add failed\n"); +#if 1 + /* BUGGGGG! Previous routine can return invalid pointer. */ + rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); #endif + if (rt == NULL) { + ND_PRINTK1("route_add failed\n"); return; } - ndn = (struct nd_neigh *) rt->rt6i_nexthop; - if (ndn == NULL) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "nd: add default router: null " - "neighbour\n"); -#endif + neigh = rt->rt6i_nexthop; + if (neigh == NULL) { + ND_PRINTK1("nd: add default router: null neighbour\n"); return; } - ndn->ndn_flags |= NCF_ROUTER; + neigh->flags |= NTF_ROUTER; } if (rt) rt->rt6i_expires = jiffies + (HZ * lifetime); if (ra_msg->icmph.icmp6_hop_limit) - ipv6_config.hop_limit = ra_msg->icmph.icmp6_hop_limit; + in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; /* * Update Reachable Time and Retrans Timer */ - if (ra_msg->retrans_timer) - ipv6_config.nd_retrans_time = ntohl(ra_msg->retrans_timer); + if (in6_dev->nd_parms) { + if (ra_msg->retrans_timer) + in6_dev->nd_parms->retrans_time = (ntohl(ra_msg->retrans_timer)*HZ)/1000; - if (ra_msg->reachable_time) { - __u32 rtime = ntohl(ra_msg->reachable_time); + if (ra_msg->reachable_time) { + __u32 rtime = (ntohl(ra_msg->reachable_time)*HZ)/1000; - if (rtime != ipv6_config.nd_base_reachable_time) { - ipv6_config.nd_base_reachable_time = rtime; - nd_gc_staletime = 3 * rtime; - nd_reachable_time = rand_reach_time(); + if (rtime != 
in6_dev->nd_parms->base_reachable_time) { + in6_dev->nd_parms->base_reachable_time = rtime; + in6_dev->nd_parms->gc_staletime = 3 * rtime; + in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); + } } - } /* * Process options. */ - while(optlen > 0) { - int len; - - len = (opt[1] << 3); + while (optlen > 0) { + int len = (opt[1] << 3); if (len == 0) { - printk(KERN_DEBUG "RA: opt has 0 len\n"); + ND_PRINTK0("RA: opt has 0 len\n"); break; } @@ -1181,11 +644,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt == NULL) break; - ndn = (struct nd_neigh *) rt->rt6i_nexthop; - - if (ndn) - ndisc_ll_addr_update(ndn, opt, len, - ND_OPT_SOURCE_LL_ADDR); + if ((neigh = rt->rt6i_nexthop) != NULL && + skb->dev->addr_len + 2 >= len) + neigh_update(neigh, opt+2, NUD_STALE, 1, 1); break; case ND_OPT_PREFIX_INFO: @@ -1193,71 +654,54 @@ static void ndisc_router_discovery(struct sk_buff *skb) break; case ND_OPT_MTU: - if (rt) { + { int mtu; - struct device *dev; mtu = htonl(*(__u32 *)(opt+4)); - dev = rt->rt6i_dev; - if (dev == NULL) - break; - - if (mtu < 576) { - printk(KERN_DEBUG "NDISC: router " - "announcement with mtu = %d\n", - mtu); + if (mtu < 576 || mtu > skb->dev->mtu) { + ND_PRINTK0("NDISC: router " + "announcement with mtu = %d\n", + mtu); break; } - if (dev->change_mtu) - dev->change_mtu(dev, mtu); - else - dev->mtu = mtu; + if (in6_dev->cnf.mtu6 != mtu) { + in6_dev->cnf.mtu6 = mtu; + + if (rt) + rt->u.dst.pmtu = mtu; + + /* BUGGG... Scan routing tables and + adjust mtu on routes going + via this device + */ + } } break; case ND_OPT_TARGET_LL_ADDR: case ND_OPT_REDIRECT_HDR: - printk(KERN_DEBUG "got illegal option with RA"); + ND_PRINTK0("got illegal option with RA"); break; default: - printk(KERN_DEBUG "unkown option in RA\n"); + ND_PRINTK0("unkown option in RA\n"); }; optlen -= len; opt += len; } } -void ndisc_forwarding_on(void) -{ - - /* - * Forwarding was turned on. 
- */ - - rt6_purge_dflt_routers(0); -} - -void ndisc_forwarding_off(void) -{ - /* - * Forwarding was turned off. - */ -} - static void ndisc_redirect_rcv(struct sk_buff *skb) { + struct inet6_dev *in6_dev; struct icmp6hdr *icmph; struct in6_addr *dest; struct in6_addr *target; /* new first hop to destination */ - struct nd_neigh *ndn; + struct neighbour *neigh; struct rt6_info *rt; int on_link = 0; int optlen; - u8 * opt; - - NDBG(("ndisc_redirect_rcv(%p)\n", skb)); if (skb->nh.ipv6h->hop_limit != 255) { printk(KERN_WARNING "NDISC: fake ICMP redirect received\n"); @@ -1293,28 +737,24 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } - /* passed validation tests */ - rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, on_link); - - if (rt == NULL) + in6_dev = ipv6_get_idev(skb->dev); + if (!in6_dev || in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) return; - ndn = (struct nd_neigh *) rt->rt6i_nexthop; - - opt = (u8 *) (dest + 1); - - while (optlen > 0) { - int len; + /* passed validation tests - len = (opt[1] << 3); + NOTE We should not install redirect if sender did not supply + ll address on link, which requires it. It would break, if + we have non-transitive address resolution protocol. + Fix it later. 
--ANK + */ + rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, on_link); - if (*opt == ND_OPT_TARGET_LL_ADDR) - ndisc_ll_addr_update(ndn, opt, len, - ND_OPT_TARGET_LL_ADDR); + if (rt == NULL) + return; - opt += len; - optlen -= len; - } + neigh = rt->rt6i_nexthop; + ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR); } void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, @@ -1323,13 +763,11 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, struct sock *sk = ndisc_socket->sk; int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); struct sk_buff *buff; - struct nd_neigh *ndn = (struct nd_neigh *) neigh; struct inet6_ifaddr *ifp; struct icmp6hdr *icmph; struct in6_addr *addrp; struct device *dev; struct rt6_info *rt; - int ta_len = 0; u8 *opt; int rd_len; int err; @@ -1339,22 +777,25 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev, 0); if (rt == NULL || rt->u.dst.error) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "ndisc_send_redirect: hostunreach\n"); -#endif + ND_PRINTK1("ndisc_send_redirect: hostunreach\n"); return; } if (rt->rt6i_flags & RTF_GATEWAY) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "ndisc_send_redirect: not a neighbour\n"); -#endif + ND_PRINTK1("ndisc_send_redirect: not a neighbour\n"); return; } - if (ndn->ndn_nud_state == NUD_REACHABLE) { - ta_len = ((dev->addr_len + 1) >> 3) + 1; - len += (ta_len << 3); + if (dev->addr_len) { + if (neigh->nud_state&NUD_VALID) { + len += NDISC_OPT_SPACE(dev->addr_len); + } else { + /* If nexthop is not valid, do not redirect! + We will make it later, when will be sure, + that it is alive. 
+ */ + return; + } } rd_len = min(536 - len, ntohs(skb->nh.ipv6h->payload_len) + 8); @@ -1364,25 +805,21 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ifp = ipv6_get_lladdr(dev); if (ifp == NULL) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "redirect: no link_local addr for dev\n"); -#endif + ND_PRINTK1("redirect: no link_local addr for dev\n"); return; } buff = sock_alloc_send_skb(sk, MAX_HEADER + len + dev->hard_header_len + 15, 0, 0, &err); if (buff == NULL) { -#if ND_DEBUG >= 2 - printk(KERN_DEBUG "ndisc_send_redirect: alloc_skb failed\n"); -#endif + ND_PRINTK1("ndisc_send_redirect: alloc_skb failed\n"); return; } hlen = 0; if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) { - kfree_skb(buff, FREE_WRITE); + kfree_skb(buff); return; } @@ -1409,29 +846,8 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, * include target_address option */ - if (ta_len) { - int zb; - - *(opt++) = ND_OPT_TARGET_LL_ADDR; - *(opt++) = ta_len; - - memcpy(opt, neigh->ha, neigh->dev->addr_len); - opt += neigh->dev->addr_len; - - /* - * if link layer address doesn't end on a 8 byte - * boundary memset(0) the remider - */ - - zb = (neigh->dev->addr_len + 2) & 0x7; - if (zb) { - int comp; - - comp = 8 - zb; - memset(opt, 0, comp); - opt += comp; - } - } + if (dev->addr_len) + opt = ndisc_fill_option(opt, ND_OPT_TARGET_LL_ADDR, neigh->ha, dev->addr_len); /* * build redirect option and copy skb over to the new packet. @@ -1451,24 +867,37 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, dev_queue_xmit(buff); } -/* Called by upper layers to validate neighbour cache entries. 
*/ - -void ndisc_validate(struct neighbour *neigh) +static __inline__ struct neighbour * +ndisc_recv_ns(struct in6_addr *saddr, struct sk_buff *skb) { - struct nd_neigh *ndn = (struct nd_neigh *) neigh; + u8 *opt; - if (neigh == NULL) - return; + opt = skb->h.raw; + opt += sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + opt = ndisc_find_option(opt, skb->dev->addr_len+2, skb->tail - opt, ND_OPT_SOURCE_LL_ADDR); + + return neigh_event_ns(&nd_tbl, opt, saddr, skb->dev); +} + +static __inline__ int ndisc_recv_na(struct neighbour *neigh, struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *) skb->h.raw; + u8 *opt; - if (ndn->ndn_nud_state == NUD_INCOMPLETE) - return; + opt = skb->h.raw; + opt += sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + opt = ndisc_find_option(opt, skb->dev->addr_len+2, skb->tail - opt, ND_OPT_TARGET_LL_ADDR); - if (ndn->ndn_nud_state == NUD_DELAY) - ndisc_del_timer(ndn); + return neigh_update(neigh, opt, + msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, + msg->icmph.icmp6_override, 1); +} - nd_stats.rcv_upper_conf++; - ndn->ndn_nud_state = NUD_REACHABLE; - ndn->ndn_tstamp = jiffies; +static void pndisc_redo(struct sk_buff *skb) +{ + ndisc_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + NULL, skb->len); + kfree_skb(skb); } int ndisc_rcv(struct sk_buff *skb, struct device *dev, @@ -1476,27 +905,24 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, struct ipv6_options *opt, unsigned short len) { struct nd_msg *msg = (struct nd_msg *) skb->h.raw; - struct nd_neigh *ndn; + struct neighbour *neigh; struct inet6_ifaddr *ifp; - NDBG(("ndisc_rcv(type=%d) ", msg->icmph.icmp6_type)); switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: - NDBG(("NS ")); - if ((ifp = ipv6_chk_addr(&msg->target)) != NULL) { + if ((ifp = ipv6_chk_addr(&msg->target, dev, 1)) != NULL) { int addr_type = ipv6_addr_type(saddr); + if (ifp->flags & ADDR_INVALID) + return 0; if (ifp->flags & DAD_INCOMPLETE) { /* 
Address is tentative. If the source is unspecified address, it is someone does DAD, otherwise we ignore solicitations until DAD timer expires. */ - if (addr_type == IPV6_ADDR_ANY) { - printk(KERN_INFO "%s: duplicate address detected!\n", - ifp->idev->dev->name); - del_timer(&ifp->timer); - } + if (addr_type == IPV6_ADDR_ANY) + addrconf_dad_failure(ifp); return 0; } @@ -1505,51 +931,80 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, ipv6_addr_all_nodes(&maddr); ndisc_send_na(dev, NULL, &maddr, &ifp->addr, - ifp->idev->router, 0, 1, 1); + ifp->idev->cnf.forwarding, 0, 1, 1); return 0; } if (addr_type & IPV6_ADDR_UNICAST) { - int inc; + int inc = ipv6_addr_type(daddr)&IPV6_ADDR_MULTICAST; + + if (inc) + nd_tbl.stats.rcv_probes_mcast++; + else + nd_tbl.stats.rcv_probes_ucast++; /* * update / create cache entry * for the source adddress */ - nd_stats.rcv_probes_ucast++; + neigh = ndisc_recv_ns(saddr, skb); - ndn = ndisc_event_ns(saddr, skb); + if (neigh) { + ndisc_send_na(dev, neigh, saddr, &ifp->addr, + ifp->idev->cnf.forwarding, 1, inc, inc); + neigh_release(neigh); + } + } + } else { + struct inet6_dev *in6_dev = ipv6_get_idev(dev); + int addr_type = ipv6_addr_type(saddr); - if (ndn == NULL) + if (in6_dev && in6_dev->cnf.forwarding && + (addr_type & IPV6_ADDR_UNICAST) && + pneigh_lookup(&nd_tbl, &msg->target, dev, 0)) { + int inc = ipv6_addr_type(daddr)&IPV6_ADDR_MULTICAST; + + if (skb->stamp.tv_sec == 0 || + skb->pkt_type == PACKET_HOST || + inc == 0 || + in6_dev->nd_parms->proxy_delay == 0) { + if (inc) + nd_tbl.stats.rcv_probes_mcast++; + else + nd_tbl.stats.rcv_probes_ucast++; + + neigh = ndisc_recv_ns(saddr, skb); + + if (neigh) { + ndisc_send_na(dev, neigh, saddr, &msg->target, + 1, 0, inc, inc); + neigh_release(neigh); + } + } else { + /* Hack. 
It will be freed upon exit from + ndisc_rcv + */ + atomic_inc(&skb->users); + pneigh_enqueue(&nd_tbl, in6_dev->nd_parms, skb); return 0; - - inc = ipv6_addr_type(daddr); - inc &= IPV6_ADDR_MULTICAST; - - ndisc_send_na(dev, ndn, saddr, &ifp->addr, - ifp->idev->router, 1, inc, inc); - } else { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "ns: non unicast saddr\n"); -#endif + } } } - break; + return 0; case NDISC_NEIGHBOUR_ADVERTISEMENT: - NDBG(("NA ")); if ((ipv6_addr_type(saddr)&IPV6_ADDR_MULTICAST) && msg->icmph.icmp6_solicited) { - printk(KERN_DEBUG "NDISC: solicited NA is multicasted\n"); + ND_PRINTK0("NDISC: solicited NA is multicasted\n"); return 0; } - if ((ifp = ipv6_chk_addr(&msg->target))) { + /* BUG! Target can be link-local on ANOTHER interface. Fixed. */ + if ((ifp = ipv6_chk_addr(&msg->target, dev, 1))) { + if (ifp->flags & ADDR_INVALID) + return 0; if (ifp->flags & DAD_INCOMPLETE) { - /* Address is duplicate. */ - printk(KERN_INFO "%s: duplicate address detected!\n", - ifp->idev->dev->name); - del_timer(&ifp->timer); + addrconf_dad_failure(ifp); return 0; } /* What should we make now? The advertisement @@ -1557,18 +1012,14 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, about it. 
It could be misconfiguration, or an smart proxy agent tries to help us :-) */ - printk(KERN_DEBUG "%s: someone avertise our address!\n", - ifp->idev->dev->name); + ND_PRINTK0("%s: someone avertise our address!\n", + ifp->idev->dev->name); return 0; } - neigh_table_lock(&nd_tbl); - ndn = (struct nd_neigh *) - neigh_lookup(&nd_tbl, (void *) &msg->target, - sizeof(struct in6_addr), skb->dev); - neigh_table_unlock(&nd_tbl); - - if (ndn) { - if (ndn->ndn_flags & NCF_ROUTER) { + neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 0); + + if (neigh) { + if (neigh->flags & NTF_ROUTER) { if (msg->icmph.icmp6_router == 0) { /* * Change: router to host @@ -1583,99 +1034,91 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, } } else { if (msg->icmph.icmp6_router) - ndn->ndn_flags |= NCF_ROUTER; + neigh->flags |= NTF_ROUTER; } - ndisc_event_na(ndn, (unsigned char *) &msg->opt, - skb->tail - (u8 *)&msg->opt /*opt_len*/, - msg->icmph.icmp6_solicited, - msg->icmph.icmp6_override); + + ndisc_recv_na(neigh, skb); + neigh_release(neigh); } break; - }; - - if (ipv6_config.forwarding == 0) { - switch (msg->icmph.icmp6_type) { - case NDISC_ROUTER_ADVERTISEMENT: - NDBG(("RA ")); - if (ipv6_config.accept_ra) - ndisc_router_discovery(skb); - break; + case NDISC_ROUTER_ADVERTISEMENT: + ndisc_router_discovery(skb); + break; - case NDISC_REDIRECT: - NDBG(("REDIR ")); - if (ipv6_config.accept_redirects) - ndisc_redirect_rcv(skb); - break; - }; - } + case NDISC_REDIRECT: + ndisc_redirect_rcv(skb); + break; + }; return 0; } #ifdef CONFIG_PROC_FS -int ndisc_get_info(char *buffer, char **start, off_t offset, int length, - int dummy) +#ifndef CONFIG_RTNETLINK +int ndisc_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { + int len=0; + off_t pos=0; + int size; unsigned long now = jiffies; - int len = 0; int i; neigh_table_lock(&nd_tbl); - for (i = 0; i < nd_tbl.tbl_size; i++) { - struct neighbour *neigh, *head; - head = nd_tbl.hash_buckets[i]; - - if ((neigh = 
head) == NULL) - continue; + for (i = 0; i <= NEIGH_HASHMASK; i++) { + struct neighbour *neigh; - do { - struct nd_neigh *ndn = (struct nd_neigh *) neigh; + for (neigh = nd_tbl.hash_buckets[i]; neigh; neigh = neigh->next) { int j; + size = 0; for (j=0; j<16; j++) { - sprintf(buffer + len, "%02x", - ndn->ndn_addr.s6_addr[j]); - len += 2; + sprintf(buffer+len+size, "%02x", neigh->primary_key[j]); + size += 2; } - len += sprintf(buffer + len, - " %02x %02x %02x %02x %08lx %08lx %08lx %04x %04x %04lx %8s ", i, - ndn->ndn_plen, - ndn->ndn_type, - ndn->ndn_nud_state, - ndn->ndn_expires ? ndn->ndn_expires - now : 0, - now - ndn->ndn_tstamp, - nd_reachable_time, - nd_gc_staletime, - atomic_read(&ndn->ndn_refcnt), - ndn->ndn_flags, - ndn->ndn_dev ? ndn->ndn_dev->name : "NULLDEV"); - - if ((ndn->ndn_flags & NTF_COMPLETE)) { - for (j=0; j< neigh->dev->addr_len; j++) { - sprintf(buffer + len, "%02x", - neigh->ha[j]); - len += 2; + size += sprintf(buffer+len+size, + " %02x %02x %02x %02x %08lx %08lx %08x %04x %04x %04x %8s ", i, + 128, + neigh->type, + neigh->nud_state, + now - neigh->used, + now - neigh->confirmed, + neigh->parms->reachable_time, + neigh->parms->gc_staletime, + atomic_read(&neigh->refcnt), + neigh->flags | (!neigh->hh ? 0 : (neigh->hh->hh_output==dev_queue_xmit ? 
4 : 2)), + neigh->dev->name); + + if ((neigh->nud_state&NUD_VALID) && neigh->dev->addr_len) { + for (j=0; j < neigh->dev->addr_len; j++) { + sprintf(buffer+len+size, "%02x", neigh->ha[j]); + size += 2; } } else { - len += sprintf(buffer + len, "000000000000"); + size += sprintf(buffer+len+size, "000000000000"); } - len += sprintf(buffer + len, "\n"); - - neigh = neigh->next; - } while (neigh != head); + size += sprintf(buffer+len+size, "\n"); + len += size; + pos += size; + + if (pos <= offset) + len=0; + if (pos >= offset+length) + goto done; + } } +done: neigh_table_unlock(&nd_tbl); - - *start = buffer + offset; - - len -= offset; - if (len > length) - len = length; + *start = buffer+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; /* Start slop */ + if (len>length) + len = length; /* Ending slop */ + if (len<0) + len = 0; return len; } @@ -1686,8 +1129,11 @@ struct proc_dir_entry ndisc_proc_entry = 0, NULL, &ndisc_get_info }; +#endif #endif /* CONFIG_PROC_FS */ + + __initfunc(void ndisc_init(struct net_proto_family *ops)) { struct sock *sk; @@ -1700,52 +1146,47 @@ __initfunc(void ndisc_init(struct net_proto_family *ops)) ndisc_socket->inode = &ndisc_inode; ndisc_socket->state = SS_UNCONNECTED; - ndisc_socket->type=SOCK_RAW; + ndisc_socket->type = SOCK_RAW; if((err=ops->create(ndisc_socket, IPPROTO_ICMPV6))<0) printk(KERN_DEBUG "Failed to create the NDISC control socket.\n"); + /* Eeeh... What is it? --ANK */ MOD_DEC_USE_COUNT; sk = ndisc_socket->sk; sk->allocation = GFP_ATOMIC; sk->net_pinfo.af_inet6.hop_limit = 255; sk->net_pinfo.af_inet6.priority = 15; + /* Do not loopback ndisc messages */ + sk->net_pinfo.af_inet6.mc_loop = 0; sk->num = 256; /* * Initialize the neighbour table */ - neigh_table_init(&nd_tbl, &nd_neigh_ops, NCACHE_NUM_BUCKETS); - - /* General ND state machine timer. 
*/ - init_timer(&ndisc_timer); - ndisc_timer.function = ndisc_timer_handler; - ndisc_timer.data = 0L; - ndisc_timer.expires = 0L; - - /* ND GC timer */ - init_timer(&ndisc_gc_timer); - ndisc_gc_timer.function = ndisc_periodic_timer; - ndisc_gc_timer.data = 0L; - ndisc_gc_timer.expires = jiffies + nd_gc_interval; - - add_timer(&ndisc_gc_timer); + neigh_table_init(&nd_tbl); #ifdef CONFIG_PROC_FS +#ifndef CONFIG_RTNETLINK proc_net_register(&ndisc_proc_entry); #endif +#endif +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, "ipv6"); +#endif } #ifdef MODULE void ndisc_cleanup(void) { #ifdef CONFIG_PROC_FS +#ifndef CONFIG_RTNETLINK proc_net_unregister(ndisc_proc_entry.low_ino); #endif - del_timer(&ndisc_gc_timer); - del_timer(&ndisc_timer); +#endif + neigh_table_clear(&nd_tbl); } #endif diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 17af36fe6..4ee1b13ad 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.13 1997/09/14 08:32:14 davem Exp $ + * $Id: raw.c,v 1.16 1997/12/29 19:52:48 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -25,6 +25,7 @@ #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> +#include <asm/uaccess.h> #include <net/sock.h> #include <net/snmp.h> @@ -98,7 +99,7 @@ static void raw_v6_rehash(struct sock *sk) SOCKHASH_UNLOCK(); } -static int __inline__ inet6_mc_check(struct sock *sk, struct in6_addr *addr) +static __inline__ int inet6_mc_check(struct sock *sk, struct in6_addr *addr) { struct ipv6_mc_socklist *mc; @@ -165,7 +166,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (ipv6_chk_addr(&addr->sin6_addr) == NULL) + if (ipv6_chk_addr(&addr->sin6_addr, NULL, 0) == NULL) return(-EADDRNOTAVAIL); } } @@ -193,7 
+194,7 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) /* Charge it to the socket. */ if (sock_queue_rcv_skb(sk,skb)<0) { /* ip_statistics.IpInDiscards++; */ - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -235,13 +236,11 @@ int rawv6_rcv(struct sk_buff *skb, struct device *dev, */ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, - int noblock, int flags,int *addr_len) + int noblock, int flags, int *addr_len) { - struct sockaddr_in6 *sin6=(struct sockaddr_in6 *)msg->msg_name; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; struct sk_buff *skb; - int copied=0; - int err; - + int copied, err; if (flags & MSG_OOB) return -EOPNOTSUPP; @@ -252,32 +251,32 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, if (addr_len) *addr_len=sizeof(*sin6); - skb=skb_recv_datagram(sk, flags, noblock, &err); - if(skb==NULL) - return err; + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; copied = min(len, skb->tail - skb->h.raw); err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); sk->stamp=skb->stamp; - if (err) - return err; + goto out_free; /* Copy the address. 
*/ if (sin6) { sin6->sin6_family = AF_INET6; memcpy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr, sizeof(struct in6_addr)); - - *addr_len = sizeof(struct sockaddr_in6); } if (msg->msg_controllen) datagram_recv_ctl(sk, msg, skb); + err = copied; +out_free: skb_free_datagram(sk, skb); - return (copied); +out: + return err; } /* @@ -359,7 +358,15 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) int hlimit = -1; u16 proto; int err; - + + /* Rough check on arithmetic overflow, + better check is made in ip6_build_xmit + + When jumbo header will be implemeted we will remove it + at all (len will be size_t) + */ + if (len < 0 || len > 0xFFFF) + return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) @@ -389,9 +396,12 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) daddr = &sin6->sin6_addr; - if (np->dst && ipv6_addr_cmp(daddr, &np->daddr)) { - dst_release(np->dst); - np->dst = NULL; + /* BUGGGG If route is not cloned, this check always + fails, hence dst_cache only slows down tramsmission --ANK + */ + if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) { + dst_release(sk->dst_cache); + sk->dst_cache = NULL; } } else { if (sk->state != TCP_ESTABLISHED) @@ -409,12 +419,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) return(-EINVAL); } - /* - * We don't allow > 64K sends yet. - */ - if (len + (sk->ip_hdrincl ? 
0 : sizeof(struct ipv6hdr)) > 65535) - return -EMSGSIZE; - if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_options)); @@ -592,14 +596,9 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, static void rawv6_close(struct sock *sk, unsigned long timeout) { - struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - sk->state = TCP_CLOSE; - - if (np->dst) - dst_release(np->dst); - ipv6_sock_mc_close(sk); + sk->dead = 1; destroy_sock(sk); } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 35aa41b95..aa027da14 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: reassembly.c,v 1.7 1997/03/18 18:24:47 davem Exp $ + * $Id: reassembly.c,v 1.8 1997/12/29 19:52:50 kuznet Exp $ * * Based on: net/ipv4/ip_fragment.c * @@ -112,7 +112,7 @@ static void fq_free(struct frag_queue *fq) struct ipv6_frag *fp, *back; for(fp = fq->fragments; fp; ) { - kfree_skb(fp->skb, FREE_READ); + kfree_skb(fp->skb); back = fp; fp=fp->next; kfree(back); @@ -159,7 +159,7 @@ static void create_frag_entry(struct sk_buff *skb, struct device *dev, GFP_ATOMIC); if (fq == NULL) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return; } @@ -201,7 +201,7 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, GFP_ATOMIC); if (nfp == NULL) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return; } @@ -230,7 +230,7 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, } /* duplicate. discard it. 
*/ - kfree_skb(skb, FREE_READ); + kfree_skb(skb); kfree(nfp); return; } @@ -273,7 +273,9 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) payload_len = (unfrag_len + tail->offset + (tail->skb->tail - (__u8 *) (tail->fhdr + 1))); +#if 0 printk(KERN_DEBUG "reasm: payload len = %d\n", payload_len); +#endif if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) { printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n"); @@ -306,7 +308,7 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) struct ipv6_frag *back; memcpy(skb_put(skb, fp->len), (__u8*)(fp->fhdr + 1), fp->len); - kfree_skb(fp->skb, FREE_READ); + kfree_skb(fp->skb); back = fp; fp=fp->next; kfree(back); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6a412d423..28ee43e78 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.18 1997/10/17 00:15:05 freitag Exp $ + * $Id: route.c,v 1.19 1997/12/13 21:53:16 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -37,9 +37,14 @@ #include <net/ndisc.h> #include <net/addrconf.h> #include <linux/netlink.h> +#include <linux/rtnetlink.h> #include <asm/uaccess.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + #undef CONFIG_RT6_POLICY /* Set to 3 to get tracing. 
*/ @@ -51,25 +56,41 @@ #define RDBG(x) #endif +int ip6_rt_max_size = 4096; +int ip6_rt_gc_min_interval = 5*HZ; +int ip6_rt_gc_timeout = 60*HZ; +int ip6_rt_gc_interval = 30*HZ; + +static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb); +static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static int ip6_dst_gc(void); static int ip6_pkt_discard(struct sk_buff *skb); +static void ip6_link_failure(struct sk_buff *skb); struct dst_ops ip6_dst_ops = { AF_INET6, + __constant_htons(ETH_P_IPV6), + 1024, + + ip6_dst_gc, ip6_dst_check, ip6_dst_reroute, - NULL + NULL, + ip6_negative_advice, + ip6_link_failure, }; struct rt6_info ip6_null_entry = { {{NULL, ATOMIC_INIT(0), ATOMIC_INIT(0), NULL, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, + -1, 0, 0, 0, 0, 0, 0, 0, 0, + -ENETUNREACH, NULL, NULL, ip6_pkt_discard, ip6_pkt_discard, &ip6_dst_ops}}, - NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0UL, - 0, {NULL}, {{{{0}}}, 128}, {{{{0}}}, 128} + NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U, + 0, 255, {NULL}, {{{{0}}}, 128}, {{{{0}}}, 128} }; struct fib6_node ip6_routing_table = { @@ -187,6 +208,7 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, struct device *dev, int strict) { + struct rt6_info *local = NULL; struct rt6_info *sprt; RDBG(("rt6_device_match: (%p,%p,%d) ", rt, dev, strict)); @@ -196,8 +218,13 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, RDBG(("match --> %p\n", sprt)); return sprt; } + if (sprt->rt6i_dev && (sprt->rt6i_dev->flags&IFF_LOOPBACK)) + local = sprt; } + if (local) + return local; + if (strict) { RDBG(("nomatch & STRICT --> ip6_null_entry\n")); return &ip6_null_entry; @@ -220,14 +247,14 @@ static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, struct device *dev) RDBG(("rt6_best_dflt(%p,%p): ", rt, 
dev)); for (sprt = rt; sprt; sprt = sprt->u.next) { - struct nd_neigh *ndn; + struct neighbour *neigh; RDBG(("sprt(%p): ", sprt)); - if ((ndn = (struct nd_neigh *) sprt->rt6i_nexthop)) { + if ((neigh = sprt->rt6i_nexthop)) { int m = -1; - RDBG(("nxthop(%p,%d) ", ndn, ndn->ndn_nud_state)); - switch (ndn->ndn_nud_state) { + RDBG(("nxthop(%p,%d) ", neigh, neigh->nud_state)); + switch (neigh->nud_state) { case NUD_REACHABLE: RDBG(("NUD_REACHABLE ")); if (sprt != rt6_dflt_pointer) { @@ -304,14 +331,16 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, return rt; } -static struct rt6_info *rt6_cow(struct rt6_info *rt, struct in6_addr *daddr, +static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, struct in6_addr *saddr) { + struct rt6_info *rt; + /* * Clone the route. */ - rt = ip6_rt_copy(rt); + rt = ip6_rt_copy(ort); if (rt) { ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); @@ -375,6 +404,8 @@ void ip6_route_input(struct sk_buff *skb) struct dst_entry *dst; RDBG(("ip6_route_input(%p) from %p\n", skb, __builtin_return_address(0))); + if ((dst = skb->dst) != NULL) + goto looped_back; rt6_lock(); fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr); @@ -420,6 +451,7 @@ out: rt6_unlock(); skb->dst = dst; +looped_back: dst->input(skb); } @@ -432,7 +464,7 @@ struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) RDBG(("ip6_route_output(%p,%p) from(%p)", sk, fl, __builtin_return_address(0))); - strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & IPV6_ADDR_MULTICAST; + strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); rt6_lock(); #if RT6_DEBUG >= 3 @@ -461,12 +493,28 @@ struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) RDBG(("-->(%p[%s])) ", fn, fn == &ip6_routing_table ? 
"ROOT" : "!ROOT")); +restart: rt = fn->leaf; if ((rt->rt6i_flags & RTF_CACHE)) { RDBG(("RTF_CACHE ")); if (ip6_rt_policy == 0) { rt = rt6_device_match(rt, fl->dev, strict); + + /* BUGGGG! It is capital bug, that was hidden + by not-cloning multicast routes. However, + the same problem was with link-local addresses. + Fix is the following if-statement, + but it will not properly handle Pedro's subtrees --ANK + */ + if (rt == &ip6_null_entry && strict) { + while ((fn = fn->parent) != NULL) { + if (fn->fn_flags & RTN_ROOT) + goto out; + if (fn->fn_flags & RTN_RTINFO) + goto restart; + } + } RDBG(("devmatch(%p) ", rt)); goto out; } @@ -517,7 +565,7 @@ out: } -void rt6_ins(struct rt6_info *rt) +static void rt6_ins(struct rt6_info *rt) { start_bh_atomic(); if (atomic_read(&rt6_tbl_lock) == 1) @@ -529,29 +577,33 @@ void rt6_ins(struct rt6_info *rt) /* * Destination cache support functions + * + * BUGGG! This function is absolutely wrong. + * First of all it is never called. (look at include/net/dst.h) + * Second, even when it is called rt->rt6i_node == NULL + * ** partially fixed: now dst->obsolete = -1 for IPv6 not cache routes. + * Third, even we fixed previous bugs, + * it will not work because sernum is incorrectly checked/updated and + * it does not handle change of the parent of cloned route. + * Purging stray clones is not easy task, it would require + * massive remake of ip6_fib.c. Alas... 
+ * --ANK */ -struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct rt6_info *rt; - RDBG(("ip6dstchk(%p,%08x)[%p]\n", dst, cookie, - __builtin_return_address(0))); - rt = (struct rt6_info *) dst; - if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) { - if (rt->rt6i_nexthop) - ndisc_event_send(rt->rt6i_nexthop, NULL); - + if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) return dst; - } dst_release(dst); return NULL; } -struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb) +static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb) { /* * FIXME @@ -561,6 +613,39 @@ struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb) return NULL; } +static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +{ + dst_release(dst); + return NULL; +} + +static void ip6_link_failure(struct sk_buff *skb) +{ + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); +} + +static int ip6_dst_gc() +{ + static unsigned expire = 30*HZ; + static unsigned long last_gc; + unsigned long now = jiffies; + + start_bh_atomic(); + if ((long)(now - last_gc) < ip6_rt_gc_min_interval) + goto out; + + expire++; + fib6_run_gc(expire); + last_gc = now; + if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh) + expire = ip6_rt_gc_timeout; + +out: + expire >>= 1; + end_bh_atomic(); + return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size); +} + /* Clean host part of a prefix. Not necessary in radix tree, but results in cleaner routing tables. 
@@ -578,6 +663,28 @@ static void ipv6_wash_prefix(struct in6_addr *pfx, int plen) pfx->s6_addr[plen>>3] &= (0xFF<<(8-b)); } +static int ipv6_get_mtu(struct device *dev) +{ + struct inet6_dev *idev; + + idev = ipv6_get_idev(dev); + if (idev) + return idev->cnf.mtu6; + else + return 576; +} + +static int ipv6_get_hoplimit(struct device *dev) +{ + struct inet6_dev *idev; + + idev = ipv6_get_idev(dev); + if (idev) + return idev->cnf.hop_limit; + else + return ipv6_devconf.hop_limit; +} + /* * */ @@ -592,6 +699,8 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) *err = -EINVAL; return NULL; } + if (rtmsg->rtmsg_metric == 0) + rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; *err = 0; @@ -603,6 +712,9 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) goto out; } + rt->u.dst.obsolete = -1; + rt->rt6i_expires = rtmsg->rtmsg_info; + addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst); if (addr_type & IPV6_ADDR_MULTICAST) { @@ -613,7 +725,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->u.dst.input = ip6_forward; } - rt->u.dst.output = dev_queue_xmit; + rt->u.dst.output = ip6_output; if (rtmsg->rtmsg_ifindex) { dev = dev_get_by_index(rtmsg->rtmsg_ifindex); @@ -665,9 +777,16 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) *err = -EINVAL; goto out; } + } - rt->rt6i_nexthop = ndisc_get_neigh(dev, gw_addr); + if (dev == NULL) { + RDBG(("!dev, ")); + *err = -ENODEV; + goto out; + } + if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) { + rt->rt6i_nexthop = ndisc_get_neigh(dev, &rt->rt6i_gateway); if (rt->rt6i_nexthop == NULL) { RDBG(("!nxthop, ")); *err = -ENOMEM; @@ -676,16 +795,14 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) RDBG(("nxthop, ")); } - if (dev == NULL) { - RDBG(("!dev, ")); - *err = -ENODEV; - goto out; - } - rt->rt6i_metric = rtmsg->rtmsg_metric; rt->rt6i_dev = dev; - rt->u.dst.pmtu = dev->mtu; + rt->u.dst.pmtu = ipv6_get_mtu(dev); + if 
(ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) + rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS; + else + rt->rt6i_hoplimit = ipv6_get_hoplimit(dev); rt->rt6i_flags = rtmsg->rtmsg_flags; RDBG(("rt6ins(%p) ", rt)); @@ -694,6 +811,29 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt6_ins(rt); rt6_unlock(); + /* BUGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG! + + If rt6_ins will fail (and it occurs regularly f.e. if route + already existed), the route will be freed -> Finita. + Crash. No recovery. NO FIX. Unfortunately, it is not the only + place will it is fatal. It is sad, I believed this + code is a bit more accurate :-( + + Really, the problem can be solved in two ways: + + * As I did in old 2.0 IPv4: to increase use count and force + user to destroy stray route. It requires some care, + well, much more care. + * Second and the best: to get rid of this damn backlogging + system. I wonder why Pedro so liked it. It was the most + unhappy day when I invented it (well, by a strange reason + I believed that it is very clever :-)), + and when I managed to clean IPv4 of this crap, + it was really great win. + BTW I forgot how 2.0 route/arp works :-) :-) + --ANK + */ + out: if (*err) { RDBG(("dfree(%p) ", rt)); @@ -701,7 +841,17 @@ out: rt = NULL; } RDBG(("ret(%p)\n", rt)); +#if 0 return rt; +#else + /* BUGGG! For now always return NULL. (see above) + + Really, it was used only in two places, and one of them + (rt6_add_dflt_router) is repaired, ip6_fw is not essential + at all. --ANK + */ + return NULL; +#endif } int ip6_del_rt(struct rt6_info *rt) @@ -710,6 +860,12 @@ int ip6_del_rt(struct rt6_info *rt) start_bh_atomic(); + /* I'd add here couple of cli() + cli(); cli(); cli(); + + Now it is really LOCKED. 
:-) :-) --ANK + */ + rt6_dflt_pointer = NULL; if (atomic_read(&rt6_tbl_lock) == 1) @@ -723,30 +879,55 @@ int ip6_del_rt(struct rt6_info *rt) int ip6_route_del(struct in6_rtmsg *rtmsg) { + struct fib6_node *fn; struct rt6_info *rt; - struct device *dev=NULL; - /* - * Find device - */ - if(rtmsg->rtmsg_ifindex) { - dev=dev_get_by_index(rtmsg->rtmsg_ifindex); - if (dev == NULL) - return -ENODEV; - } - /* - * Find route - */ - rt=rt6_lookup(&rtmsg->rtmsg_dst, &rtmsg->rtmsg_src, dev, dev ? RTF_LINKRT : 0); + rt6_lock(); + fn = fib6_lookup(&ip6_routing_table, &rtmsg->rtmsg_dst, &rtmsg->rtmsg_src); + rt = fn->leaf; /* * Blow it away + * + * BUGGGG It will not help with Pedro's subtrees. + * We urgently need fib6_locate_node function, and + * it is not the only place where rt6_lookup is used + * for wrong purpose. + * --ANK */ - if(rt && rt->rt6i_dst.plen == rtmsg->rtmsg_dst_len && - rt->rt6i_src.plen == rtmsg->rtmsg_src_len) { - ip6_del_rt(rt); - return 0; +restart: + if (rt && rt->rt6i_src.plen == rtmsg->rtmsg_src_len) { + if (rt->rt6i_dst.plen > rtmsg->rtmsg_dst_len) { + struct fib6_node *fn = rt->rt6i_node; + while ((fn = fn->parent) != NULL) { + if (fn->fn_flags & RTN_ROOT) + break; + if (fn->fn_flags & RTN_RTINFO) { + rt = fn->leaf; + goto restart; + } + } + } + + if (rt->rt6i_dst.plen == rtmsg->rtmsg_dst_len) { + for ( ; rt; rt = rt->u.next) { + if (rtmsg->rtmsg_ifindex && + (rt->rt6i_dev == NULL || + rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex)) + continue; + if (rtmsg->rtmsg_flags&RTF_GATEWAY && + ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway)) + continue; + if (rtmsg->rtmsg_metric && + rtmsg->rtmsg_metric != rt->rt6i_metric) + continue; + ip6_del_rt(rt); + rt6_unlock(); + return 0; + } + } } + rt6_unlock(); return -ESRCH; } @@ -773,7 +954,7 @@ void __rt6_run_bh(void) rt6_bh_mask = 0; } -#ifdef CONFIG_NETLINK +#ifdef CONFIG_IPV6_NETLINK /* * NETLINK interface * routing socket moral equivalent @@ -785,6 +966,7 @@ static int rt6_msgrcv(int unit, struct 
sk_buff *skb) struct in6_rtmsg *rtmsg; int err; + rtnl_lock(); while (skb->len) { if (skb->len < sizeof(struct in6_rtmsg)) { count = -EINVAL; @@ -809,10 +991,10 @@ static int rt6_msgrcv(int unit, struct sk_buff *skb) } out: - kfree_skb(skb, FREE_READ); + rtnl_unlock(); + kfree_skb(skb); return count; } -#endif /* CONFIG_NETLINK */ static void rt6_sndrtmsg(struct in6_rtmsg *rtmsg) { @@ -825,10 +1007,8 @@ static void rt6_sndrtmsg(struct in6_rtmsg *rtmsg) memcpy(skb_put(skb, sizeof(struct in6_rtmsg)), &rtmsg, sizeof(struct in6_rtmsg)); -#ifdef CONFIG_NETLINK if (netlink_post(NETLINK_ROUTE6, skb)) -#endif - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } void rt6_sndmsg(int type, struct in6_addr *dst, struct in6_addr *src, @@ -867,11 +1047,10 @@ void rt6_sndmsg(int type, struct in6_addr *dst, struct in6_addr *src, msg->rtmsg_flags = flags; -#ifdef CONFIG_NETLINK if (netlink_post(NETLINK_ROUTE6, skb)) -#endif - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } +#endif /* CONFIG_IPV6_NETLINK */ /* * Handle redirects @@ -888,6 +1067,12 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, if (rt == NULL || rt->u.dst.error) return NULL; + /* Redirect received -> path was valid. + Look, redirects are sent only in response to data packets, + so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->u.dst); + /* Duplicate redirect: silently ignore. */ if (ipv6_addr_cmp(target, &rt->rt6i_gateway) == 0) return NULL; @@ -931,21 +1116,32 @@ source_ok: * We have finally decided to accept it. */ if (rt->rt6i_dst.plen == 128) { + /* BUGGGG! Very bad bug. Fast path code does not protect + * itself of changing nexthop on the fly, it was supposed + * that crucial parameters (dev, nexthop, hh) ARE VOLATILE. + * --ANK + * Not fixed!! I plugged it to avoid random crashes + * (they are very unlikely, but I do not want to shrug + * every time when redirect arrives) + * but the plug must be removed. --ANK + */ + +#if 0 /* * Already a host route. 
* */ if (rt->rt6i_nexthop) neigh_release(rt->rt6i_nexthop); - /* - * purge hh_cache - */ rt->rt6i_flags |= RTF_MODIFIED | RTF_CACHE; if (on_link) rt->rt6i_flags &= ~RTF_GATEWAY; ipv6_addr_copy(&rt->rt6i_gateway, target); rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, target); return rt; +#else + return NULL; +#endif } nrt = ip6_rt_copy(rt); @@ -959,12 +1155,15 @@ source_ok: ipv6_addr_copy(&nrt->rt6i_gateway, target); nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, target); nrt->rt6i_dev = dev; - nrt->u.dst.pmtu = dev->mtu; + nrt->u.dst.pmtu = ipv6_get_mtu(dev); + if (!ipv6_addr_is_multicast(&nrt->rt6i_dst.addr)) + nrt->rt6i_hoplimit = ipv6_get_hoplimit(dev); rt6_lock(); rt6_ins(nrt); rt6_unlock(); + /* BUGGGGGGG! nrt can point to nowhere. */ return nrt; } @@ -975,7 +1174,7 @@ source_ok: void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) { - struct rt6_info *rt; + struct rt6_info *rt, *nrt; if (pmtu < 576 || pmtu > 65536) { #if RT6_DEBUG >= 1 @@ -994,13 +1193,21 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) return; } + if (pmtu >= rt->u.dst.pmtu) + return; + + /* New mtu received -> path was valid. + They are sent only in response to data packets, + so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->u.dst); + /* It is wrong, but I plugged the hole here. On-link routes are cloned differently, look at rt6_redirect --ANK */ - if (!(rt->rt6i_flags&RTF_GATEWAY)) { + if (!(rt->rt6i_flags&RTF_GATEWAY)) return; - } if (rt->rt6i_dst.plen == 128) { /* @@ -1012,11 +1219,18 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) return; } - rt = ip6_rt_copy(rt); - ipv6_addr_copy(&rt->rt6i_dst.addr, addr); - rt->rt6i_dst.plen = 128; + nrt = ip6_rt_copy(rt); + ipv6_addr_copy(&nrt->rt6i_dst.addr, addr); + nrt->rt6i_dst.plen = 128; - rt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); + nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); + + /* It was missing. 
:-) :-) + I wonder, kernel was deemed to crash after pkt_too_big + and nobody noticed it. Hey, guys, do someone really + use it? --ANK + */ + nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); rt6_lock(); rt6_ins(rt); @@ -1027,7 +1241,7 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) * Misc support functions */ -struct rt6_info * ip6_rt_copy(struct rt6_info *ort) +static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) { struct rt6_info *rt; @@ -1038,8 +1252,9 @@ struct rt6_info * ip6_rt_copy(struct rt6_info *ort) rt->u.dst.output = ort->u.dst.output; rt->u.dst.pmtu = ort->u.dst.pmtu; + rt->rt6i_hoplimit = ort->rt6i_hoplimit; rt->rt6i_dev = ort->rt6i_dev; - + ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); rt->rt6i_keylen = ort->rt6i_keylen; rt->rt6i_flags = ort->rt6i_flags; @@ -1076,7 +1291,7 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct device *dev) for (rt = fn->leaf; rt; rt=rt->u.next) { if (dev == rt->rt6i_dev && - ipv6_addr_cmp(&rt->rt6i_dst.addr, addr) == 0) + ipv6_addr_cmp(&rt->rt6i_gateway, addr) == 0) break; } @@ -1117,6 +1332,10 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, rt = ip6_route_add(&rtmsg, &err); + /* BUGGGGGGGGGGGGGGGGGGGG! + rt can be not NULL, but point to heavens. 
+ */ + if (err) { printk(KERN_DEBUG "rt6_add_dflt: ip6_route_add error %d\n", err); @@ -1172,6 +1391,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) if (err) return -EFAULT; + rtnl_lock(); switch (cmd) { case SIOCADDRT: ip6_route_add(&rtmsg, &err); @@ -1182,9 +1402,12 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) default: err = -EINVAL; }; + rtnl_unlock(); +#ifdef CONFIG_IPV6_NETLINK if (err == 0) rt6_sndrtmsg(&rtmsg); +#endif return err; }; @@ -1198,7 +1421,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) int ip6_pkt_discard(struct sk_buff *skb) { ipv6_statistics.Ip6OutNoRoutes++; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -1229,15 +1452,20 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) if (rt == NULL) return -ENOMEM; - memset(rt, 0, sizeof(struct rt6_info)); - rt->u.dst.input = ip6_input; - rt->u.dst.output = dev_queue_xmit; + rt->u.dst.output = ip6_output; rt->rt6i_dev = dev_get("lo"); - rt->u.dst.pmtu = rt->rt6i_dev->mtu; + rt->u.dst.pmtu = ipv6_get_mtu(rt->rt6i_dev); + rt->rt6i_hoplimit = ipv6_get_hoplimit(rt->rt6i_dev); + rt->u.dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + if (rt->rt6i_nexthop == NULL) { + dst_free((struct dst_entry *) rt); + return -ENOMEM; + } + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; @@ -1248,6 +1476,21 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) return 0; } +/* Delete address. Warning: you should check that this address + disappeared before calling this function. 
+ */ + +int ip6_rt_addr_del(struct in6_addr *addr, struct device *dev) +{ + struct rt6_info *rt; + + rt = rt6_lookup(addr, NULL, dev_get("lo"), RTF_LINKRT); + if (rt && rt->rt6i_dst.plen == 128) + return ip6_del_rt(rt); + + return 0; +} + #ifdef CONFIG_RT6_POLICY static int rt6_flow_match_in(struct rt6_info *rt, struct sk_buff *skb) @@ -1355,18 +1598,299 @@ found: goto error; nrt->rt6i_flags |= RTF_CACHE; + /* BUGGGG! nrt can point to nowhere! */ rt6_ins(nrt); return nrt; } #endif +/* + * Nope, I am not idiot. I see that it is the ugliest of ugly routines. + * Anyone is advertised to write better one. --ANK + */ + +struct rt6_ifdown_arg { + struct device *dev; + struct rt6_info *rt; +}; + + +static void rt6_ifdown_node(struct fib6_node *fn, void *p_arg) +{ + struct rt6_info *rt; + struct rt6_ifdown_arg *arg = (struct rt6_ifdown_arg *) p_arg; + + if (arg->rt != NULL) + return; + + for (rt = fn->leaf; rt; rt = rt->u.next) { + if (rt->rt6i_dev == arg->dev || arg->dev == NULL) { + arg->rt = rt; + return; + } + } +} + +void rt6_ifdown(struct device *dev) +{ + int count = 0; + struct rt6_ifdown_arg arg; + struct rt6_info *rt; + + do { + arg.dev = dev; + arg.rt = NULL; + fib6_walk_tree(&ip6_routing_table, rt6_ifdown_node, &arg, + RT6_FILTER_RTNODES); + if (arg.rt != NULL) + ip6_del_rt(arg.rt); + count++; + } while (arg.rt != NULL); + + /* And default routes ... 
*/ + + for (rt = ip6_routing_table.leaf; rt; ) { + if (rt != &ip6_null_entry && (rt->rt6i_dev == dev || dev == NULL)) { + struct rt6_info *deleting = rt; + rt = rt->u.next; + ip6_del_rt(deleting); + continue; + } + rt = rt->u.next; + } +} + +#ifdef CONFIG_RTNETLINK + +static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta, + struct in6_rtmsg *rtmsg) +{ + memset(rtmsg, 0, sizeof(*rtmsg)); + + rtmsg->rtmsg_dst_len = r->rtm_dst_len; + rtmsg->rtmsg_src_len = r->rtm_src_len; + rtmsg->rtmsg_flags = RTF_UP; + rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; + + if (rta[RTA_GATEWAY-1]) { + if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16); + rtmsg->rtmsg_flags |= RTF_GATEWAY; + } + if (rta[RTA_DST-1]) { + if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3)); + } + if (rta[RTA_SRC-1]) { + if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3)); + } + if (rta[RTA_OIF-1]) { + if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int))) + return -EINVAL; + memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); + } + if (rta[RTA_PRIORITY-1]) { + if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4); + } + return 0; +} + +int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtmsg *r = NLMSG_DATA(nlh); + struct in6_rtmsg rtmsg; + + if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) + return -EINVAL; + return ip6_route_del(&rtmsg); +} + +int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtmsg *r = NLMSG_DATA(nlh); + struct in6_rtmsg rtmsg; + int err = 0; + + if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) + return -EINVAL; + ip6_route_add(&rtmsg, 
&err); + return err; +} + + +struct rt6_rtnl_dump_arg +{ + struct sk_buff *skb; + struct netlink_callback *cb; + int skip; + int count; + int stop; +}; + +static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, + int type, pid_t pid, u32 seq) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; +#ifdef CONFIG_RTNL_OLD_IFINFO + unsigned char *o; +#else + struct rtattr *mx; +#endif + struct rta_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET6; + rtm->rtm_dst_len = rt->rt6i_dst.plen; + rtm->rtm_src_len = rt->rt6i_src.plen; + rtm->rtm_tos = 0; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; +#ifdef CONFIG_RTNL_OLD_IFINFO + rtm->rtm_nhs = 0; +#endif + rtm->rtm_protocol = RTPROT_BOOT; + if (rt->rt6i_flags&RTF_DYNAMIC) + rtm->rtm_protocol = RTPROT_REDIRECT; + else if (rt->rt6i_flags&(RTF_ADDRCONF|RTF_ALLONLINK)) + rtm->rtm_protocol = RTPROT_KERNEL; + else if (rt->rt6i_flags&RTF_DEFAULT) + rtm->rtm_protocol = RTPROT_RA; + + if (rt->rt6i_flags&RTF_CACHE) + rtm->rtm_flags |= RTM_F_CLONED; + +#ifdef CONFIG_RTNL_OLD_IFINFO + o = skb->tail; +#endif + if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); + if (rtm->rtm_src_len) + RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); +#ifdef CONFIG_RTNL_OLD_IFINFO + if (rt->u.dst.pmtu) + RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + if (rt->u.dst.window) + RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); + if (rt->u.dst.rtt) + RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); +#else + mx = (struct rtattr*)skb->tail; + RTA_PUT(skb, RTA_METRICS, 0, NULL); + if (rt->u.dst.pmtu) + RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + if (rt->u.dst.window) + RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window); + if (rt->u.dst.rtt) + RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), 
&rt->u.dst.rtt); + mx->rta_len = skb->tail - (u8*)mx; +#endif + if (rt->u.dst.neighbour) + RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex); + RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric); + ci.rta_lastuse = jiffies - rt->u.dst.lastuse; + if (rt->rt6i_expires) + ci.rta_expires = rt->rt6i_expires - jiffies; + else + ci.rta_expires = 0; + ci.rta_used = 0; + ci.rta_clntref = atomic_read(&rt->u.dst.use); + ci.rta_error = rt->u.dst.error; + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); +#ifdef CONFIG_RTNL_OLD_IFINFO + rtm->rtm_optlen = skb->tail - o; +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static void rt6_dump_node(struct fib6_node *fn, void *p_arg) +{ + struct rt6_info *rt; + struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; + + if (arg->stop) + return; + + for (rt = fn->leaf; rt; rt = rt->u.next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (rt6_fill_node(arg->skb, rt, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq) <= 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + + +int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rt6_rtnl_dump_arg arg; + + arg.skb = skb; + arg.cb = cb; + arg.skip = cb->args[0]; + arg.count = 0; + arg.stop = 0; + start_bh_atomic(); + fib6_walk_tree(&ip6_routing_table, rt6_dump_node, &arg, RT6_FILTER_RTNODES); + if (arg.stop == 0) + rt6_dump_node(&ip6_routing_table, &arg); + end_bh_atomic(); + cb->args[0] = arg.count; + return skb->len; +} + +void inet6_rt_notify(int event, struct rt6_info *rt) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); + return; + } + if (rt6_fill_node(skb, rt, event, 0, 0) < 
0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, GFP_ATOMIC); +} + +#endif + /* * /proc */ #ifdef CONFIG_PROC_FS + #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1) struct rt6_proc_arg { @@ -1411,11 +1935,8 @@ static void rt6_info_node(struct fib6_node *fn, void *p_arg) if (rt->rt6i_nexthop) { for (i=0; i<16; i++) { - struct nd_neigh *ndn; - - ndn = (struct nd_neigh *) rt->rt6i_nexthop; sprintf(arg->buffer + arg->len, "%02x", - ndn->ndn_addr.s6_addr[i]); + rt->rt6i_nexthop->primary_key[i]); arg->len += 2; } } else { @@ -1424,7 +1945,7 @@ static void rt6_info_node(struct fib6_node *fn, void *p_arg) arg->len += 32; } arg->len += sprintf(arg->buffer + arg->len, - " %08lx %08x %08x %08lx %8s\n", + " %08x %08x %08x %08x %8s\n", rt->rt6i_metric, atomic_read(&rt->rt6i_use), atomic_read(&rt->rt6i_ref), rt->rt6i_flags, rt->rt6i_dev ? rt->rt6i_dev->name : ""); @@ -1528,6 +2049,7 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length, return arg.len; } + extern struct rt6_statistics rt6_stats; static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length, @@ -1558,28 +2080,73 @@ static struct proc_dir_entry proc_rt6_info = { 0, &proc_net_inode_operations, rt6_proc_info }; -static struct proc_dir_entry proc_rt6_stats = { - PROC_NET_RT6_STATS, 9, "rt6_stats", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - rt6_proc_stats -}; static struct proc_dir_entry proc_rt6_tree = { PROC_NET_RT6_TREE, 7, "ip6_fib", S_IFREG | S_IRUGO, 1, 0, 0, 0, &proc_net_inode_operations, rt6_proc_tree }; +static struct proc_dir_entry proc_rt6_stats = { + PROC_NET_RT6_STATS, 9, "rt6_stats", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rt6_proc_stats +}; #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SYSCTL + +static int flush_delay; + +static +int 
ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp); + if (flush_delay < 0) + flush_delay = 0; + start_bh_atomic(); + fib6_run_gc((unsigned long)flush_delay); + end_bh_atomic(); + return 0; + } else + return -EINVAL; +} + +ctl_table ipv6_route_table[] = { + {NET_IPV6_ROUTE_FLUSH, "flush", + &flush_delay, sizeof(int), 0644, NULL, + &ipv6_sysctl_rtcache_flush}, + {NET_IPV6_ROUTE_GC_THRESH, "gc_thresh", + &ip6_dst_ops.gc_thresh, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV6_ROUTE_MAX_SIZE, "max_size", + &ip6_rt_max_size, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval", + &ip6_rt_gc_min_interval, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout", + &ip6_rt_gc_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval", + &ip6_rt_gc_interval, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {0} +}; + +#endif + + __initfunc(void ip6_route_init(void)) { #ifdef CONFIG_PROC_FS proc_net_register(&proc_rt6_info); - proc_net_register(&proc_rt6_stats); proc_net_register(&proc_rt6_tree); + proc_net_register(&proc_rt6_stats); #endif -#ifdef CONFIG_NETLINK +#ifdef CONFIG_IPV6_NETLINK netlink_attach(NETLINK_ROUTE6, rt6_msgrcv); #endif } @@ -1592,11 +2159,9 @@ void ip6_route_cleanup(void) proc_net_unregister(PROC_NET_RT6_TREE); proc_net_unregister(PROC_NET_RT6_STATS); #endif -#ifdef CONFIG_NETLINK +#ifdef CONFIG_IPV6_NETLINK netlink_detach(NETLINK_ROUTE6); #endif -#if 0 - fib6_flush(); -#endif + rt6_ifdown(NULL); } #endif /* MODULE */ diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 4ff6e28d8..f029942df 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.23 1997/11/08 18:15:49 kuznet Exp $ + * $Id: sit.c,v 
1.24 1997/12/13 21:53:17 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,7 +14,6 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #define __NO_VERSION__ #include <linux/module.h> #include <linux/errno.h> @@ -330,7 +329,7 @@ void ipip6_err(struct sk_buff *skb, unsigned char *dp, int len) icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); } } - kfree_skb(skb2, FREE_WRITE); + kfree_skb(skb2); return; #endif } @@ -359,7 +358,7 @@ int ipip6_rcv(struct sk_buff *skb, unsigned short len) } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -393,17 +392,17 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) goto tx_error; if (!dst) { - struct nd_neigh *neigh = NULL; + struct neighbour *neigh = NULL; if (skb->dst) - neigh = (struct nd_neigh *) skb->dst->neighbour; + neigh = skb->dst->neighbour; if (neigh == NULL) { printk(KERN_DEBUG "sit: nexthop == NULL\n"); goto tx_error; } - addr6 = &neigh->ndn_addr; + addr6 = (struct in6_addr*)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { @@ -455,7 +454,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) if (tunnel->err_count > 0) { if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { tunnel->err_count--; - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); + dst_link_failure(skb); } else tunnel->err_count = 0; } @@ -472,11 +471,11 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) if (!new_skb) { ip_rt_put(rt); stats->tx_dropped++; - dev_kfree_skb(skb, FREE_WRITE); + dev_kfree_skb(skb); tunnel->recursion--; return 0; } - dev_kfree_skb(skb, FREE_WRITE); + dev_kfree_skb(skb); skb = new_skb; } @@ -517,10 +516,10 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) return 
0; tx_error_icmp: - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, dev); + dst_link_failure(skb); tx_error: stats->tx_errors++; - dev_kfree_skb(skb, FREE_WRITE); + dev_kfree_skb(skb); tunnel->recursion--; return 0; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 212bcbc3e..6fbc022e1 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -11,109 +11,12 @@ #include <net/ipv6.h> #include <net/addrconf.h> -struct ipv6_config ipv6_config = -{ - 0, /* forwarding */ - IPV6_DEFAULT_HOPLIMIT, /* hop limit */ - 1, /* accept RAs */ - 1, /* accept redirects */ - - 3, /* nd_max_mcast_solicit */ - 3, /* nd_max_ucast_solicit */ - RETRANS_TIMER, /* nd_retrans_time */ - RECHABLE_TIME, /* nd_base_reach_time */ - (5 * HZ), /* nd_delay_probe_time */ - - 1, /* autoconfiguration */ - 1, /* dad transmits */ - MAX_RTR_SOLICITATIONS, /* router solicits */ - RTR_SOLICITATION_INTERVAL, /* rtr solicit interval */ - MAX_RTR_SOLICITATION_DELAY, /* rtr solicit delay */ - - 60*HZ, /* rt cache timeout */ - 30*HZ, /* rt gc period */ -}; +extern ctl_table ipv6_route_table[]; #ifdef CONFIG_SYSCTL -int ipv6_sysctl_forwarding(ctl_table *ctl, int write, struct file * filp, - void *buffer, size_t *lenp) -{ - int val = ipv6_config.forwarding; - int retv; - - retv = proc_dointvec(ctl, write, filp, buffer, lenp); - - if (write) { - if (ipv6_config.forwarding && val == 0) { - printk(KERN_DEBUG "sysctl: IPv6 forwarding enabled\n"); - ndisc_forwarding_on(); - addrconf_forwarding_on(); - } - - if (ipv6_config.forwarding == 0 && val) - ndisc_forwarding_off(); - } - return retv; -} - ctl_table ipv6_table[] = { - {NET_IPV6_FORWARDING, "forwarding", - &ipv6_config.forwarding, sizeof(int), 0644, NULL, - &ipv6_sysctl_forwarding}, - - {NET_IPV6_HOPLIMIT, "hop_limit", - &ipv6_config.hop_limit, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_ACCEPT_RA, "accept_ra", - &ipv6_config.accept_ra, sizeof(int), 0644, NULL, - &proc_dointvec}, - - 
{NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects", - &ipv6_config.accept_redirects, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_ND_MAX_MCAST_SOLICIT, "nd_max_mcast_solicit", - &ipv6_config.nd_max_mcast_solicit, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_ND_MAX_UCAST_SOLICIT, "nd_max_ucast_solicit", - &ipv6_config.nd_max_ucast_solicit, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_ND_RETRANS_TIME, "nd_retrans_time", - &ipv6_config.nd_retrans_time, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_ND_REACHABLE_TIME, "nd_base_reachble_time", - &ipv6_config.nd_base_reachable_time, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_ND_DELAY_PROBE_TIME, "nd_delay_first_probe_time", - &ipv6_config.nd_delay_probe_time, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_AUTOCONF, "autoconf", - &ipv6_config.autoconf, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_DAD_TRANSMITS, "dad_transmits", - &ipv6_config.dad_transmits, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_RTR_SOLICITS, "router_solicitations", - &ipv6_config.rtr_solicits, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval", - &ipv6_config.rtr_solicit_interval, sizeof(int), 0644, NULL, - &proc_dointvec}, - - {NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay", - &ipv6_config.rtr_solicit_delay, sizeof(int), 0644, NULL, - &proc_dointvec}, - + {NET_IPV6_ROUTE, "route", NULL, 0, 0555, ipv6_route_table}, {0} }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b6559565b..f7a080a0d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.43 1997/10/30 23:52:34 davem Exp $ + * $Id: tcp_ipv6.c,v 1.44 1997/12/13 21:53:18 kuznet Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -577,9 +577,10 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, if (type == 
ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) { /* icmp should have updated the destination cache entry */ - dst_check(&np->dst, np->dst_cookie); + if (sk->dst_cache) + dst_check(&sk->dst_cache, np->dst_cookie); - if (np->dst == NULL) { + if (sk->dst_cache == NULL) { struct flowi fl; struct dst_entry *dst; @@ -595,10 +596,10 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, ip6_dst_store(sk, dst); } - if (np->dst->error) - sk->err_soft = np->dst->error; + if (sk->dst_cache->error) + sk->err_soft = sk->dst_cache->error; else - sk->mtu = np->dst->pmtu; + sk->mtu = sk->dst_cache->pmtu; if (sk->sock_readers) { /* remove later */ printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n"); @@ -684,7 +685,7 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) dst = ip6_route_output(sk, &fl); if (dst->error) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); dst_release(dst); return; } @@ -1062,8 +1063,8 @@ static void tcp_v6_send_reset(struct sk_buff *skb) buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); - fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->daddr; - fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->saddr; + fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr; + fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr; t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, fl.nl_u.ip6_u.daddr, @@ -1072,8 +1073,8 @@ static void tcp_v6_send_reset(struct sk_buff *skb) fl.proto = IPPROTO_TCP; fl.dev = skb->dev; - fl.uli_u.ports.dport = th->dest; - fl.uli_u.ports.sport = th->source; + fl.uli_u.ports.dport = t1->dest; + fl.uli_u.ports.sport = t1->source; ip6_xmit(NULL, buff, &fl, NULL); tcp_statistics.TcpOutSegs++; @@ -1197,22 +1198,6 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, return(0); } - /* - * Signal NDISC that the connection is making - * "forward progress" - * This is in the fast path and should be _really_ speed up! 
-Ak - */ - if (sk->state != TCP_LISTEN) { - struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - if (after(skb->seq, tp->rcv_nxt) || - after(skb->ack_seq, tp->snd_una)) { - if (np->dst) - ndisc_validate(np->dst->neighbour); - } - } - skb_set_owner_r(skb, sk); if (sk->state == TCP_ESTABLISHED) { @@ -1262,7 +1247,7 @@ discard_it: * Discard frame */ - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -1270,10 +1255,10 @@ static int tcp_v6_rebuild_header(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - if (np->dst) - dst_check(&np->dst, np->dst_cookie); + if (sk->dst_cache) + dst_check(&sk->dst_cache, np->dst_cookie); - if (np->dst == NULL) { + if (sk->dst_cache == NULL) { struct flowi fl; struct dst_entry *dst; @@ -1288,7 +1273,7 @@ static int tcp_v6_rebuild_header(struct sock *sk, struct sk_buff *skb) ip6_dst_store(sk, dst); } - if (np->dst->error) { + if (sk->dst_cache->error) { /* * lost route to destination */ @@ -1457,7 +1442,6 @@ static int tcp_v6_init_sock(struct sock *sk) static int tcp_v6_destroy_sock(struct sock *sk) { - struct ipv6_pinfo * np = &sk->net_pinfo.af_inet6; struct sk_buff *skb; tcp_clear_xmit_timers(sk); @@ -1470,21 +1454,21 @@ static int tcp_v6_destroy_sock(struct sock *sk) */ while((skb = skb_dequeue(&sk->write_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); /* * Cleans up our, hopefuly empty, out_of_order_queue */ while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); /* * Release destination entry */ - if (np->dst) - dst_release(np->dst); + dst_release(sk->dst_cache); + sk->dst_cache = NULL; return 0; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index aed22f964..b99dc19e3 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.18 1997/09/14 08:32:24 davem Exp $ + * $Id: udp.c,v 1.21 1997/12/29 19:52:52 kuznet Exp $ 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -27,6 +27,7 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/init.h> +#include <asm/uaccess.h> #include <net/sock.h> #include <net/snmp.h> @@ -282,16 +283,11 @@ int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) static void udpv6_close(struct sock *sk, unsigned long timeout) { - struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - lock_sock(sk); sk->state = TCP_CLOSE; - - if (np->dst) - dst_release(np->dst); - ipv6_sock_mc_close(sk); udp_v6_unhash(sk); + sk->dead = 1; release_sock(sk); destroy_sock(sk); } @@ -304,10 +300,8 @@ static void udpv6_close(struct sock *sk, unsigned long timeout) int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, int noblock, int flags, int *addr_len) { - int copied = 0; - int truesize; struct sk_buff *skb; - int err; + int copied, err; /* * Check any passed addresses @@ -322,16 +316,13 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, */ skb = skb_recv_datagram(sk, flags, noblock, &err); - if(skb==NULL) - return err; + if (!skb) + goto out; - truesize=ntohs(((struct udphdr *)skb->h.raw)->len) - sizeof(struct udphdr); - - copied=truesize; - - if(copied>len) { - copied=len; - msg->msg_flags|=MSG_TRUNC; + copied = ntohs(((struct udphdr *)skb->h.raw)->len) - sizeof(struct udphdr); + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; } /* @@ -341,7 +332,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); if (err) - return err; + goto out_free; sk->stamp=skb->stamp; @@ -350,7 +341,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *) msg->msg_name; - sin6->sin6_family = AF_INET6; sin6->sin6_port = skb->h.uh->source; @@ -365,9 +355,12 @@ int udpv6_recvmsg(struct 
sock *sk, struct msghdr *msg, int len, datagram_recv_ctl(sk, msg, skb); } } - - skb_free_datagram(sk, skb); - return(copied); + err = copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; } void udpv6_err(int type, int code, unsigned char *buff, __u32 info, @@ -406,14 +399,14 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) ipv6_statistics.Ip6InDiscards++; ipv6_statistics.Ip6InDelivers--; skb->sk = NULL; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } udp_stats_in6.UdpInDatagrams++; return 0; } -static int __inline__ inet6_mc_check(struct sock *sk, struct in6_addr *addr) +static __inline__ int inet6_mc_check(struct sock *sk, struct in6_addr *addr) { struct ipv6_mc_socklist *mc; @@ -461,6 +454,7 @@ static void udpv6_mcast_deliver(struct udphdr *uh, { struct sock *sk, *sk2; + SOCKHASH_LOCK(); sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr); if(sk) { @@ -469,16 +463,17 @@ static void udpv6_mcast_deliver(struct udphdr *uh, uh->dest, saddr, uh->source, daddr))) { struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); - if(sock_queue_rcv_skb(sk, buff) < 0) { + if (buff && sock_queue_rcv_skb(sk2, buff) < 0) { buff->sk = NULL; - kfree_skb(buff, FREE_READ); + kfree_skb(buff); } } } if(!sk || sock_queue_rcv_skb(sk, skb) < 0) { skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } + SOCKHASH_UNLOCK(); } int udpv6_rcv(struct sk_buff *skb, struct device *dev, @@ -504,7 +499,7 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev, if (ulen > len || len < sizeof(*uh)) { printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len); udp_stats_in6.UdpInErrors++; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return(0); } @@ -547,7 +542,7 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev, icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return(0); } @@ -562,7 +557,7 @@ int 
udpv6_rcv(struct sk_buff *skb, struct device *dev, discard: udp_stats_in6.UdpInErrors++; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return(0); } @@ -649,6 +644,16 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) int err; + /* Rough check on arithmetic overflow, + better check is made in ip6_build_xmit + + When jumbo header will be implemeted we will change it + to something sort of (len will be size_t) + ulen > SIZE_T_MAX - sizeof(struct udphdr) + */ + if (ulen < 0 || ulen > 0xFFFF - sizeof(struct udphdr)) + return -EMSGSIZE; + if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT)) return(-EINVAL); @@ -665,9 +670,12 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) udh.uh.dest = sin6->sin6_port; daddr = &sin6->sin6_addr; - if (np->dst && ipv6_addr_cmp(daddr, &np->daddr)) { - dst_release(np->dst); - np->dst = NULL; + /* BUGGGG! If route is not cloned, this check always + fails, hence dst_cache only slows down transmission --ANK + */ + if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) { + dst_release(sk->dst_cache); + sk->dst_cache = NULL; } } else { if (sk->state != TCP_ESTABLISHED) diff --git a/net/ipx/.cvsignore b/net/ipx/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/ipx/.cvsignore +++ b/net/ipx/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/ipx/Config.in b/net/ipx/Config.in new file mode 100644 index 000000000..d35afbac0 --- /dev/null +++ b/net/ipx/Config.in @@ -0,0 +1,6 @@ +# +# IPX configuration +# + +comment 'IPX options' +bool 'Full internal IPX network' CONFIG_IPX_INTERN diff --git a/net/ipx/Makefile b/net/ipx/Makefile index 0c29dc5d3..b9d337a8a 100644 --- a/net/ipx/Makefile +++ b/net/ipx/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux IPX layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here @@ -7,12 +7,14 @@ # # Note 2! 
The CFLAGS definition is now in the main makefile... +# We only get in/to here if CONFIG_IPX = 'y' or 'm' + O_TARGET := ipx.o -O_OBJS := af_ipx.o M_OBJS := $(O_TARGET) +OX_OBJS += af_ipx.o ifeq ($(CONFIG_SYSCTL),y) -O_OBJS += sysctl_net_ipx.o + O_OBJS += sysctl_net_ipx.o endif include $(TOPDIR)/Rules.make diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 2a46c5270..cf56df492 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -210,11 +210,10 @@ static void ipx_destroy_socket(struct sock *sk) ipx_remove_socket(sk); while((skb=skb_dequeue(&sk->receive_queue))!=NULL) { - kfree_skb(skb,FREE_READ); + kfree_skb(skb); } sk_free(sk); - MOD_DEC_USE_COUNT; } /* The following code is used to support IPX Interfaces (IPXITF). An @@ -378,11 +377,7 @@ static int ipxitf_def_skb_handler(struct sock *sock, struct sk_buff *skb) if((retval = sock_queue_rcv_skb(sock, skb))<0) { - /* - * skb->sk is NULL here, so FREE_WRITE does not hurt - * the sending socket. - */ - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); } return retval; } @@ -415,14 +410,8 @@ static int ipxitf_demux_socket(ipx_interface *intrfc, struct sk_buff *skb, int c if (copy != 0) { skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1 != NULL) - { - skb1->arp = 1; - } - else - { + if (skb1 == NULL) return -ENOMEM; - } } else { @@ -445,10 +434,9 @@ static int ipxitf_demux_socket(ipx_interface *intrfc, struct sk_buff *skb, int c if (copy == 0) { /* skb was solely for us, and we did not make a copy, - * so free it. FREE_WRITE does not hurt, because - * skb->sk is NULL here. + * so free it. 
*/ - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } return 0; } @@ -500,7 +488,7 @@ static int ipxitf_demux_socket(ipx_interface *intrfc, struct sk_buff *skb, int c if (sock1 == NULL && sock2 == NULL) { if (!copy) - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); return 0; } @@ -515,8 +503,6 @@ static int ipxitf_demux_socket(ipx_interface *intrfc, struct sk_buff *skb, int c if (copy) { skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - skb1->arp=1; } else { @@ -533,8 +519,6 @@ static int ipxitf_demux_socket(ipx_interface *intrfc, struct sk_buff *skb, int c if (sock1 && sock2) { skb2 = skb_clone(skb1, GFP_ATOMIC); - if (skb2 != NULL) - skb2->arp = 1; } else skb2 = skb1; @@ -561,7 +545,6 @@ static struct sk_buff *ipxitf_adjust_skbuff(ipx_interface *intrfc, struct sk_buf /* Hopefully, most cases */ if (in_offset >= out_offset) { - skb->arp = 1; return skb; } @@ -572,11 +555,10 @@ static struct sk_buff *ipxitf_adjust_skbuff(ipx_interface *intrfc, struct sk_buf skb_reserve(skb2,out_offset); skb2->nh.raw= skb2->h.raw=skb_put(skb2,skb->len); - skb2->arp=1; memcpy(skb2->h.raw, skb->h.raw, skb->len); } - kfree_skb(skb, FREE_WRITE); - return skb2; + kfree_skb(skb); + return NULL; } static int ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) @@ -648,15 +630,7 @@ static int ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) if (!send_to_wire) { - /* - * We do a FREE_WRITE here because this indicates how - * to treat the socket with which the packet is - * associated. If this packet is associated with a - * socket at all, it must be the originator of the - * packet. Routed packets will have no socket associated - * with them. 
- */ - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); return 0; } @@ -707,7 +681,6 @@ static int ipxitf_add_local_route(ipx_interface *intrfc) static const char * ipx_frame_name(unsigned short); static const char * ipx_device_name(ipx_interface *); -static int ipxrtr_route_skb(struct sk_buff *); static int ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) { @@ -720,7 +693,7 @@ static int ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) if (call_in_firewall(PF_IPX, skb->dev, ipx, NULL, &skb)!=FW_ACCEPT) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -813,20 +786,20 @@ static int ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) */ if (call_fw_firewall(PF_IPX, skb->dev, ipx, NULL, &skb)!=FW_ACCEPT) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } /* We only route point-to-point packets. */ if (skb->pkt_type == PACKET_HOST) { - skb=skb_unshare(skb, GFP_ATOMIC, FREE_READ); + skb=skb_unshare(skb, GFP_ATOMIC); if(skb) return ipxrtr_route_skb(skb); else return 0; } - kfree_skb(skb,FREE_READ); + kfree_skb(skb); return 0; } @@ -838,7 +811,7 @@ static int ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) } /* we couldn't pawn it off so unload it */ - kfree_skb(skb,FREE_READ); + kfree_skb(skb); return 0; } @@ -1025,7 +998,8 @@ static int ipxitf_delete(ipx_interface_definition *idef) return -EPROTONOSUPPORT; dev=dev_get(idef->ipx_device); - if(dev==NULL) return -ENODEV; + if (dev==NULL) + return -ENODEV; intrfc = ipxitf_find_using_phys(dev, dlink_type); if (intrfc != NULL) { @@ -1134,9 +1108,9 @@ static int ipxitf_ioctl_real(unsigned int cmd, void *arg) sipx->sipx_family=AF_IPX; sipx->sipx_network=ipxif->if_netnum; memcpy(sipx->sipx_node, ipxif->if_node, sizeof(sipx->sipx_node)); - err = copy_to_user(arg,&ifr,sizeof(ifr)); - if (err) - return -EFAULT; + err = -EFAULT; + if (!copy_to_user(arg, &ifr, sizeof(ifr))) + err = 0; return err; } case SIOCAIPXITFCRT: @@ -1360,7 +1334,6 @@ static int ipxrtr_route_packet(struct sock *sk, 
struct sockaddr_ipx *usipx, stru return err; skb_reserve(skb,ipx_offset); - skb->arp=1; skb->sk=sk; /* Fill in IPX header */ @@ -1394,7 +1367,7 @@ static int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, stru err = memcpy_fromiovec(skb_put(skb,len),iov,len); if (err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -EFAULT; } @@ -1409,7 +1382,7 @@ static int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, stru if(call_out_firewall(PF_IPX, skb->dev, ipx, NULL, &skb)!=FW_ACCEPT) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -EPERM; } @@ -1417,7 +1390,7 @@ static int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, stru rt->ir_router_node : ipx->ipx_dest.node); } -static int ipxrtr_route_skb(struct sk_buff *skb) +int ipxrtr_route_skb(struct sk_buff *skb) { struct ipxhdr *ipx = skb->nh.ipxh; ipx_route *r; @@ -1427,7 +1400,7 @@ static int ipxrtr_route_skb(struct sk_buff *skb) if (r == NULL) { /* no known route */ - kfree_skb(skb,FREE_READ); + kfree_skb(skb); return 0; } i = r->ir_intrfc; @@ -1746,8 +1719,11 @@ static int ipx_create(struct socket *sock, int protocol) switch(sock->type) { case SOCK_DGRAM: - sock->ops = &ipx_dgram_ops; - break; + sock->ops = &ipx_dgram_ops; + break; + case SOCK_STREAM: /* Allow higher levels to piggyback */ + case SOCK_SEQPACKET: + printk(KERN_CRIT "IPX: _create-ing non_DGRAM socket\n"); default: sk_free(sk); return(-ESOCKTNOSUPPORT); @@ -1770,6 +1746,9 @@ static int ipx_release(struct socket *sock, struct socket *peer) sk->dead=1; sock->sk=NULL; ipx_destroy_socket(sk); + if ( sock->type == SOCK_DGRAM ) { + MOD_DEC_USE_COUNT; + } return(0); } @@ -1845,7 +1824,9 @@ static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk->protinfo.af_ipx.node, sk->protinfo.af_ipx.port) != NULL) { - SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n", (int)addr->sipx_port); + SOCK_DEBUG(sk, + "IPX: bind failed because port %X in use.\n", + 
ntohs((int)addr->sipx_port)); return -EADDRINUSE; } } @@ -1860,7 +1841,9 @@ static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) IPX_NODE_LEN); if(ipxitf_find_socket(intrfc, addr->sipx_port)!=NULL) { - SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n", (int)addr->sipx_port); + SOCK_DEBUG(sk, + "IPX: bind failed because port %X in use.\n", + ntohs((int)addr->sipx_port)); return -EADDRINUSE; } } @@ -1871,7 +1854,8 @@ static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) an interface routed to IPX with the ipx routing ioctl() */ if(ipxitf_find_socket(intrfc, addr->sipx_port)!=NULL) { - SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n", (int)addr->sipx_port); + SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n", + ntohs((int)addr->sipx_port)); return -EADDRINUSE; } @@ -1879,7 +1863,8 @@ static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ipxitf_insert_socket(intrfc, sk); sk->zapped=0; - SOCK_DEBUG(sk, "IPX: socket is bound.\n"); + SOCK_DEBUG(sk, "IPX: bound socket 0x%04X.\n", ntohs(addr->sipx_port) ); + return 0; } @@ -1920,8 +1905,10 @@ static int ipx_connect(struct socket *sock, struct sockaddr *uaddr, memcpy(sk->protinfo.af_ipx.dest_addr.node, addr->sipx_node,IPX_NODE_LEN); sk->protinfo.af_ipx.type=addr->sipx_type; - sock->state = SS_CONNECTED; - sk->state=TCP_ESTABLISHED; + if(sock->type == SOCK_DGRAM ) { + sock->state = SS_CONNECTED; + sk->state=TCP_ESTABLISHED; + } return 0; } @@ -2052,7 +2039,7 @@ int ipx_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) /* Too small? 
*/ if(ntohs(ipx->ipx_pktsize)<sizeof(struct ipxhdr)) { - kfree_skb(skb,FREE_READ); + kfree_skb(skb); return 0; } @@ -2060,7 +2047,7 @@ int ipx_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { if(ipx_set_checksum(ipx, ntohs(ipx->ipx_pktsize))!=ipx->ipx_checksum) { - kfree_skb(skb,FREE_READ); + kfree_skb(skb); return 0; } } @@ -2077,7 +2064,7 @@ int ipx_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) if (intrfc == NULL) { /* Not one of ours */ - kfree_skb(skb,FREE_READ); + kfree_skb(skb); return 0; } } @@ -2148,32 +2135,28 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, int size, struct sock *sk=sock->sk; struct sockaddr_ipx *sipx=(struct sockaddr_ipx *)msg->msg_name; struct ipxhdr *ipx = NULL; - int copied = 0; - int truesize; struct sk_buff *skb; - int err; + int copied, err; if (sk->zapped) return -ENOTCONN; skb=skb_recv_datagram(sk,flags&~MSG_DONTWAIT,flags&MSG_DONTWAIT,&err); - if(skb==NULL) - return err; + if (!skb) + goto out; ipx = skb->nh.ipxh; - truesize=ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr); - - copied = truesize; + copied = ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr); if(copied > size) { copied=size; msg->msg_flags|=MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb,sizeof(struct ipxhdr),msg->msg_iov,copied); - + err = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov, + copied); if (err) - return err; + goto out_free; msg->msg_namelen = sizeof(*sipx); @@ -2185,9 +2168,12 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, int size, sipx->sipx_network=ipx->ipx_source.net; sipx->sipx_type = ipx->ipx_type; } - skb_free_datagram(sk, skb); + err = copied; - return(copied); +out_free: + skb_free_datagram(sk, skb); +out: + return err; } /* @@ -2242,11 +2228,12 @@ static int ipx_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) { if(sk->stamp.tv_sec==0) return -ENOENT; - ret = copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval)); - if (ret) 
- ret = -EFAULT; + ret = -EFAULT; + if (!copy_to_user((void *)arg, &sk->stamp, + sizeof(struct timeval))) + ret = 0; } - return 0; + return ret; } case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: @@ -2372,6 +2359,19 @@ ipx_proto_init(struct net_proto *pro) printk(KERN_INFO "IPX Portions Copyright (c) 1995 Caldera, Inc.\n"); } +/* Higher layers need this info to prep tx pkts */ +int ipx_if_offset(unsigned long ipx_net_number) +{ + ipx_route *rt = NULL; + + rt = ipxrtr_lookup(ipx_net_number); + return ( rt ? rt->ir_intrfc->if_ipx_offset : -ENETUNREACH ); +} + +/* Export symbols for higher layers */ +EXPORT_SYMBOL(ipxrtr_route_skb); +EXPORT_SYMBOL(ipx_if_offset); + #ifdef MODULE /* Note on MOD_{INC,DEC}_USE_COUNT: * @@ -2426,8 +2426,6 @@ __initfunc(static void ipx_proto_finito(void)) return; } -EXPORT_NO_SYMBOLS; - int init_module(void) { ipx_proto_init(NULL); diff --git a/net/lapb/.cvsignore b/net/lapb/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/lapb/.cvsignore +++ b/net/lapb/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c index 126b93673..4e7a9ca4d 100644 --- a/net/lapb/lapb_in.c +++ b/net/lapb/lapb_in.c @@ -114,7 +114,7 @@ static void lapb_state0_machine(lapb_cb *lapb, struct sk_buff *skb, struct lapb_ break; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* @@ -206,7 +206,7 @@ static void lapb_state1_machine(lapb_cb *lapb, struct sk_buff *skb, struct lapb_ break; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* @@ -278,7 +278,7 @@ static void lapb_state2_machine(lapb_cb *lapb, struct sk_buff *skb, struct lapb_ break; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* @@ -523,7 +523,7 @@ static void lapb_state3_machine(lapb_cb *lapb, struct sk_buff *skb, struct lapb_ } if (!queued) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* @@ -595,7 +595,7 @@ static void lapb_state4_machine(lapb_cb *lapb, struct sk_buff *skb, struct lapb_ break; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* diff --git 
a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c index 9e1cdf475..8c8b21c10 100644 --- a/net/lapb/lapb_out.c +++ b/net/lapb/lapb_out.c @@ -158,7 +158,7 @@ void lapb_transmit_buffer(lapb_cb *lapb, struct sk_buff *skb, int type) #endif if (!lapb_data_transmit(lapb, skb)) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } void lapb_establish_data_link(lapb_cb *lapb) diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c index 3f7f0a84e..611eba6f1 100644 --- a/net/lapb/lapb_subr.c +++ b/net/lapb/lapb_subr.c @@ -43,10 +43,10 @@ void lapb_clear_queues(lapb_cb *lapb) struct sk_buff *skb; while ((skb = skb_dequeue(&lapb->write_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&lapb->ack_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } /* @@ -67,7 +67,7 @@ void lapb_frames_acked(lapb_cb *lapb, unsigned short nr) if (lapb->va != nr) { while (skb_peek(&lapb->ack_queue) != NULL && lapb->va != nr) { skb = skb_dequeue(&lapb->ack_queue); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); lapb->va = (lapb->va + 1) % modulus; } } diff --git a/net/netbeui/af_netbeui.c b/net/netbeui/af_netbeui.c index 9b1444997..85bd8f4d1 100644 --- a/net/netbeui/af_netbeui.c +++ b/net/netbeui/af_netbeui.c @@ -414,7 +414,7 @@ static int netbeui_sendmsg(struct socket *sock, struct msghdr *msg, int len, int err = memcpy_fromiovec(skb_put(skb,len),msg->msg_iov,len); if (err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -EFAULT; } @@ -422,14 +422,14 @@ static int netbeui_sendmsg(struct socket *sock, struct msghdr *msg, int len, int if(call_out_firewall(AF_NETBEUI, skb->dev, nbp, NULL)!=FW_ACCEPT) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -EPERM; } #endif if(nb_send_low(dev,skb,&usat->sat_addr, NULL)==-1) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); SOCK_DEBUG(sk, "SK %p: Done write (%d).\n", sk, len); return len; } diff --git a/net/netbeui/netbeui_llc.c b/net/netbeui/netbeui_llc.c index 198fe1ce1..29edc5acf 100644 --- 
a/net/netbeui/netbeui_llc.c +++ b/net/netbeui/netbeui_llc.c @@ -163,7 +163,7 @@ static void netbeui_event(llcptr llc) /* We ignore TST, XID, FRMR stuff */ /* FIXME: We need to free frames here once I fix the callback! */ if(llc->inc_skb) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* diff --git a/net/netbeui/netbeui_name.c b/net/netbeui/netbeui_name.c index d47fddd1b..c5a579597 100644 --- a/net/netbeui/netbeui_name.c +++ b/net/netbeui/netbeui_name.c @@ -58,7 +58,9 @@ static void nb_defend(struct device *dev, const char *name) if(nskb==NULL) return; /* Build a name defence packet */ - dev_queue_xmit(nskb,dev,SOPRI_INTERACTIVE); + nskb->dev = dev; + nskb->priority = TC_PRIO_CONTROL; + dev_queue_xmit(nskb); } void netbeui_heard_name(struct device *dev, struct sk_buff *skb) @@ -83,7 +85,7 @@ void netbeui_heard_name(struct device *dev, struct sk_buff *skb) nb_complete(nb,skb); } } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -137,7 +139,7 @@ void netbeui_name_defence(struct dev *dev, struct sk_buff *skb) } } } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } void netbeui_name_query(struct dev *dev, struct sk_buff *skb) @@ -151,9 +153,11 @@ void netbeui_name_query(struct dev *dev, struct sk_buff *skb) if(nskb!=NULL) { /* Build a name reply packet */ - dev_queue_xmit(nskb,dev,SOPRI_INTERACTIVE); + nskb->dev = dev; + nskb->priority = TC_PRIO_CONTROL; + dev_queue_xmit(nskb); } } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } diff --git a/net/netlink/.cvsignore b/net/netlink/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/netlink/.cvsignore +++ b/net/netlink/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 81c53edda..3f02f4c3c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -359,6 +359,7 @@ retry: #ifdef NL_EMULATE_DEV if (sk->protinfo.af_netlink.handler) { + skb_orphan(skb); len = sk->protinfo.af_netlink.handler(protocol, skb); netlink_unlock(sk); return 
len; @@ -370,7 +371,7 @@ retry: if (nonblock) { sti(); netlink_unlock(sk); - kfree_skb(skb, 0); + kfree_skb(skb); return -EAGAIN; } interruptible_sleep_on(sk->sleep); @@ -378,7 +379,7 @@ retry: sti(); if (signal_pending(current)) { - kfree_skb(skb, 0); + kfree_skb(skb); return -ERESTARTSYS; } goto retry; @@ -392,7 +393,7 @@ Nprintk("unicast_deliver %d\n", skb->len); netlink_unlock(sk); return len; } - kfree_skb(skb, 0); + kfree_skb(skb); return -ECONNREFUSED; } @@ -400,6 +401,7 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff { #ifdef NL_EMULATE_DEV if (sk->protinfo.af_netlink.handler) { + skb_orphan(skb); sk->protinfo.af_netlink.handler(sk->protocol, skb); return 0; } else @@ -466,8 +468,8 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, pid_t pid, netlink_unlock_table(protocol, allocation == GFP_KERNEL); if (skb2) - kfree_skb(skb2, 0); - kfree_skb(skb, 0); + kfree_skb(skb2); + kfree_skb(skb); } void netlink_set_err(struct sock *ssk, pid_t pid, unsigned group, int code) @@ -630,7 +632,7 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) static void netlink_destroy_callback(struct netlink_callback *cb) { if (cb->skb) - kfree_skb(cb->skb, 0); + kfree_skb(cb->skb); kfree(cb); } @@ -758,16 +760,13 @@ void netlink_detach(int unit) int netlink_post(int unit, struct sk_buff *skb) { if (netlink_kernel[unit]) { + memset(skb->cb, 0, sizeof(skb->cb)); netlink_broadcast(netlink_kernel[unit]->sk, skb, 0, ~0, GFP_ATOMIC); return 0; } return -EUNATCH;; } -EXPORT_SYMBOL(netlink_attach); -EXPORT_SYMBOL(netlink_detach); -EXPORT_SYMBOL(netlink_post); - #endif #if 0 diff --git a/net/netlink/netlink_dev.c b/net/netlink/netlink_dev.c index cbd48c1c0..2a1dd160c 100644 --- a/net/netlink/netlink_dev.c +++ b/net/netlink/netlink_dev.c @@ -42,7 +42,7 @@ static unsigned int netlink_poll(struct file *file, poll_table * wait) if (sock->ops->poll==NULL) return 0; - return sock->ops->poll(sock, wait); + return 
sock->ops->poll(file, sock, wait); } /* diff --git a/net/netrom/.cvsignore b/net/netrom/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/netrom/.cvsignore +++ b/net/netrom/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 8b51f7120..a84d1fd53 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -300,7 +300,7 @@ void nr_destroy_socket(struct sock *sk) /* Not static as it's used by the timer skb->sk->protinfo.nr->state = NR_STATE_0; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } if (atomic_read(&sk->wmem_alloc) != 0 || atomic_read(&sk->rmem_alloc) != 0) { @@ -763,7 +763,7 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags) /* Now attach up the new socket */ skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); sk->ack_backlog--; newsock->sk = newsk; @@ -999,7 +999,7 @@ static int nr_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct s SOCK_DEBUG(sk, "NET/ROM: Transmitting buffer\n"); if (sk->state != TCP_ESTABLISHED) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -ENOTCONN; } diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 380ec8ecc..b04adbcaa 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -121,7 +121,10 @@ static int nr_rebuild_header(struct sk_buff *skb) unsigned char *bp = skb->data; if (arp_find(bp + 7, skb)) { - kfree_skb(skb, FREE_WRITE); +#if 0 + /* BUGGGG! If arp_find returned 1, skb does not exist. 
--ANK*/ + kfree_skb(skb); +#endif return 1; } @@ -135,17 +138,17 @@ static int nr_rebuild_header(struct sk_buff *skb) bp[6] |= AX25_SSSID_SPARE; if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 1; } if (skb->sk != NULL) skb_set_owner_w(skbn, skb->sk); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if (!nr_route_frame(skbn, NULL)) { - kfree_skb(skbn, FREE_WRITE); + kfree_skb(skbn); stats->tx_errors++; } @@ -216,7 +219,7 @@ static int nr_xmit(struct sk_buff *skb, struct device *dev) sti(); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); stats->tx_errors++; diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index a0d3148c2..ac32cd704 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -74,7 +74,7 @@ static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) while ((skbo = skb_dequeue(&sk->protinfo.nr->frag_queue)) != NULL) { memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); - kfree_skb(skbo, FREE_READ); + kfree_skb(skbo); } sk->protinfo.nr->fraglen = 0; @@ -246,7 +246,7 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype } else if (nr_in_rx_window(sk, ns)) { skb_queue_tail(&temp_queue, skbn); } else { - kfree_skb(skbn, FREE_READ); + kfree_skb(skbn); } } while ((skbn = skb_dequeue(&temp_queue)) != NULL) { diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c index 4c3eb61d8..93da60adb 100644 --- a/net/netrom/nr_out.c +++ b/net/netrom/nr_out.c @@ -79,7 +79,7 @@ void nr_output(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->write_queue, skbn); /* Throw it on the queue */ } - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } else { skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ } @@ -216,7 +216,7 @@ void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb) *dptr++ = sysctl_netrom_network_ttl_initialiser; if (!nr_route_frame(skb, NULL)) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); nr_disconnect(sk, ENETUNREACH); 
} } diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index d31141876..7ae69fe07 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -47,16 +47,16 @@ void nr_clear_queues(struct sock *sk) struct sk_buff *skb; while ((skb = skb_dequeue(&sk->write_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&sk->protinfo.nr->ack_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&sk->protinfo.nr->reseq_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); while ((skb = skb_dequeue(&sk->protinfo.nr->frag_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } /* @@ -74,7 +74,7 @@ void nr_frames_acked(struct sock *sk, unsigned short nr) if (sk->protinfo.nr->va != nr) { while (skb_peek(&sk->protinfo.nr->ack_queue) != NULL && sk->protinfo.nr->va != nr) { skb = skb_dequeue(&sk->protinfo.nr->ack_queue); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); sk->protinfo.nr->va = (sk->protinfo.nr->va + 1) % NR_MODULUS; } } @@ -266,7 +266,7 @@ void nr_transmit_dm(struct sk_buff *skb) *dptr++ = 0; if (!nr_route_frame(skbn, NULL)) - kfree_skb(skbn, FREE_WRITE); + kfree_skb(skbn); } void nr_disconnect(struct sock *sk, int reason) diff --git a/net/netsyms.c b/net/netsyms.c index dfc3c9db1..b7809863b 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -32,6 +32,7 @@ #include <net/pkt_sched.h> #include <linux/inet.h> #include <linux/mroute.h> +#include <linux/igmp.h> extern struct net_proto_family inet_family_ops; @@ -117,24 +118,45 @@ EXPORT_SYMBOL(skb_realloc_headroom); EXPORT_SYMBOL(datagram_poll); EXPORT_SYMBOL(put_cmsg); EXPORT_SYMBOL(net_families); +EXPORT_SYMBOL(sock_kmalloc); +EXPORT_SYMBOL(sock_kfree_s); + +#ifdef CONFIG_FILTER +EXPORT_SYMBOL(sk_run_filter); +#endif EXPORT_SYMBOL(neigh_table_init); -/* Declared in <net/neighbour.h> but not defined? 
- EXPORT_SYMBOL(neigh_table_destroy); - EXPORT_SYMBOL(neigh_table_run_bh); -*/ -EXPORT_SYMBOL(neigh_alloc); -EXPORT_SYMBOL(neigh_table_ins); -EXPORT_SYMBOL(neigh_queue_ins); -EXPORT_SYMBOL(neigh_unlink); -EXPORT_SYMBOL(neigh_lookup); -EXPORT_SYMBOL(ntbl_walk_table); -EXPORT_SYMBOL(neigh_tbl_run_bh); +EXPORT_SYMBOL(neigh_table_clear); +EXPORT_SYMBOL(__neigh_lookup); +EXPORT_SYMBOL(neigh_resolve_output); +EXPORT_SYMBOL(neigh_connected_output); +EXPORT_SYMBOL(neigh_update); +EXPORT_SYMBOL(__neigh_event_send); +EXPORT_SYMBOL(neigh_event_ns); +EXPORT_SYMBOL(neigh_ifdown); +#ifdef CONFIG_ARPD +EXPORT_SYMBOL(neigh_app_ns); +#endif +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(neigh_sysctl_register); +#endif +EXPORT_SYMBOL(pneigh_lookup); +EXPORT_SYMBOL(pneigh_enqueue); +EXPORT_SYMBOL(neigh_destroy); +EXPORT_SYMBOL(neigh_parms_alloc); +EXPORT_SYMBOL(neigh_parms_release); +EXPORT_SYMBOL(neigh_rand_reach_time); /* dst_entry */ EXPORT_SYMBOL(dst_alloc); EXPORT_SYMBOL(__dst_free); EXPORT_SYMBOL(dst_total); +EXPORT_SYMBOL(dst_destroy); + +/* misc. 
support routines */ +EXPORT_SYMBOL(net_ratelimit); +EXPORT_SYMBOL(net_random); +EXPORT_SYMBOL(net_srandom); /* Needed by smbfs.o */ EXPORT_SYMBOL(__scm_destroy); @@ -176,18 +198,25 @@ EXPORT_SYMBOL(ip_route_output); EXPORT_SYMBOL(icmp_send); EXPORT_SYMBOL(ip_options_compile); EXPORT_SYMBOL(arp_send); +#ifdef CONFIG_SHAPER_MODULE +EXPORT_SYMBOL(arp_broken_ops); +#endif EXPORT_SYMBOL(ip_id_count); EXPORT_SYMBOL(ip_send_check); EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(inet_family_ops); EXPORT_SYMBOL(in_aton); -EXPORT_SYMBOL(in_ntoa); -EXPORT_SYMBOL(net_ratelimit); +EXPORT_SYMBOL(ip_mc_inc_group); +EXPORT_SYMBOL(ip_mc_dec_group); +EXPORT_SYMBOL(__ip_finish_output); +EXPORT_SYMBOL(inet_dgram_ops); + +/* needed for ip_gre -cw */ +EXPORT_SYMBOL(ip_statistics); #ifdef CONFIG_IPV6_MODULE /* inet functions common to v4 and v6 */ EXPORT_SYMBOL(inet_stream_ops); -EXPORT_SYMBOL(inet_dgram_ops); EXPORT_SYMBOL(inet_release); EXPORT_SYMBOL(inet_stream_connect); EXPORT_SYMBOL(inet_dgram_connect); @@ -263,17 +292,37 @@ EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(xrlim_allow); #endif +#ifdef CONFIG_NETLINK +EXPORT_SYMBOL(netlink_set_err); +EXPORT_SYMBOL(netlink_broadcast); +EXPORT_SYMBOL(netlink_unicast); +EXPORT_SYMBOL(netlink_kernel_create); +EXPORT_SYMBOL(netlink_dump_start); +EXPORT_SYMBOL(netlink_ack); +#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) +EXPORT_SYMBOL(netlink_attach); +EXPORT_SYMBOL(netlink_detach); +EXPORT_SYMBOL(netlink_post); +#endif +#endif + +#ifdef CONFIG_RTNETLINK +EXPORT_SYMBOL(rtnetlink_links); +EXPORT_SYMBOL(__rta_fill); +EXPORT_SYMBOL(rtnetlink_dump_ifinfo); +EXPORT_SYMBOL(rtnl_wlockct); +EXPORT_SYMBOL(rtnl); +EXPORT_SYMBOL(neigh_delete); +EXPORT_SYMBOL(neigh_add); +EXPORT_SYMBOL(neigh_dump_info); +#endif + #ifdef CONFIG_PACKET_MODULE EXPORT_SYMBOL(dev_set_allmulti); EXPORT_SYMBOL(dev_set_promiscuity); -EXPORT_SYMBOL(dev_mc_delete); EXPORT_SYMBOL(sklist_remove_socket); EXPORT_SYMBOL(rtnl_wait); EXPORT_SYMBOL(rtnl_rlockct); 
-#ifdef CONFIG_RTNETLINK -EXPORT_SYMBOL(rtnl); -EXPORT_SYMBOL(rtnl_wlockct); -#endif #endif #if defined(CONFIG_IPV6_MODULE) || defined(CONFIG_PACKET_MODULE) @@ -333,7 +382,6 @@ EXPORT_SYMBOL(alloc_skb); EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(skb_clone); EXPORT_SYMBOL(skb_copy); -EXPORT_SYMBOL(dev_alloc_skb); EXPORT_SYMBOL(netif_rx); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_remove_pack); @@ -342,6 +390,15 @@ EXPORT_SYMBOL(dev_alloc); EXPORT_SYMBOL(dev_alloc_name); EXPORT_SYMBOL(dev_ioctl); EXPORT_SYMBOL(dev_queue_xmit); +EXPORT_SYMBOL(netdev_dropping); +#ifdef CONFIG_NET_FASTROUTE +EXPORT_SYMBOL(dev_fastroute_stat); +#endif +#ifdef CONFIG_NET_HW_FLOWCONTROL +EXPORT_SYMBOL(netdev_register_fc); +EXPORT_SYMBOL(netdev_unregister_fc); +EXPORT_SYMBOL(netdev_fc_xoff); +#endif #ifdef CONFIG_IP_ACCT EXPORT_SYMBOL(ip_acct_output); #endif @@ -349,12 +406,12 @@ EXPORT_SYMBOL(dev_base); EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_mc_add); EXPORT_SYMBOL(arp_find); -EXPORT_SYMBOL(arp_find_1); EXPORT_SYMBOL(n_tty_ioctl); EXPORT_SYMBOL(tty_register_ldisc); EXPORT_SYMBOL(kill_fasync); EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(arp_rcv); +EXPORT_SYMBOL(dev_mc_delete); EXPORT_SYMBOL(rtnl_lock); EXPORT_SYMBOL(rtnl_unlock); @@ -380,4 +437,6 @@ EXPORT_SYMBOL(register_qdisc); EXPORT_SYMBOL(unregister_qdisc); EXPORT_SYMBOL(noop_qdisc); +EXPORT_SYMBOL(register_gifconf); + #endif /* CONFIG_NET */ diff --git a/net/packet/.cvsignore b/net/packet/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/packet/.cvsignore +++ b/net/packet/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ff7fef131..a098f59b9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -66,8 +66,16 @@ #include <linux/module.h> #include <linux/init.h> -#if defined(CONFIG_DLCI) || defined(CONFIG_DLCI_MODULE) -#include <linux/if_frad.h> +#ifdef CONFIG_INET +#include <net/inet_common.h> +#endif + +#ifdef CONFIG_BRIDGE +#include <net/br.h> +#endif + 
+#ifdef CONFIG_DLCI +extern int dlci_ioctl(unsigned int, void*); #endif /* @@ -211,6 +219,11 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct device *dev, struct pack * so that this procedure is noop. */ + if (skb->pkt_type == PACKET_LOOPBACK) { + kfree_skb(skb); + return 0; + } + skb_push(skb, skb->data-skb->mac.raw); /* @@ -228,7 +241,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct device *dev, struct pack if (sock_queue_rcv_skb(sk,skb)<0) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -318,16 +331,14 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, int len, * notable one here. This should really be fixed at the driver level. */ skb_reserve(skb,(dev->hard_header_len+15)&~15); - skb->mac.raw = skb->nh.raw = skb->data; + skb->nh.raw = skb->data; /* Try to align data part correctly */ if (dev->hard_header) { skb->data -= dev->hard_header_len; skb->tail -= dev->hard_header_len; - skb->mac.raw = skb->data; } err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); - skb->arp = 1; /* No ARP needs doing on this (complete) frame */ skb->protocol = proto; skb->dev = dev; skb->priority = sk->priority; @@ -351,7 +362,7 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, int len, if (err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return err; } @@ -372,9 +383,10 @@ static int packet_rcv(struct sk_buff *skb, struct device *dev, struct packet_ty sk = (struct sock *) pt->data; - /* - * The SOCK_PACKET socket receives _all_ frames. 
- */ + if (skb->pkt_type == PACKET_LOOPBACK) { + kfree_skb(skb); + return 0; + } skb->dev = dev; @@ -411,7 +423,7 @@ static int packet_rcv(struct sk_buff *skb, struct device *dev, struct packet_ty if (sock_queue_rcv_skb(sk,skb)<0) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } return(0); @@ -469,18 +481,17 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, int len, } skb_reserve(skb, (dev->hard_header_len+15)&~15); - skb->mac.raw = skb->nh.raw = skb->data; + skb->nh.raw = skb->data; if (dev->hard_header) { if (dev->hard_header(skb, dev, ntohs(proto), saddr ? saddr->sll_addr : NULL, NULL, len) < 0 && sock->type == SOCK_DGRAM) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); dev_unlock_list(); return -EINVAL; } - skb->mac.raw = skb->data; if (sock->type != SOCK_DGRAM) { skb->tail = skb->data; skb->len = 0; @@ -488,7 +499,6 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, int len, } err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); - skb->arp = 1; /* No ARP needs doing on this (complete) frame */ skb->protocol = proto; skb->dev = dev; skb->priority = sk->priority; @@ -506,7 +516,7 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, int len, } if (err) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return err; } @@ -575,7 +585,7 @@ static int packet_release(struct socket *sock, struct socket *peersock) /* Purge queues */ while ((skb=skb_dequeue(&sk->receive_queue))!=NULL) - kfree_skb(skb,FREE_READ); + kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) || atomic_read(&sk->wmem_alloc)) { sk->timer.data=(unsigned long)sk; @@ -768,9 +778,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, int flags, struct scm_cookie *scm) { struct sock *sk = sock->sk; - int copied=0; struct sk_buff *skb; - int err; + int copied, err; #if 0 /* What error should we return now? EUNATTACH? 
*/ @@ -806,7 +815,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, */ if(skb==NULL) - return err; + goto out; /* * You lose any data beyond the buffer you gave. If it worries a @@ -814,7 +823,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, */ copied = skb->len; - if(copied>len) + if (copied > len) { copied=len; msg->msg_flags|=MSG_TRUNC; @@ -823,9 +832,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, /* We can't use skb_copy_datagram here */ err = memcpy_toiovec(msg->msg_iov, skb->data, copied); if (err) - { - return -EFAULT; - } + goto out_free; sk->stamp=skb->stamp; @@ -833,13 +840,15 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, memcpy(msg->msg_name, skb->cb, msg->msg_namelen); /* - * Free or return the buffer as appropriate. Again this hides all the - * races and re-entrancy issues from us. + * Free or return the buffer as appropriate. Again this + * hides all the races and re-entrancy issues from us. 
*/ + err = copied; +out_free: skb_free_datagram(sk, skb); - - return(copied); +out: + return err; } #ifdef CONFIG_SOCK_PACKET @@ -1107,7 +1116,9 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg err = -EFAULT; return err; case SIOCGIFFLAGS: +#ifndef CONFIG_INET case SIOCSIFFLAGS: +#endif case SIOCGIFCONF: case SIOCGIFMETRIC: case SIOCSIFMETRIC: @@ -1136,24 +1147,29 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg return -ENOPKG; #endif +#ifdef CONFIG_INET + case SIOCADDRT: + case SIOCDELRT: + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + case SIOCDRARP: + case SIOCGRARP: + case SIOCSRARP: + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFFLAGS: case SIOCADDDLCI: case SIOCDELDLCI: -#ifdef CONFIG_DLCI - return(dlci_ioctl(cmd, (void *) arg)); -#endif - -#ifdef CONFIG_DLCI_MODULE - -#ifdef CONFIG_KERNELD - if (dlci_ioctl_hook == NULL) - request_module("dlci"); + return inet_dgram_ops.ioctl(sock, cmd, arg); #endif - if (dlci_ioctl_hook) - return((*dlci_ioctl_hook)(cmd, (void *) arg)); -#endif - return -ENOPKG; - default: if ((cmd >= SIOCDEVPRIVATE) && (cmd <= (SIOCDEVPRIVATE + 15))) diff --git a/net/rose/.cvsignore b/net/rose/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/rose/.cvsignore +++ b/net/rose/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 5ae64334d..eeb396350 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -378,7 +378,7 @@ void rose_destroy_socket(struct sock *sk) /* Not static as it's used by the time skb->sk->protinfo.rose->state = ROSE_STATE_0; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } if (atomic_read(&sk->wmem_alloc) != 0 || atomic_read(&sk->rmem_alloc) != 0) { @@ -851,7 +851,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags) /* 
Now attach up the new socket */ skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); sk->ack_backlog--; newsock->sk = newsk; @@ -1064,7 +1064,7 @@ static int rose_sendmsg(struct socket *sock, struct msghdr *msg, int len, SOCK_DEBUG(sk, "ROSE: Transmitting buffer\n"); if (sk->state != TCP_ESTABLISHED) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -ENOTCONN; } diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index bc2097cda..0cc81c464 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -102,22 +102,25 @@ static int rose_rebuild_header(struct sk_buff *skb) struct sk_buff *skbn; if (arp_find(bp + 7, skb)) { - kfree_skb(skb, FREE_WRITE); +#if 0 + /* BUGGGG! If arp_find returned 1, skb does not exist. --ANK*/ + kfree_skb(skb); +#endif return 1; } if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 1; } if (skb->sk != NULL) skb_set_owner_w(skbn, skb->sk); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if (!rose_route_frame(skbn, NULL)) { - kfree_skb(skbn, FREE_WRITE); + kfree_skb(skbn); stats->tx_errors++; } @@ -188,7 +191,7 @@ static int rose_xmit(struct sk_buff *skb, struct device *dev) sti(); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); stats->tx_errors++; diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index 8ee27147a..c462fa696 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -169,7 +169,7 @@ void rose_link_rx_restart(struct sk_buff *skb, struct rose_neigh *neigh, unsigne if (neigh->restarted) { while ((skbn = skb_dequeue(&neigh->queue)) != NULL) if (!rose_send_frame(skbn, neigh)) - kfree_skb(skbn, FREE_WRITE); + kfree_skb(skbn); } } @@ -199,7 +199,7 @@ void rose_transmit_restart_request(struct rose_neigh *neigh) *dptr++ = 0; if (!rose_send_frame(skb, neigh)) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } /* @@ -226,7 +226,7 @@ void rose_transmit_restart_confirmation(struct rose_neigh *neigh) *dptr++ = ROSE_RESTART_CONFIRMATION; if 
(!rose_send_frame(skb, neigh)) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } /* @@ -254,7 +254,7 @@ void rose_transmit_diagnostic(struct rose_neigh *neigh, unsigned char diag) *dptr++ = diag; if (!rose_send_frame(skb, neigh)) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } /* @@ -284,7 +284,7 @@ void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, uns *dptr++ = diagnostic; if (!rose_send_frame(skb, neigh)) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh) @@ -292,7 +292,7 @@ void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh) unsigned char *dptr; if (call_fw_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } @@ -304,7 +304,7 @@ void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh) if (neigh->restarted) { if (!rose_send_frame(skb, neigh)) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } else { skb_queue_tail(&neigh->queue, skb); diff --git a/net/rose/rose_out b/net/rose/rose_out deleted file mode 100644 index 745cb5a2b..000000000 --- a/net/rose/rose_out +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Rose release 001 - * - * This is ALPHA test software. This code may break your machine, randomly fail to work with new - * releases, misbehave and/or generally screw up. It might even work. - * - * This code REQUIRES 2.1.0 or higher/ NET3.029 - * - * This module: - * This module is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - * History - * Rose 001 Jonathan(G4KLX) Cloned from nr_out.c - */ - -#include <linux/config.h> -#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/timer.h> -#include <linux/string.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <net/ax25.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <asm/segment.h> -#include <asm/system.h> -#include <linux/fcntl.h> -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <net/rose.h> - -/* - * This is where all Rose frames pass; - */ -void rose_output(struct sock *sk, struct sk_buff *skb) -{ - struct sk_buff *skbn; - unsigned char header[ROSE_MIN_LEN]; - int err, frontlen, len; - - if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) { - /* Save a copy of the Header */ - memcpy(header, skb->data, ROSE_MIN_LEN); - skb_pull(skb, ROSE_MIN_LEN); - - frontlen = skb_headroom(skb); - - while (skb->len > 0) { - if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, 0, &err)) == NULL) - return; - - skbn->sk = sk; - skbn->free = 1; - skbn->arp = 1; - - skb_reserve(skbn, frontlen); - - len = (ROSE_PACLEN > skb->len) ? skb->len : ROSE_PACLEN; - - /* Copy the user data */ - memcpy(skb_put(skbn, len), skb->data, len); - skb_pull(skb, len); - - /* Duplicate the Header */ - skb_push(skbn, ROSE_MIN_LEN); - memcpy(skbn->data, header, ROSE_MIN_LEN); - - if (skb->len > 0) - skbn->data[2] |= M_BIT; - - skb_queue_tail(&sk->write_queue, skbn); /* Throw it on the queue */ - } - - skb->free = 1; - kfree_skb(skb, FREE_WRITE); - } else { - skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ - } - - if (sk->protinfo.rose->state == ROSE_STATE_3) - rose_kick(sk); -} - -/* - * This procedure is passed a buffer descriptor for an iframe. 
It builds - * the rest of the control part of the frame and then writes it out. - */ -static void rose_send_iframe(struct sock *sk, struct sk_buff *skb, int last) -{ - if (skb == NULL) - return; - - if (last) - skb->data[0] |= D_BIT; - - skb->data[2] |= (sk->protinfo.rose->vr << 5) & 0xE0; - skb->data[2] |= (sk->protinfo.rose->vs << 1) & 0x0E; - - rose_transmit_buffer(sk, skb); -} - -void rose_send_nak_frame(struct sock *sk) -{ - struct sk_buff *skb, *skbn; - - if ((skb = skb_peek(&sk->protinfo.rose->ack_queue)) == NULL) - return; - - if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) - return; - - skbn->data[2] = sk->protinfo.rose->va; - skbn->data[3] = sk->protinfo.rose->vr; - - if (sk->protinfo.rose->condition & OWN_RX_BUSY_CONDITION) - skbn->data[4] |= NR_CHOKE_FLAG; - - rose_transmit_buffer(sk, skbn); - - sk->protinfo.rose->condition &= ~ACK_PENDING_CONDITION; - sk->protinfo.rose->vl = sk->protinfo.rose->vr; - sk->protinfo.rose->t1timer = 0; -} - -void rose_kick(struct sock *sk) -{ - struct sk_buff *skb, *skbn; - int last = 1; - unsigned short start, end, next; - - del_timer(&sk->timer); - - start = (skb_peek(&sk->protinfo.rose->ack_queue) == NULL) ? sk->protinfo.rose->va : sk->protinfo.rose->vs; - end = (sk->protinfo.rose->va + sk->window) % ROSE_MODULUS; - - if (!(sk->protinfo.rose->condition & PEER_RX_BUSY_CONDITION) && - start != end && - skb_peek(&sk->write_queue) != NULL) { - - sk->protinfo.rose->vs = start; - - /* - * Transmit data until either we're out of data to send or - * the window is full. - */ - - /* - * Dequeue the frame and copy it. - */ - skb = skb_dequeue(&sk->write_queue); - - do { - if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { - skb_queue_head(&sk->write_queue, skb); - break; - } - - next = (sk->protinfo.rose->vs + 1) % ROSE_MODULUS; - last = (next == end); - - /* - * Transmit the frame copy. - */ - rose_send_iframe(sk, skbn, last); - - sk->protinfo.rose->vs = next; - - /* - * Requeue the original data frame. 
- */ - skb_queue_tail(&sk->protinfo.rose->ack_queue, skb); - - } while (!last && (skb = skb_dequeue(&sk->write_queue)) != NULL); - - sk->protinfo.rose->vl = sk->protinfo.rose->vr; - sk->protinfo.rose->condition &= ~ACK_PENDING_CONDITION; - } - - rose_set_timer(sk); -} - -void rose_transmit_buffer(struct sock *sk, struct sk_buff *skb) -{ - unsigned char *dptr; - - dptr = skb_push(skb, 1); - *dptr = AX25_P_ROSE; - - skb->arp = 1; - - if (!ax25_send_frame(skb, (ax25_address *)sk->protinfo.rose->neighbour->dev->dev_addr, &sk->protinfo.rose->neighbour->callsign, sk->protinfo.rose->neighbour->digipeat, sk->protinfo.rose->neighbour->dev)) { - kfree_skb(skb, FREE_WRITE); - - sk->state = TCP_CLOSE; - sk->err = ENETUNREACH; - if (!sk->dead) - sk->state_change(sk); - sk->dead = 1; - } -} - -/* - * The following routines are taken from page 170 of the 7th ARRL Computer - * Networking Conference paper, as is the whole state machine. - */ - -void rose_establish_data_link(struct sock *sk) -{ - sk->protinfo.rose->condition = 0x00; - - rose_write_internal(sk, ROSE_CALL_REQUEST); - - sk->protinfo.rose->t1timer = sk->protinfo.rose->t1; -} - -/* - * Never send a NAK when we are CHOKEd. 
- */ -void rose_enquiry_response(struct sock *sk) -{ - int frametype = NR_INFOACK; - - if (sk->protinfo.rose->condition & OWN_RX_BUSY_CONDITION) - frametype |= NR_CHOKE_FLAG; - - rose_write_internal(sk, frametype); - - sk->protinfo.rose->vl = sk->protinfo.rose->vr; - sk->protinfo.rose->condition &= ~ACK_PENDING_CONDITION; -} - -void rose_check_iframes_acked(struct sock *sk, unsigned short nr) -{ - if (sk->protinfo.rose->vs == nr) { - rose_frames_acked(sk, nr); - } else { - if (sk->protinfo.rose->va != nr) { - rose_frames_acked(sk, nr); - } - } -} - -#endif diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index d9145cdea..917846bf7 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -184,7 +184,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) rose_stop_t0timer(rose_neigh); while ((skb = skb_dequeue(&rose_neigh->queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); save_flags(flags); cli(); @@ -534,7 +534,7 @@ static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh) rose_start_ftimer(rose_neigh); while ((skb = skb_dequeue(&rose_neigh->queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); rose_route = rose_route_list; diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index e7709726c..d80212261 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -47,10 +47,10 @@ void rose_clear_queues(struct sock *sk) struct sk_buff *skb; while ((skb = skb_dequeue(&sk->write_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&sk->protinfo.rose->ack_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } /* @@ -68,7 +68,7 @@ void rose_frames_acked(struct sock *sk, unsigned short nr) if (sk->protinfo.rose->va != nr) { while (skb_peek(&sk->protinfo.rose->ack_queue) != NULL && sk->protinfo.rose->va != nr) { skb = skb_dequeue(&sk->protinfo.rose->ack_queue); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); sk->protinfo.rose->va = (sk->protinfo.rose->va 
+ 1) % ROSE_MODULUS; } } @@ -206,7 +206,7 @@ void rose_write_internal(struct sock *sk, int frametype) default: printk(KERN_ERR "ROSE: rose_write_internal - invalid frametype %02X\n", frametype); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } diff --git a/net/sched/.cvsignore b/net/sched/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/sched/.cvsignore +++ b/net/sched/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/sched/Config.in b/net/sched/Config.in new file mode 100644 index 000000000..d1287a781 --- /dev/null +++ b/net/sched/Config.in @@ -0,0 +1,11 @@ +# +# Traffic control configuration. +# +tristate 'CBQ packet scheduler' CONFIG_NET_SCH_CBQ +tristate 'CSZ packet scheduler' CONFIG_NET_SCH_CSZ +#tristate 'HFQ packet scheduler' CONFIG_NET_SCH_HFQ +tristate 'RED queueing discipline' CONFIG_NET_SCH_RED +tristate 'SFQ queueing discipline' CONFIG_NET_SCH_SFQ +tristate 'auxiliary TBF queue' CONFIG_NET_SCH_TBF +tristate 'auxiliary FIFO queue' CONFIG_NET_SCH_PFIFO +tristate 'auxiliary PRIO queue' CONFIG_NET_SCH_PRIO diff --git a/net/sched/sch_csz.c b/net/sched/sch_csz.c index dbc05d31b..5e10ac097 100644 --- a/net/sched/sch_csz.c +++ b/net/sched/sch_csz.c @@ -459,7 +459,7 @@ csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) this = &q->flow[flow_id]; if (this->q.qlen >= this->max_bytes || this->L_tab == NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -711,12 +711,12 @@ csz_reset(struct Qdisc* sch) for (i=0; i<4; i++) while ((skb=skb_dequeue(&q->other[i])) != NULL) - kfree_skb(skb, 0); + kfree_skb(skb); for (i=0; i<CSZ_MAX_GUARANTEED; i++) { struct csz_flow *this = q->flow + i; while ((skb = skb_dequeue(&this->q)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); this->snext = this->sprev = this->fnext = this->fprev = (struct csz_head*)this; this->start = this->finish = 0; diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 8134baf16..af44d4e75 100644 --- a/net/sched/sch_fifo.c +++ 
b/net/sched/sch_fifo.c @@ -47,7 +47,7 @@ bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) q->qbytes += skb->len; return 0; } - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 1; } @@ -71,7 +71,7 @@ bfifo_reset(struct Qdisc* sch) while((skb=skb_dequeue(&sch->q)) != NULL) { q->qbytes -= skb->len; - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); } if (q->qbytes) { printk("fifo_reset: qbytes=%d\n", q->qbytes); @@ -88,7 +88,7 @@ pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) skb_queue_tail(&sch->q, skb); return 0; } - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 1; } @@ -104,7 +104,7 @@ pfifo_reset(struct Qdisc* sch) struct sk_buff *skb; while((skb=skb_dequeue(&sch->q))!=NULL) - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 83aa8d10e..c3399f9c1 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -34,6 +34,9 @@ struct Qdisc_head qdisc_head = { &qdisc_head }; static struct Qdisc_ops *qdisc_base = NULL; +static int default_requeue(struct sk_buff *skb, struct Qdisc* qdisc); + + /* NOTES. Every discipline has two major routines: enqueue and dequeue. 
@@ -75,6 +78,8 @@ int unregister_qdisc(struct Qdisc_ops *qops) break; if (!q) return -ENOENT; + if (q->requeue == NULL) + q->requeue = default_requeue; *qp = q->next; return 0; } @@ -93,7 +98,7 @@ struct Qdisc *qdisc_lookup(int handle) static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -118,6 +123,7 @@ struct Qdisc noqueue_qdisc = }; + /* 3-band FIFO queue: old style, but should be a bit faster (several CPU insns) */ static int @@ -129,11 +135,11 @@ pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7]; if (list->qlen <= skb->dev->tx_queue_len) { - skb_queue_tail(list, skb); + __skb_queue_tail(list, skb); return 1; } qdisc->dropped++; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -145,13 +151,25 @@ pfifo_fast_dequeue(struct Qdisc* qdisc) struct sk_buff *skb; for (prio = 0; prio < 3; prio++, list++) { - skb = skb_dequeue(list); + skb = __skb_dequeue(list); if (skb) return skb; } return NULL; } +static int +pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; + struct sk_buff_head *list; + + list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7]; + + __skb_queue_head(list, skb); + return 1; +} + static void pfifo_fast_reset(struct Qdisc* qdisc) { @@ -185,9 +203,20 @@ static struct Qdisc_ops pfifo_fast_ops = pfifo_fast_dequeue, pfifo_fast_reset, NULL, - pfifo_fast_init + pfifo_fast_init, + NULL, + pfifo_fast_requeue }; +static int +default_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + if (net_ratelimit()) + printk(KERN_DEBUG "%s deferred output. 
It is buggy.\n", skb->dev->name); + kfree_skb(skb); + return 0; +} + static struct Qdisc * qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg) { @@ -200,7 +229,6 @@ qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg) memset(sch, 0, size); skb_queue_head_init(&sch->q); - skb_queue_head_init(&sch->failure_q); sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; @@ -218,7 +246,6 @@ void qdisc_reset(struct Qdisc *qdisc) start_bh_atomic(); if (ops->reset) ops->reset(qdisc); - skb_queue_purge(&qdisc->failure_q); end_bh_atomic(); } } @@ -232,7 +259,6 @@ void qdisc_destroy(struct Qdisc *qdisc) ops->reset(qdisc); if (ops->destroy) ops->destroy(qdisc); - skb_queue_purge(&qdisc->failure_q); ops->refcnt--; end_bh_atomic(); kfree(qdisc); @@ -373,23 +399,22 @@ int qdisc_restart(struct device *dev) struct Qdisc *q = dev->qdisc; struct sk_buff *skb; - skb = skb_dequeue(&q->failure_q); - if (!skb) { - skb = q->dequeue(q); - if (netdev_nit && skb) - dev_queue_xmit_nit(skb,dev); - } - if (skb) { + if ((skb = q->dequeue(q)) != NULL) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + if (dev->hard_start_xmit(skb, dev) == 0) { q->tx_last = jiffies; return -1; } -#if 0 - if (net_ratelimit()) - printk(KERN_DEBUG "netdevice %s defers output.\n", dev->name); -#endif - skb_queue_head(&q->failure_q, skb); - return -1; + + if (q->ops) { + q->ops->requeue(skb, q); + return -1; + } + + printk(KERN_DEBUG "%s: it is impossible!!!\n", dev->name); + kfree_skb(skb); } return q->q.qlen; } @@ -511,9 +536,6 @@ __initfunc(int pktsched_init(void)) register_qdisc(&##name##_ops); \ } - skb_queue_head_init(&noop_qdisc.failure_q); - skb_queue_head_init(&noqueue_qdisc.failure_q); - register_qdisc(&pfifo_fast_ops); #ifdef CONFIG_NET_SCH_CBQ INIT_QDISC(cbq); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index fd3ee43ac..637288d99 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -189,7 +189,7 @@ enqueue: return 1; } drop: - kfree_skb(skb, 
FREE_WRITE); + kfree_skb(skb); return 0; } if (q->qave >= q->qth_max) { @@ -231,7 +231,7 @@ red_reset(struct Qdisc* sch) while((skb=skb_dequeue(&sch->q))!=NULL) { q->qbytes -= skb->len; - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); } if (q->qbytes) { printk("red_reset: qbytes=%lu\n", q->qbytes); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 65c3906b4..7a90df655 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -12,7 +12,6 @@ #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -174,7 +173,7 @@ static __inline__ void sfq_drop(struct sfq_sched_data *q) sfq_index x = q->dep[d].next; skb = q->qs[x].prev; __skb_unlink(skb, &q->qs[x]); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); sfq_dec(q, x); /* sch->q.qlen--; @@ -189,7 +188,7 @@ static __inline__ void sfq_drop(struct sfq_sched_data *q) q->allot[q->next[d]] += q->quantum; skb = q->qs[d].prev; __skb_unlink(skb, &q->qs[d]); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); sfq_dec(q, d); /* sch->q.qlen--; @@ -271,7 +270,7 @@ sfq_reset(struct Qdisc* sch) struct sk_buff *skb; while ((skb = sfq_dequeue(sch)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 9869af1d3..b4f141761 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -118,7 +118,7 @@ tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) __skb_unlink(skb, &sch->q); q->bytes -= skb->len; - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } @@ -187,7 +187,7 @@ tbf_reset(struct Qdisc* sch) struct sk_buff *skb; while ((skb = __skb_dequeue(&sch->q)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); q->bytes = 0; PSCHED_GET_TIME(q->t_c); q->tokens = q->depth; diff --git a/net/socket.c b/net/socket.c index 697a06cd3..5c9534031 100644 --- a/net/socket.c +++ b/net/socket.c @@ -11,7 +11,7 @@ * Anonymous : NOTSOCK/BADF cleanup. 
Error fix in * shutdown() * Alan Cox : verify_area() fixes - * Alan Cox : Removed DDI + * Alan Cox : Removed DDI * Jonathan Kamens : SOCK_DGRAM reconnect bug * Alan Cox : Moved a load of checks to the very * top level. @@ -152,7 +152,7 @@ static int sockets_in_use = 0; the AF_UNIX size (see net/unix/af_unix.c :unix_mkname()). */ - + int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr) { if(ulen<0||ulen>MAX_SOCK_ADDR) @@ -184,7 +184,7 @@ int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen) * "fromlen shall refer to the value before truncation.." * 1003.1g */ - return __put_user(klen, ulen); + return __put_user(klen, ulen); } /* @@ -221,7 +221,7 @@ static int get_fd(struct inode *inode) */ inode->i_count++; - current->files->fd[fd] = file; + fd_install(fd, file); file->f_op = &socket_file_ops; file->f_mode = 3; file->f_flags = O_RDWR; @@ -239,10 +239,11 @@ extern __inline__ struct socket *socki_lookup(struct inode *inode) * Go from a file number to its socket slot. */ -extern __inline__ struct socket *sockfd_lookup(int fd, int *err) +extern struct socket *sockfd_lookup(int fd, int *err) { struct file *file; struct inode *inode; + struct socket *sock; if (!(file = fget(fd))) { @@ -251,14 +252,18 @@ extern __inline__ struct socket *sockfd_lookup(int fd, int *err) } inode = file->f_dentry->d_inode; - if (!inode || !inode->i_sock || !socki_lookup(inode)) + if (!inode || !inode->i_sock || !(sock = socki_lookup(inode))) { *err = -ENOTSOCK; fput(file); return NULL; } - return socki_lookup(inode); + if (sock->file != file) { + printk(KERN_ERR "socki_lookup: socket file changed!\n"); + sock->file = file; + } + return sock; } extern __inline__ void sockfd_put(struct socket *sock) @@ -301,14 +306,15 @@ struct socket *sock_alloc(void) void sock_release(struct socket *sock) { - int oldstate; - - if ((oldstate = sock->state) != SS_UNCONNECTED) + if (sock->state != SS_UNCONNECTED) sock->state = SS_DISCONNECTING; if (sock->ops) sock->ops->release(sock, NULL); 
+ if (sock->fasync_list) + printk(KERN_ERR "sock_release: fasync list not empty!\n"); + --sockets_in_use; /* Bookkeeping.. */ sock->file=NULL; iput(sock->inode); @@ -320,13 +326,10 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg, int size) struct scm_cookie scm; err = scm_send(sock, msg, &scm); - if (err < 0) - return err; - - err = sock->ops->sendmsg(sock, msg, size, &scm); - - scm_destroy(&scm); - + if (err >= 0) { + err = sock->ops->sendmsg(sock, msg, size, &scm); + scm_destroy(&scm); + } return err; } @@ -337,11 +340,8 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) memset(&scm, 0, sizeof(scm)); size = sock->ops->recvmsg(sock, msg, size, flags, &scm); - - if (size < 0) - return size; - - scm_recv(sock, msg, &scm, flags); + if (size >= 0) + scm_recv(sock, msg, &scm, flags); return size; } @@ -453,7 +453,7 @@ int sock_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = socki_lookup(inode); - return sock->ops->ioctl(sock, cmd, arg); + return sock->ops->ioctl(sock, cmd, arg); } @@ -467,7 +467,7 @@ static unsigned int sock_poll(struct file *file, poll_table * wait) * We can't return errors to poll, so it's either yes or no. */ - return sock->ops->poll(sock, wait); + return sock->ops->poll(file, sock, wait); } @@ -491,7 +491,7 @@ int sock_close(struct inode *inode, struct file *filp) /* * Update the socket async list */ - + static int sock_fasync(struct file *filp, int on) { struct fasync_struct *fa, *fna=NULL, **prev; @@ -571,12 +571,12 @@ int sock_create(int family, int type, int protocol, struct socket **res) int i; struct socket *sock; - /* - * Check protocol is in range - */ - if(family<0||family>=NPROTO) + /* + * Check protocol is in range + */ + if(family<0||family>=NPROTO) return -EINVAL; - + #if defined(CONFIG_KERNELD) && defined(CONFIG_NET) /* Attempt to load a protocol module if the find failed. 
* @@ -593,14 +593,14 @@ int sock_create(int family, int type, int protocol, struct socket **res) #endif if (net_families[family]==NULL) - return -EINVAL; + return -EINVAL; /* * Check that this is a type that we know how to manipulate and * the protocol makes sense here. The family can still reject the * protocol later. */ - + if ((type != SOCK_STREAM && type != SOCK_DGRAM && type != SOCK_SEQPACKET && type != SOCK_RAW && type != SOCK_RDM && #ifdef CONFIG_XTP @@ -663,9 +663,8 @@ out: asmlinkage int sys_socketpair(int family, int type, int protocol, int usockvec[2]) { - int fd1, fd2, i; - struct socket *sock1=NULL, *sock2=NULL; - int err; + struct socket *sock1, *sock2; + int fd1, fd2, err; lock_kernel(); @@ -674,48 +673,51 @@ asmlinkage int sys_socketpair(int family, int type, int protocol, int usockvec[2 * supports the socketpair call. */ - if ((fd1 = sys_socket(family, type, protocol)) < 0) { - err = fd1; + err = sys_socket(family, type, protocol); + if (err < 0) goto out; - } + fd1 = err; - sock1 = sockfd_lookup(fd1, &err); - if (!sock1) - goto out; /* - * Now grab another socket and try to connect the two together. 
+ * Now grab another socket */ err = -EINVAL; - if ((fd2 = sys_socket(family, type, protocol)) < 0) - { - sys_close(fd1); - goto out; - } + fd2 = sys_socket(family, type, protocol); + if (fd2 < 0) + goto out_close1; - sock2 = sockfd_lookup(fd2,&err); + /* + * Get the sockets for the two fd's + */ + sock1 = sockfd_lookup(fd1, &err); + if (!sock1) + goto out_close2; + sock2 = sockfd_lookup(fd2, &err); if (!sock2) - goto out; - if ((i = sock1->ops->socketpair(sock1, sock2)) < 0) - { - sys_close(fd1); + goto out_put1; + + /* try to connect the two sockets together */ + err = sock1->ops->socketpair(sock1, sock2); + if (err < 0) + goto out_put2; + + err = put_user(fd1, &usockvec[0]); + if (err) + goto out_put2; + err = put_user(fd2, &usockvec[1]); + +out_put2: + sockfd_put(sock2); +out_put1: + sockfd_put(sock1); + + if (err) { + out_close2: sys_close(fd2); - err = i; - } - else - { - err = put_user(fd1, &usockvec[0]); - if (!err) - err = put_user(fd2, &usockvec[1]); - if (err) { - sys_close(fd1); - sys_close(fd2); - } + out_close1: + sys_close(fd1); } out: - if(sock1) - sockfd_put(sock1); - if(sock2) - sockfd_put(sock2); unlock_kernel(); return err; } @@ -728,7 +730,7 @@ out: * We move the socket address to kernel space before we call * the protocol layer (having also checked the address is ok). 
*/ - + asmlinkage int sys_bind(int fd, struct sockaddr *umyaddr, int addrlen) { struct socket *sock; @@ -790,58 +792,54 @@ asmlinkage int sys_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_ad int len; lock_kernel(); + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + restart: - if ((sock = sockfd_lookup(fd, &err))!=NULL) - { - if (!(newsock = sock_alloc())) - { - err=-EMFILE; - goto out; - } + err = -EMFILE; + if (!(newsock = sock_alloc())) + goto out_put; - inode = newsock->inode; - newsock->type = sock->type; + inode = newsock->inode; + newsock->type = sock->type; - if ((err = sock->ops->dup(newsock, sock)) < 0) - { - sock_release(newsock); - goto out; - } + err = sock->ops->dup(newsock, sock); + if (err < 0) + goto out_release; - err = newsock->ops->accept(sock, newsock, current->files->fd[fd]->f_flags); + err = newsock->ops->accept(sock, newsock, sock->file->f_flags); + if (err < 0) + goto out_release; + newsock = socki_lookup(inode); - if (err < 0) - { - sock_release(newsock); - goto out; - } - newsock = socki_lookup(inode); + if ((err = get_fd(inode)) < 0) + goto out_inval; + newsock->file = current->files->fd[err]; - if ((err = get_fd(inode)) < 0) + if (upeer_sockaddr) + { + /* Handle the race where the accept works and we + then getname after it has closed again */ + if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 1)<0) { - sock_release(newsock); - err=-EINVAL; - goto out; + sys_close(err); + goto restart; } + move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen); + } - newsock->file = current->files->fd[err]; - - if (upeer_sockaddr) - { - /* Handle the race where the accept works and we - then getname after it has closed again */ - if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 1)<0) - { - sys_close(err); - goto restart; - } - move_addr_to_user(address,len, upeer_sockaddr, upeer_addrlen); - } +out_put: + sockfd_put(sock); out: - sockfd_put(sock); - } unlock_kernel(); return err; + 
+out_inval: + err = -EINVAL; +out_release: + sock_release(newsock); + goto out_put; } @@ -856,7 +854,7 @@ out: * other SEQPACKET protocols that take time to connect() as it doesn't * include the -EINPROGRESS status for such sockets. */ - + asmlinkage int sys_connect(int fd, struct sockaddr *uservaddr, int addrlen) { struct socket *sock; @@ -864,13 +862,17 @@ asmlinkage int sys_connect(int fd, struct sockaddr *uservaddr, int addrlen) int err; lock_kernel(); - if ((sock = sockfd_lookup(fd,&err))!=NULL) - { - if((err=move_addr_to_kernel(uservaddr,addrlen,address))>=0) - err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen, - current->files->fd[fd]->f_flags); - sockfd_put(sock); - } + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + err = move_addr_to_kernel(uservaddr, addrlen, address); + if (err < 0) + goto out_put; + err = sock->ops->connect(sock, (struct sockaddr *) address, addrlen, + sock->file->f_flags); +out_put: + sockfd_put(sock); +out: unlock_kernel(); return err; } @@ -884,16 +886,20 @@ asmlinkage int sys_getsockname(int fd, struct sockaddr *usockaddr, int *usockadd { struct socket *sock; char address[MAX_SOCK_ADDR]; - int len; - int err; + int len, err; lock_kernel(); - if ((sock = sockfd_lookup(fd, &err))!=NULL) - { - if((err=sock->ops->getname(sock, (struct sockaddr *)address, &len, 0))==0) - err=move_addr_to_user(address,len, usockaddr, usockaddr_len); - sockfd_put(sock); - } + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + err = sock->ops->getname(sock, (struct sockaddr *)address, &len, 0); + if (err) + goto out_put; + err = move_addr_to_user(address, len, usockaddr, usockaddr_len); + +out_put: + sockfd_put(sock); +out: unlock_kernel(); return err; } @@ -902,7 +908,7 @@ asmlinkage int sys_getsockname(int fd, struct sockaddr *usockaddr, int *usockadd * Get the remote address ('name') of a socket object. Move the obtained * name to user space. 
*/ - + asmlinkage int sys_getpeername(int fd, struct sockaddr *usockaddr, int *usockaddr_len) { struct socket *sock; @@ -934,27 +940,29 @@ asmlinkage int sys_send(int fd, void * buff, size_t len, unsigned flags) struct iovec iov; lock_kernel(); - if ((sock = sockfd_lookup(fd, &err))!=NULL) - { - if(len>=0) - { - iov.iov_base=buff; - iov.iov_len=len; - msg.msg_name=NULL; - msg.msg_namelen=0; - msg.msg_iov=&iov; - msg.msg_iovlen=1; - msg.msg_control=NULL; - msg.msg_controllen=0; - if (current->files->fd[fd]->f_flags & O_NONBLOCK) - flags |= MSG_DONTWAIT; - msg.msg_flags=flags; - err=sock_sendmsg(sock, &msg, len); - } - else - err=-EINVAL; - sockfd_put(sock); - } + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + err = -EINVAL; + if (len < 0) + goto out_put; + + iov.iov_base=buff; + iov.iov_len=len; + msg.msg_name=NULL; + msg.msg_namelen=0; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + msg.msg_flags = flags; + err = sock_sendmsg(sock, &msg, len); + +out_put: + sockfd_put(sock); +out: unlock_kernel(); return err; } @@ -975,36 +983,37 @@ asmlinkage int sys_sendto(int fd, void * buff, size_t len, unsigned flags, struct iovec iov; lock_kernel(); - if ((sock = sockfd_lookup(fd,&err))!=NULL) + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + iov.iov_base=buff; + iov.iov_len=len; + msg.msg_name=NULL; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_namelen=addr_len; + if(addr) { - iov.iov_base=buff; - iov.iov_len=len; - msg.msg_name=NULL; - msg.msg_iov=&iov; - msg.msg_iovlen=1; - msg.msg_control=NULL; - msg.msg_controllen=0; - msg.msg_namelen=addr_len; - if(addr) - { - err=move_addr_to_kernel(addr,addr_len,address); - if (err < 0) - goto bad; - msg.msg_name=address; - } - if (current->files->fd[fd]->f_flags & O_NONBLOCK) - flags |= MSG_DONTWAIT; - msg.msg_flags=flags; - err=sock_sendmsg(sock, &msg, 
len); -bad: - sockfd_put(sock); + err = move_addr_to_kernel(addr, addr_len, address); + if (err < 0) + goto out_put; + msg.msg_name=address; } + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + msg.msg_flags = flags; + err = sock_sendmsg(sock, &msg, len); + +out_put: + sockfd_put(sock); +out: unlock_kernel(); return err; } - /* * Receive a frame from the socket and optionally record the address of the * sender. We verify the buffers are writable and if needed move the @@ -1021,26 +1030,30 @@ asmlinkage int sys_recvfrom(int fd, void * ubuf, size_t size, unsigned flags, int err,err2; lock_kernel(); - if ((sock = sockfd_lookup(fd, &err))!=NULL) - { - msg.msg_control=NULL; - msg.msg_controllen=0; - msg.msg_iovlen=1; - msg.msg_iov=&iov; - iov.iov_len=size; - iov.iov_base=ubuf; - msg.msg_name=address; - msg.msg_namelen=MAX_SOCK_ADDR; - err=sock_recvmsg(sock, &msg, size, - (current->files->fd[fd]->f_flags & O_NONBLOCK) ? (flags | MSG_DONTWAIT) : flags); - if(err>=0 && addr!=NULL) - { - err2=move_addr_to_user(address, msg.msg_namelen, addr, addr_len); - if(err2<0) - err=err2; - } - sockfd_put(sock); - } + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_iovlen=1; + msg.msg_iov=&iov; + iov.iov_len=size; + iov.iov_base=ubuf; + msg.msg_name=address; + msg.msg_namelen=MAX_SOCK_ADDR; + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + err=sock_recvmsg(sock, &msg, size, flags); + + if(err >= 0 && addr != NULL) + { + err2=move_addr_to_user(address, msg.msg_namelen, addr, addr_len); + if(err2<0) + err=err2; + } + sockfd_put(sock); +out: unlock_kernel(); return err; } @@ -1058,7 +1071,7 @@ asmlinkage int sys_recv(int fd, void * ubuf, size_t size, unsigned flags) * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. 
*/ - + asmlinkage int sys_setsockopt(int fd, int level, int optname, char *optval, int optlen) { int err; @@ -1104,7 +1117,7 @@ asmlinkage int sys_getsockopt(int fd, int level, int optname, char *optval, int /* * Shutdown a socket. */ - + asmlinkage int sys_shutdown(int fd, int how) { int err; @@ -1123,7 +1136,7 @@ asmlinkage int sys_shutdown(int fd, int how) /* * BSD sendmsg interface */ - + asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags) { struct socket *sock; @@ -1137,63 +1150,59 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags) lock_kernel(); + err=-EFAULT; if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr))) - { - err=-EFAULT; goto out; - } /* do not move before msg_sys is valid */ if (msg_sys.msg_iovlen>UIO_MAXIOV) goto out; /* This will also move the address data into kernel space */ err = verify_iovec(&msg_sys, iov, address, VERIFY_READ); - if (err < 0) + if (err < 0) goto out; + total_len=err; + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + if (msg_sys.msg_controllen) { - /* XXX We just limit the buffer and assume that the - * skbuff accounting stops it from going too far. - * I hope this is correct. - */ - if (msg_sys.msg_controllen > 256) { - err = -EINVAL; - goto failed2; - } if (msg_sys.msg_controllen > sizeof(ctl)) { - ctl_buf = kmalloc(msg_sys.msg_controllen, GFP_KERNEL); + /* Suggested by the Advanced Sockets API for IPv6 draft: + * Limit the msg_controllen size by the SO_SNDBUF size. + */ + /* Note - when this code becomes multithreaded on + * SMP machines you have a race to fix here. 
+ */ + err = -ENOBUFS; + ctl_buf = sock_kmalloc(sock->sk, msg_sys.msg_controllen, + GFP_KERNEL); if (ctl_buf == NULL) - { - err = -ENOBUFS; goto failed2; - } } + err = -EFAULT; if (copy_from_user(ctl_buf, msg_sys.msg_control, - msg_sys.msg_controllen)) { - err = -EFAULT; + msg_sys.msg_controllen)) goto failed; - } msg_sys.msg_control = ctl_buf; } msg_sys.msg_flags = flags; - if ((sock = sockfd_lookup(fd,&err))!=NULL) - { - if (current->files->fd[fd]->f_flags & O_NONBLOCK) - msg_sys.msg_flags |= MSG_DONTWAIT; - err = sock_sendmsg(sock, &msg_sys, total_len); - sockfd_put(sock); - } + if (sock->file->f_flags & O_NONBLOCK) + msg_sys.msg_flags |= MSG_DONTWAIT; + err = sock_sendmsg(sock, &msg_sys, total_len); failed: - if (ctl_buf != ctl) - kfree_s(ctl_buf, msg_sys.msg_controllen); + if (ctl_buf != ctl) + sock_kfree_s(sock->sk, ctl_buf, msg_sys.msg_controllen); failed2: if (msg_sys.msg_iov != iov) kfree(msg_sys.msg_iov); -out: + sockfd_put(sock); +out: unlock_kernel(); return err; } @@ -1201,7 +1210,7 @@ out: /* * BSD recvmsg interface */ - + asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags) { struct socket *sock; @@ -1250,7 +1259,7 @@ asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags) if ((sock = sockfd_lookup(fd, &err))!=NULL) { - if (current->files->fd[fd]->f_flags&O_NONBLOCK) + if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err=sock_recvmsg(sock, &msg_sys, total_len, flags); if(err>=0) @@ -1262,12 +1271,13 @@ asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags) if (uaddr != NULL && err>=0) err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr, uaddr_len); - if (err>=0) { - err = __put_user(msg_sys.msg_flags, &msg->msg_flags); - if (!err) - err = __put_user((unsigned long)msg_sys.msg_control-cmsg_ptr, + if (err < 0) + goto out; + err = __put_user(msg_sys.msg_flags, &msg->msg_flags); + if (err) + goto out; + err = __put_user((unsigned long)msg_sys.msg_control-cmsg_ptr, 
&msg->msg_controllen); - } out: unlock_kernel(); if(err<0) @@ -1295,8 +1305,8 @@ int sock_fcntl(struct file *filp, unsigned int cmd, unsigned long arg) /* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) static unsigned char nargs[18]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), - AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), - AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)}; + AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), + AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)}; #undef AL /* @@ -1390,7 +1400,7 @@ asmlinkage int sys_socketcall(int call, unsigned long *args) * advertise its address family, and have it linked into the * SOCKET module. */ - + int sock_register(struct net_proto_family *ops) { if (ops->family >= NPROTO) { @@ -1406,7 +1416,7 @@ int sock_register(struct net_proto_family *ops) * remove its address family, and have it unlinked from the * SOCKET module. */ - + int sock_unregister(int family) { if (family < 0 || family >= NPROTO) @@ -1432,6 +1442,9 @@ __initfunc(void proto_init(void)) } extern void sk_init(void); +#ifdef CONFIG_WAN_ROUTER +extern void wanrouter_init(void); +#endif __initfunc(void sock_init(void)) { @@ -1451,7 +1464,14 @@ __initfunc(void sock_init(void)) */ sk_init(); - + +#ifdef SLAB_SKB + /* + * Initialize skbuff SLAB cache + */ + skb_init(); +#endif + /* * Wan router layer. 
diff --git a/net/sunrpc/.cvsignore b/net/sunrpc/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/sunrpc/.cvsignore +++ b/net/sunrpc/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7abaa691e..e8ca9a511 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -21,9 +21,6 @@ * Copyright (C) 1995,1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/config.h> -#include <linux/module.h> - #include <asm/system.h> #include <asm/segment.h> @@ -72,20 +69,19 @@ rpc_create_client(struct rpc_xprt *xprt, char *servname, struct rpc_program *program, u32 vers, int flavor) { struct rpc_version *version; - struct rpc_clnt *clnt; + struct rpc_clnt *clnt = NULL; dprintk("RPC: creating %s client for %s (xprt %p)\n", - program->name, servname, xprt); + program->name, servname, xprt); if (!xprt) - return NULL; - if (vers>= program->nrvers || !(version = program->version[vers])) - return NULL; + goto out; + if (vers >= program->nrvers || !(version = program->version[vers])) + goto out; - if (!(clnt = (struct rpc_clnt *) rpc_allocate(0, sizeof(*clnt)))) { - printk("RPC: out of memory in rpc_create_client\n"); - return NULL; - } + clnt = (struct rpc_clnt *) rpc_allocate(0, sizeof(*clnt)); + if (!clnt) + goto out_no_clnt; memset(clnt, 0, sizeof(*clnt)); clnt->cl_xprt = xprt; @@ -103,13 +99,20 @@ rpc_create_client(struct rpc_xprt *xprt, char *servname, if (!clnt->cl_port) clnt->cl_autobind = 1; - if (!rpcauth_create(flavor, clnt)) { - printk("RPC: Couldn't create auth handle (flavor %d)\n", - flavor); - rpc_free(clnt); - return NULL; - } + if (!rpcauth_create(flavor, clnt)) + goto out_no_auth; +out: return clnt; + +out_no_clnt: + printk("RPC: out of memory in rpc_create_client\n"); + goto out; +out_no_auth: + printk("RPC: Couldn't create auth handle (flavor %d)\n", + flavor); + rpc_free(clnt); + clnt = NULL; + goto out; } /* @@ -753,8 +756,10 @@ call_verify(struct rpc_task *task) rpc_exit(task, error); return NULL; } - if (!(p = 
rpcauth_checkverf(task, p))) + if (!(p = rpcauth_checkverf(task, p))) { + printk("call_verify: auth check failed\n"); goto garbage; /* bad verifier, retry */ + } switch ((n = ntohl(*p++))) { case RPC_SUCCESS: return p; @@ -768,7 +773,8 @@ call_verify(struct rpc_task *task) garbage: dprintk("RPC: %4d call_verify: server saw garbage\n", task->tk_pid); task->tk_client->cl_stats->rpcgarbage++; - if (0 && task->tk_garb_retry--) { + if (task->tk_garb_retry--) { + printk("RPC: garbage, retrying %4d\n", task->tk_pid); task->tk_action = call_encode; return NULL; } @@ -776,24 +782,3 @@ garbage: rpc_exit(task, -EIO); return NULL; } - -#ifdef MODULE -int -init_module(void) -{ -#ifdef RPC_DEBUG - rpc_register_sysctl(); -#endif - rpc_proc_init(); - return 0; -} - -void -cleanup_module(void) -{ -#ifdef RPC_DEBUG - rpc_unregister_sysctl(); -#endif - rpc_proc_exit(); -} -#endif diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6e14bb287..765dc05fc 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -16,6 +16,7 @@ #include <linux/unistd.h> #include <linux/smp.h> #include <linux/smp_lock.h> + #include <linux/sunrpc/clnt.h> #ifdef RPC_DEBUG @@ -45,6 +46,11 @@ static struct rpc_wait_queue schedq = RPC_INIT_WAITQ("schedq"); static struct rpc_wait_queue childq = RPC_INIT_WAITQ("childq"); /* + * RPC tasks sit here while waiting for conditions to improve. + */ +static struct rpc_wait_queue delay_queue = RPC_INIT_WAITQ("delayq"); + +/* * All RPC tasks are linked into this list */ static struct rpc_task * all_tasks = NULL; @@ -92,7 +98,8 @@ rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) } /* - * Remove request from queue + * Remove request from queue. + * Note: must be called with interrupts disabled. */ void rpc_remove_wait_queue(struct rpc_task *task) @@ -149,6 +156,9 @@ rpc_del_timer(struct rpc_task *task) /* * Make an RPC task runnable. 
+ * + * Note: If the task is ASYNC, this must be called with + * interrupts disabled to protect the wait queue operation. */ static inline void rpc_make_runnable(struct rpc_task *task) @@ -313,8 +323,6 @@ static void __rpc_atrun(struct rpc_task *); void rpc_delay(struct rpc_task *task, unsigned long delay) { - static struct rpc_wait_queue delay_queue; - task->tk_timeout = delay; rpc_sleep_on(&delay_queue, task, NULL, __rpc_atrun); } @@ -388,12 +396,14 @@ __rpc_execute(struct rpc_task *task) /* sync task: sleep here */ dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); + if (current->pid == rpciod_pid) + printk("RPC: rpciod waiting on sync task!\n"); current->timeout = 0; sleep_on(&task->tk_wait); /* When the task received a signal, remove from * any queues etc, and make runnable again. */ - if (signalled()) + if (0 && signalled()) __rpc_wake_up(task); dprintk("RPC: %4d sync task resuming\n", @@ -433,10 +443,15 @@ rpc_execute(struct rpc_task *task) static int executing = 0; int incr = RPC_IS_ASYNC(task)? 
1 : 0; - if (incr && (executing || rpc_inhibit)) { - printk("RPC: rpc_execute called recursively!\n"); - return; + if (incr) { + if (rpc_inhibit) { + printk("RPC: execution inhibited!\n"); + return; + } + if (executing) + printk("RPC: %d tasks executed\n", executing); } + executing += incr; __rpc_execute(task); executing -= incr; @@ -519,6 +534,7 @@ rpc_allocate(unsigned int flags, unsigned int size) if (flags & RPC_TASK_ASYNC) return NULL; current->timeout = jiffies + (HZ >> 4); + current->state = TASK_INTERRUPTIBLE; schedule(); } while (!signalled()); @@ -684,20 +700,27 @@ rpc_new_child(struct rpc_clnt *clnt, struct rpc_task *parent) { struct rpc_task *task; - if (!(task = rpc_new_task(clnt, NULL, RPC_TASK_ASYNC|RPC_TASK_CHILD))) { - parent->tk_status = -ENOMEM; - return NULL; - } + task = rpc_new_task(clnt, NULL, RPC_TASK_ASYNC | RPC_TASK_CHILD); + if (!task) + goto fail; task->tk_exit = rpc_child_exit; task->tk_calldata = parent; - return task; + +fail: + parent->tk_status = -ENOMEM; + return NULL; } void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) { + unsigned long oldflags; + + save_flags(oldflags); cli(); rpc_make_runnable(child); + restore_flags(oldflags); + /* N.B. Is it possible for the child to have already finished? */ rpc_sleep_on(&childq, task, func, NULL); } @@ -711,6 +734,7 @@ rpc_killall_tasks(struct rpc_clnt *clnt) struct rpc_task **q, *rovr; dprintk("RPC: killing all tasks for client %p\n", clnt); + /* N.B. Why bother to inhibit? Nothing blocks here ... */ rpc_inhibit++; for (q = &all_tasks; (rovr = *q); q = &rovr->tk_next_task) { if (!clnt || rovr->tk_client == clnt) { @@ -792,29 +816,21 @@ static void rpciod_killall(void) { unsigned long flags; - sigset_t old_set; - - /* FIXME: What had been going on before was saving and restoring - current->signal. This as opposed to blocking signals? Do we - still need them to wake up out of schedule? In any case it - isn't playing nice and a better way should be found. 
*/ - - spin_lock_irqsave(¤t->sigmask_lock, flags); - old_set = current->blocked; - sigfillset(¤t->blocked); - recalc_sigpending(current); - spin_unlock_irqrestore(¤t->sigmask_lock, flags); while (all_tasks) { + current->sigpending = 0; rpc_killall_tasks(NULL); __rpc_schedule(); - current->timeout = jiffies + HZ / 100; - need_resched = 1; - schedule(); + if (all_tasks) { +printk("rpciod_killall: waiting for tasks to exit\n"); + current->state = TASK_INTERRUPTIBLE; + current->timeout = jiffies + 1; + schedule(); + current->timeout = 0; + } } spin_lock_irqsave(¤t->sigmask_lock, flags); - current->blocked = old_set; recalc_sigpending(current); spin_unlock_irqrestore(¤t->sigmask_lock, flags); } @@ -901,3 +917,37 @@ out: up(&rpciod_sema); MOD_DEC_USE_COUNT; } + +#ifdef RPC_DEBUG +#include <linux/nfs_fs.h> +void rpc_show_tasks(void) +{ + struct rpc_task *t = all_tasks, *next; + struct nfs_wreq *wreq; + + if (!t) + return; + printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " + "-rpcwait -action- --exit--\n"); + for (; t; t = next) { + next = t->tk_next_task; + printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n", + t->tk_pid, t->tk_proc, t->tk_flags, t->tk_status, + t->tk_client, t->tk_client->cl_prog, + t->tk_rqstp, t->tk_timeout, + t->tk_rpcwait ? 
rpc_qname(t->tk_rpcwait) : " <NULL> ", + t->tk_action, t->tk_exit); + + if (!(t->tk_flags & RPC_TASK_NFSWRITE)) + continue; + /* NFS write requests */ + wreq = (struct nfs_wreq *) t->tk_calldata; + printk(" NFS: flgs=%08x, pid=%d, pg=%p, off=(%d, %d)\n", + wreq->wb_flags, wreq->wb_pid, wreq->wb_page, + wreq->wb_offset, wreq->wb_bytes); + printk(" name=%s/%s\n", + wreq->wb_dentry->d_parent->d_name.name, + wreq->wb_dentry->d_name.name); + } +} +#endif diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 90a23a232..94a5ba21c 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -12,6 +12,8 @@ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ +#include <linux/module.h> + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/proc_fs.h> @@ -20,7 +22,7 @@ #define RPCDBG_FACILITY RPCDBG_MISC -static struct proc_dir_entry *proc_net_rpc = 0; +static struct proc_dir_entry *proc_net_rpc = NULL; /* * Get RPC client stats @@ -161,15 +163,61 @@ void rpc_proc_init(void) { dprintk("RPC: registering /proc/net/rpc\n"); - if (!proc_net_rpc) - proc_net_rpc = create_proc_entry("net/rpc", S_IFDIR, 0); + if (!proc_net_rpc) { + struct proc_dir_entry *ent; + ent = create_proc_entry("net/rpc", S_IFDIR, 0); + if (ent) { +#ifdef MODULE + ent->fill_inode = rpc_modcount; +#endif + proc_net_rpc = ent; + } + } } void rpc_proc_exit(void) { dprintk("RPC: unregistering /proc/net/rpc\n"); - if (proc_net_rpc) + if (proc_net_rpc) { + proc_net_rpc = NULL; remove_proc_entry("net/rpc", 0); - proc_net_rpc = 0; + } +} + +#ifdef MODULE +/* + * This is called as the proc_dir_entry fill_inode function + * when an inode is going into or out of service (fill == 1 + * or 0 respectively). + * + * We use it here to keep the module from being unloaded + * while /proc inodes are in use. 
+ */ +void rpc_modcount(struct inode *inode, int fill) +{ + if (fill) + MOD_INC_USE_COUNT; + else + MOD_DEC_USE_COUNT; +} + +int +init_module(void) +{ +#ifdef RPC_DEBUG + rpc_register_sysctl(); +#endif + rpc_proc_init(); + return 0; +} + +void +cleanup_module(void) +{ +#ifdef RPC_DEBUG + rpc_unregister_sysctl(); +#endif + rpc_proc_exit(); } +#endif diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 73f805f40..8c1df5a50 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -62,7 +62,6 @@ EXPORT_SYMBOL(rpcauth_releasecred); /* RPC server stuff */ EXPORT_SYMBOL(svc_create); -EXPORT_SYMBOL(svc_create_socket); EXPORT_SYMBOL(svc_create_thread); EXPORT_SYMBOL(svc_exit_thread); EXPORT_SYMBOL(svc_destroy); @@ -70,6 +69,7 @@ EXPORT_SYMBOL(svc_drop); EXPORT_SYMBOL(svc_process); EXPORT_SYMBOL(svc_recv); EXPORT_SYMBOL(svc_wake_up); +EXPORT_SYMBOL(svc_makesock); /* RPC statistics */ #ifdef CONFIG_PROC_FS diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79882845c..1d1c0a95e 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -172,7 +172,6 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) { struct svc_program *progp; unsigned long flags; - sigset_t old_set; int i, error = 0, dummy; progp = serv->sv_program; @@ -180,18 +179,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) dprintk("RPC: svc_register(%s, %s, %d)\n", progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port); - /* FIXME: What had been going on before was saving and restoring - current->signal. This as opposed to blocking signals? Do we - still need them to wake up out of schedule? In any case it - isn't playing nice and a better way should be found. 
*/ - - if (!port) { - spin_lock_irqsave(&current->sigmask_lock, flags); - old_set = current->blocked; - sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); - } + if (!port) + current->sigpending = 0; for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) @@ -207,7 +196,6 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) if (!port) { spin_lock_irqsave(&current->sigmask_lock, flags); - current->blocked = old_set; recalc_sigpending(current); spin_unlock_irqrestore(&current->sigmask_lock, flags); } @@ -235,7 +223,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) bufp = argp->buf; if (argp->len < 5) - goto dropit; + goto err_short_len; dir = ntohl(*bufp++); vers = ntohl(*bufp++); @@ -244,10 +232,8 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) svc_putlong(resp, xdr_one); /* REPLY */ svc_putlong(resp, xdr_zero); /* ACCEPT */ - if (dir != 0) { /* direction != CALL */ - serv->sv_stats->rpcbadfmt++; - goto dropit; /* drop request */ - } + if (dir != 0) /* direction != CALL */ + goto err_bad_dir; if (vers != 2) /* RPC version number */ goto err_bad_rpc; @@ -281,7 +267,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) procp = versp->vs_proc + proc; if (proc >= versp->vs_nproc || !procp->pc_func) - goto err_unknown; + goto err_bad_proc; rqstp->rq_server = serv; rqstp->rq_procinfo = procp; @@ -329,21 +315,36 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) if (procp->pc_release) procp->pc_release(rqstp, NULL, rqstp->rq_resp); - if (procp->pc_encode != NULL) - return svc_send(rqstp); + if (procp->pc_encode == NULL) + goto dropit; +sendit: + return svc_send(rqstp); dropit: dprintk("svc: svc_process dropit\n"); svc_drop(rqstp); return 0; +err_short_len: +#ifdef RPC_PARANOIA + printk("svc: short len %d, dropping request\n", argp->len); +#endif + goto dropit; /* drop request */ + +err_bad_dir: +#ifdef RPC_PARANOIA + printk("svc: bad direction %d, dropping 
request\n", dir); +#endif + serv->sv_stats->rpcbadfmt++; + goto dropit; /* drop request */ + err_bad_rpc: serv->sv_stats->rpcbadfmt++; resp->buf[-1] = xdr_one; /* REJECT */ svc_putlong(resp, xdr_zero); /* RPC_MISMATCH */ svc_putlong(resp, xdr_two); /* Only RPCv2 supported */ svc_putlong(resp, xdr_two); - goto error; + goto sendit; err_bad_auth: dprintk("svc: authentication failed (%ld)\n", ntohl(auth_stat)); @@ -351,7 +352,7 @@ err_bad_auth: resp->buf[-1] = xdr_one; /* REJECT */ svc_putlong(resp, xdr_one); /* AUTH_ERROR */ svc_putlong(resp, auth_stat); /* status */ - goto error; + goto sendit; err_bad_prog: #ifdef RPC_PARANOIA @@ -359,7 +360,7 @@ err_bad_prog: #endif serv->sv_stats->rpcbadfmt++; svc_putlong(resp, rpc_prog_unavail); - goto error; + goto sendit; err_bad_vers: #ifdef RPC_PARANOIA @@ -369,15 +370,15 @@ err_bad_vers: svc_putlong(resp, rpc_prog_mismatch); svc_putlong(resp, htonl(progp->pg_lovers)); svc_putlong(resp, htonl(progp->pg_hivers)); - goto error; + goto sendit; -err_unknown: +err_bad_proc: #ifdef RPC_PARANOIA printk("svc: unknown procedure (%d)\n", proc); #endif serv->sv_stats->rpcbadfmt++; svc_putlong(resp, rpc_proc_unavail); - goto error; + goto sendit; err_garbage: #ifdef RPC_PARANOIA @@ -385,7 +386,5 @@ err_garbage: #endif serv->sv_stats->rpcbadfmt++; svc_putlong(resp, rpc_garbage_args); - -error: - return svc_send(rqstp); + goto sendit; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2701a8398..cec276857 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -131,10 +131,10 @@ svc_sock_dequeue(struct svc_serv *serv) { struct svc_sock *svsk; - disable_bh(NET_BH); + start_bh_atomic(); if ((svsk = serv->sv_sockets) != NULL) rpc_remove_list(&serv->sv_sockets, svsk); - enable_bh(NET_BH); + end_bh_atomic(); if (svsk) { dprintk("svc: socket %p dequeued\n", svsk->sk_sk); @@ -151,7 +151,7 @@ svc_sock_dequeue(struct svc_serv *serv) static inline void svc_sock_received(struct svc_sock *svsk, int count) { - disable_bh(NET_BH); 
+ start_bh_atomic(); if ((svsk->sk_data -= count) < 0) { printk(KERN_NOTICE "svc: sk_data negative!\n"); svsk->sk_data = 0; @@ -163,7 +163,7 @@ svc_sock_received(struct svc_sock *svsk, int count) svsk->sk_sk); svc_sock_enqueue(svsk); } - enable_bh(NET_BH); + end_bh_atomic(); } /* @@ -172,7 +172,7 @@ svc_sock_received(struct svc_sock *svsk, int count) static inline void svc_sock_accepted(struct svc_sock *svsk) { - disable_bh(NET_BH); + start_bh_atomic(); svsk->sk_busy = 0; svsk->sk_conn--; if (svsk->sk_conn || svsk->sk_data || svsk->sk_close) { @@ -180,7 +180,7 @@ svc_sock_accepted(struct svc_sock *svsk) svsk->sk_sk); svc_sock_enqueue(svsk); } - enable_bh(NET_BH); + end_bh_atomic(); } /* @@ -739,9 +739,9 @@ again: if (signalled()) return -EINTR; - disable_bh(NET_BH); + start_bh_atomic(); if ((svsk = svc_sock_dequeue(serv)) != NULL) { - enable_bh(NET_BH); + end_bh_atomic(); rqstp->rq_sock = svsk; svsk->sk_inuse++; /* N.B. where is this decremented? */ } else { @@ -756,7 +756,7 @@ again: */ current->state = TASK_INTERRUPTIBLE; add_wait_queue(&rqstp->rq_wait, &wait); - enable_bh(NET_BH); + end_bh_atomic(); schedule(); if (!(svsk = rqstp->rq_sock)) { @@ -886,7 +886,7 @@ if (svsk->sk_sk == NULL) /* * Create socket for RPC service. 
*/ -int +static int svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) { struct svc_sock *svsk; @@ -969,3 +969,19 @@ svc_delete_socket(struct svc_sock *svsk) /* svsk->sk_server = NULL; */ } } + +/* + * Make a socket for nfsd and lockd + */ +int +svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) +{ + struct sockaddr_in sin; + + dprintk("svc: creating socket proto = %d\n", protocol); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = htons(port); + return svc_create_socket(serv, protocol, &sin); +} + diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 859d55853..a48e9c1ad 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -13,15 +13,11 @@ #include <linux/ctype.h> #include <linux/fs.h> #include <linux/sysctl.h> -#if LINUX_VERSION_CODE >= 0x020100 + #include <asm/uaccess.h> -#else -# include <linux/mm.h> -# define copy_from_user memcpy_fromfs -# define copy_to_user memcpy_tofs -# define access_ok !verify_area -#endif #include <linux/sunrpc/types.h> +#include <linux/sunrpc/sched.h> +#include <linux/sunrpc/stats.h> /* * Declare the debug flags here @@ -39,17 +35,23 @@ static ctl_table sunrpc_table[]; void rpc_register_sysctl(void) { - if (sunrpc_table_header) - return; - sunrpc_table_header = register_sysctl_table(sunrpc_table, 1); + if (!sunrpc_table_header) { + sunrpc_table_header = register_sysctl_table(sunrpc_table, 1); +#ifdef MODULE + if (sunrpc_table[0].de) + sunrpc_table[0].de->fill_inode = rpc_modcount; +#endif + } + } void rpc_unregister_sysctl(void) { - if (!sunrpc_table_header) - return; - unregister_sysctl_table(sunrpc_table_header); + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } } int @@ -93,6 +95,10 @@ proc_dodebug(ctl_table *table, int write, struct file *file, while (left && isspace(*p)) left--, p++; *(unsigned int *) table->data = value; + /* Display the RPC tasks on writing to rpc_debug */ + 
if (table->ctl_name == CTL_RPCDEBUG) { + rpc_show_tasks(); + } } else { if (!access_ok(VERIFY_WRITE, buffer, left)) return -EFAULT; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index c76566399..f614cfa33 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -48,11 +48,11 @@ #include <linux/udp.h> #include <linux/unistd.h> #include <linux/sunrpc/clnt.h> +#include <linux/file.h> + #include <net/sock.h> -#if LINUX_VERSION_CODE >= 0x020100 #include <asm/uaccess.h> -#endif #define SOCK_HAS_USER_DATA @@ -319,7 +319,7 @@ xprt_close(struct rpc_xprt *xprt) sk->write_space = xprt->old_write_space; if (xprt->file) - close_fp(xprt->file); + fput(xprt->file); else sock_release(xprt->sock); } @@ -397,14 +397,14 @@ xprt_reconnect(struct rpc_task *task) task->tk_pid, status, xprt->connected); task->tk_timeout = 60 * HZ; - disable_bh(NET_BH); + start_bh_atomic(); if (!xprt->connected) { rpc_sleep_on(&xprt->reconn, task, xprt_reconn_status, xprt_reconn_timeout); - enable_bh(NET_BH); + end_bh_atomic(); return; } - enable_bh(NET_BH); + end_bh_atomic(); } xprt->connecting = 0; @@ -870,10 +870,10 @@ xprt_transmit(struct rpc_task *task) /* For fast networks/servers we have to put the request on * the pending list now: */ - disable_bh(NET_BH); + start_bh_atomic(); rpc_add_wait_queue(&xprt->pending, task); task->tk_callback = NULL; - enable_bh(NET_BH); + end_bh_atomic(); /* Continue transmitting the packet/record. 
We must be careful * to cope with writespace callbacks arriving _after_ we have @@ -891,16 +891,16 @@ xprt_transmit(struct rpc_task *task) task->tk_pid, xprt->snd_buf.io_len, req->rq_slen); task->tk_status = 0; - disable_bh(NET_BH); + start_bh_atomic(); if (!xprt->write_space) { /* Remove from pending */ rpc_remove_wait_queue(task); rpc_sleep_on(&xprt->sending, task, xprt_transmit_status, NULL); - enable_bh(NET_BH); + end_bh_atomic(); return; } - enable_bh(NET_BH); + end_bh_atomic(); } } @@ -943,12 +943,12 @@ xprt_receive(struct rpc_task *task) */ task->tk_timeout = req->rq_timeout.to_current; - disable_bh(NET_BH); + start_bh_atomic(); if (!req->rq_gotit) { rpc_sleep_on(&xprt->pending, task, xprt_receive_status, xprt_timer); } - enable_bh(NET_BH); + end_bh_atomic(); dprintk("RPC: %4d xprt_receive returns %d\n", task->tk_pid, task->tk_status); @@ -1079,7 +1079,7 @@ xprt_release(struct rpc_task *task) dprintk("RPC: %4d release request %p\n", task->tk_pid, req); /* remove slot from queue of pending */ - disable_bh(NET_BH); + start_bh_atomic(); if (task->tk_rpcwait) { printk("RPC: task of released request still queued!\n"); #ifdef RPC_DEBUG @@ -1088,7 +1088,7 @@ xprt_release(struct rpc_task *task) rpc_del_timer(task); rpc_remove_wait_queue(task); } - enable_bh(NET_BH); + end_bh_atomic(); /* Decrease congestion value. If congestion threshold is not yet * reached, pass on the request slot. 
diff --git a/net/unix/.cvsignore b/net/unix/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/unix/.cvsignore +++ b/net/unix/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1a9baa549..2fbce16fe 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -286,14 +286,14 @@ static void unix_destroy_socket(unix_socket *sk) { unix_socket *osk=skb->sk; osk->state=TCP_CLOSE; - kfree_skb(skb, FREE_WRITE); /* Now surplus - free the skb first before the socket */ + kfree_skb(skb); /* Now surplus - free the skb first before the socket */ osk->state_change(osk); /* So the connect wakes and cleans up (if any) */ /* osk will be destroyed when it gets to close or the timer fires */ } else { /* passed fds are erased in the kfree_skb hook */ - kfree_skb(skb,FREE_WRITE); + kfree_skb(skb); } } @@ -695,7 +695,7 @@ static int unix_stream_connect1(struct socket *sock, struct msghdr *msg, other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err); if(other==NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return err; } other->ack_backlog++; @@ -819,7 +819,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) { tsk=skb->sk; tsk->state_change(tsk); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); continue; } break; @@ -838,7 +838,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) unix_lock(newsk); /* Swap lock over */ unix_unlock(sk); /* Locked to child socket not master */ unix_lock(tsk); /* Back lock */ - kfree_skb(skb, FREE_WRITE); /* The buffer is just used as a tag */ + kfree_skb(skb); /* The buffer is just used as a tag */ tsk->state_change(tsk); /* Wake up any sleeping connect */ sock_wake_async(tsk->socket, 0); return 0; @@ -958,7 +958,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, unix_peer(sk)=NULL; other = NULL; if (sunaddr == NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -ECONNRESET; } } @@ 
-968,13 +968,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, if (other==NULL) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return err; } if (!unix_may_send(sk, other)) { unix_unlock(other); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -EINVAL; } } @@ -1033,8 +1033,9 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, size=len-sent; - if (size>(sk->sndbuf-sizeof(struct sk_buff))/2) /* Keep two messages in the pipe so it schedules better */ - size=(sk->sndbuf-sizeof(struct sk_buff))/2; + /* Keep two messages in the pipe so it schedules better */ + if (size > (sk->sndbuf - sizeof(struct sk_buff)) / 2) + size = (sk->sndbuf - sizeof(struct sk_buff)) / 2; /* * Keep to page sized kmalloc()'s as various people @@ -1056,7 +1057,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, if (skb==NULL) { if (sent) - return sent; + goto out; return err; } @@ -1074,15 +1075,16 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, if (scm->fp) unix_attach_fds(scm, skb); + /* N.B. 
this could fail with -EFAULT */ memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size); other=unix_peer(sk); if (other->dead || (sk->shutdown & SEND_SHUTDOWN)) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if(sent) - return sent; + goto out; send_sig(SIGPIPE,current,0); return -EPIPE; } @@ -1091,6 +1093,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, other->data_ready(other,size); sent+=size; } +out: return sent; } @@ -1121,20 +1124,20 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size, msg->msg_namelen = 0; - skb=skb_recv_datagram(sk, flags, noblock, &err); - if(skb==NULL) - return err; + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; if (msg->msg_name) { + msg->msg_namelen = sizeof(short); if (skb->sk->protinfo.af_unix.addr) { - memcpy(msg->msg_name, skb->sk->protinfo.af_unix.addr->name, - skb->sk->protinfo.af_unix.addr->len); msg->msg_namelen=skb->sk->protinfo.af_unix.addr->len; + memcpy(msg->msg_name, + skb->sk->protinfo.af_unix.addr->name, + skb->sk->protinfo.af_unix.addr->len); } - else - msg->msg_namelen=sizeof(short); } if (size > skb->len) @@ -1142,8 +1145,9 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size, else if (size < skb->len) msg->msg_flags |= MSG_TRUNC; - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size)) - return -EFAULT; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size); + if (err) + goto out_free; scm->creds = *UNIXCREDS(skb); @@ -1169,8 +1173,12 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size, if (UNIXCB(skb).fp) scm->fp = scm_fp_dup(UNIXCB(skb).fp); } + err = size; + +out_free: skb_free_datagram(sk,skb); - return size; +out: + return err; } @@ -1189,7 +1197,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size if (flags&MSG_OOB) return -EOPNOTSUPP; - if(flags&MSG_WAITALL) + if (flags&MSG_WAITALL) target = size; @@ -1245,18 
+1253,19 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size /* Copy address just once */ if (sunaddr) { + msg->msg_namelen = sizeof(short); if (skb->sk->protinfo.af_unix.addr) { - memcpy(sunaddr, skb->sk->protinfo.af_unix.addr->name, - skb->sk->protinfo.af_unix.addr->len); msg->msg_namelen=skb->sk->protinfo.af_unix.addr->len; + memcpy(sunaddr, + skb->sk->protinfo.af_unix.addr->name, + skb->sk->protinfo.af_unix.addr->len); } - else - msg->msg_namelen=sizeof(short); sunaddr = NULL; } chunk = min(skb->len, size); + /* N.B. This could fail with -EFAULT */ memcpy_toiovec(msg->msg_iov, skb->data, chunk); copied += chunk; size -= chunk; @@ -1280,7 +1289,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size break; } - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if (scm->fp) break; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d68f018fd..703fdd41e 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -44,11 +44,13 @@ #include <linux/malloc.h> #include <linux/skbuff.h> #include <linux/netdevice.h> +#include <linux/file.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> + #include <net/sock.h> #include <net/tcp.h> #include <net/af_unix.h> -#include <linux/proc_fs.h> -#include <linux/vmalloc.h> #include <net/scm.h> /* Internal data structures and random procedures: */ @@ -275,7 +277,7 @@ tail: */ if(s->socket && s->socket->file && s->socket->file->f_count) - close_fp(s->socket->file); + fput(s->socket->file); } else s->protinfo.af_unix.marksweep&=~MARKED; /* unmark everything for next collection */ diff --git a/net/wanrouter/.cvsignore b/net/wanrouter/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/wanrouter/.cvsignore +++ b/net/wanrouter/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 66b99dedc..f92ac29bb 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -18,6 +18,7 @@ * as published by the 
Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * ============================================================================ +* Oct 15, 1997 Farhan Thawar changed wan_encapsulate to add a pad byte of 0 * Jun 27, 1997 Alan Cox realigned with vendor code * Jan 16, 1997 Gene Kozin router_devlist made public * Jan 31, 1997 Alan Cox Hacked it about a bit for 2.1 @@ -26,7 +27,6 @@ #include <linux/stddef.h> /* offsetof(), etc. */ #include <linux/errno.h> /* return codes */ -#include <linux/config.h> /* OS configuration options */ #include <linux/kernel.h> #include <linux/module.h> /* support for loadable modules */ #include <linux/malloc.h> /* kmalloc(), kfree() */ @@ -79,8 +79,10 @@ static int delete_interface (wan_device_t* wandev, char* name, int forse); * Global Data */ +#ifdef MODULE static char fullname[] = "WAN Router"; static char copyright[] = "(c) 1995-1997 Sangoma Technologies Inc."; +#endif static char modname[] = ROUTER_NAME; /* short module name */ wan_device_t * router_devlist = NULL; /* list of registered devices */ static int devcnt = 0; @@ -90,7 +92,9 @@ static int devcnt = 0; */ static unsigned char oui_ether[] = { 0x00, 0x00, 0x00 }; +#if 0 static unsigned char oui_802_2[] = { 0x00, 0x80, 0xC2 }; +#endif #ifdef MODULE @@ -279,9 +283,10 @@ int wanrouter_encapsulate (struct sk_buff* skb, struct device* dev) case ETH_P_IPX: /* SNAP encapsulation */ case ETH_P_ARP: - hdr_len += 6; + hdr_len += 7; skb_push(skb, 6); - skb->data[0] = NLPID_SNAP; + skb->data[0] = 0; + skb->data[1] = NLPID_SNAP; memcpy(&skb->data[1], oui_ether, sizeof(oui_ether)); *((unsigned short*)&skb->data[4]) = htons(skb->protocol); break; diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c index 937c50076..088487077 100644 --- a/net/wanrouter/wanproc.c +++ b/net/wanrouter/wanproc.c @@ -56,7 +56,7 @@ typedef struct wan_stat_entry /****** Function Prototypes *************************************************/ /* Proc filesystem 
interface */ -static int router_proc_perms (struct inode*, int); +static int router_proc_perms(struct inode *, int); static ssize_t router_proc_read(struct file* file, char* buf, size_t count, loff_t *ppos); /* Methods for preparing data for reading proc entries */ @@ -118,6 +118,7 @@ static struct inode_operations router_inode = NULL, /* rmdir */ NULL, /* mknod */ NULL, /* rename */ + NULL, /* follow link */ NULL, /* readlink */ NULL, /* readpage */ NULL, /* writepage */ @@ -157,6 +158,7 @@ static struct inode_operations wandev_inode = NULL, /* mknod */ NULL, /* rename */ NULL, /* readlink */ + NULL, /* follow_link */ NULL, /* readpage */ NULL, /* writepage */ NULL, /* bmap */ @@ -339,7 +341,7 @@ static int router_proc_perms (struct inode* inode, int op) static ssize_t router_proc_read(struct file* file, char* buf, size_t count, loff_t *ppos) { - struct inode *inode; + struct inode *inode = file->f_dentry->d_inode; struct proc_dir_entry* dent; char* page; int pos, offs, len; diff --git a/net/x25/.cvsignore b/net/x25/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/x25/.cvsignore +++ b/net/x25/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index bc473e317..7e3c9cae2 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -323,7 +323,7 @@ void x25_destroy_socket(struct sock *sk) /* Not static as it's used by the timer skb->sk->protinfo.x25->state = X25_STATE_0; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } if (atomic_read(&sk->wmem_alloc) != 0 || atomic_read(&sk->rmem_alloc) != 0) { @@ -714,11 +714,13 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags) newsk = skb->sk; newsk->pair = NULL; + newsk->socket = newsock; + newsk->sleep = &newsock->wait; sti(); /* Now attach up the new socket */ skb->sk = NULL; - kfree_skb(skb, FREE_READ); + kfree_skb(skb); sk->ack_backlog--; newsock->sk = newsk; @@ -952,7 +954,7 @@ static int x25_sendmsg(struct socket *sock, struct msghdr *msg, int len, 
struct SOCK_DEBUG(sk, "x25_sendmsg: Transmitting buffer\n"); if (sk->state != TCP_ESTABLISHED) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return -ENOTCONN; } diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index c8ffb33ef..16fc3677d 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -54,7 +54,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *neigh) unsigned int lci; if (call_in_firewall(PF_X25, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -90,7 +90,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *neigh) /* x25_transmit_clear_request(neigh, lci, 0x0D); */ - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -106,7 +106,7 @@ int x25_lapb_receive_frame(struct sk_buff *skb, struct device *dev, struct packe */ if ((neigh = x25_get_neigh(dev)) == NULL) { printk(KERN_DEBUG "X.25: unknown neighbour - %s\n", dev->name); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -117,20 +117,20 @@ int x25_lapb_receive_frame(struct sk_buff *skb, struct device *dev, struct packe case 0x01: x25_link_established(neigh); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; case 0x02: x25_link_terminated(neigh); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; case 0x03: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; default: - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } } @@ -146,7 +146,7 @@ int x25_llc_receive_frame(struct sk_buff *skb, struct device *dev, struct packet */ if ((neigh = x25_get_neigh(dev)) == NULL) { printk(KERN_DEBUG "X.25: unknown_neighbour - %s\n", dev->name); - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } @@ -178,7 +178,6 @@ void x25_establish_link(struct x25_neigh *neigh) skb->protocol = htons(ETH_P_X25); skb->dev = neigh->dev; - skb->arp = 1; dev_queue_xmit(skb); } @@ -208,7 +207,6 @@ void x25_terminate_link(struct x25_neigh *neigh) skb->protocol = htons(ETH_P_X25); 
skb->dev = neigh->dev; - skb->arp = 1; dev_queue_xmit(skb); } @@ -225,17 +223,16 @@ void x25_send_frame(struct sk_buff *skb, struct x25_neigh *neigh) #if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) case ARPHRD_ETHER: - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; #endif default: - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } skb->protocol = htons(ETH_P_X25); skb->dev = neigh->dev; - skb->arp = 1; dev_queue_xmit(skb); } diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 1c4cb3bc7..b9a66103c 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -64,12 +64,12 @@ static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) skbo = skb_dequeue(&sk->protinfo.x25->fragment_queue); memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); - kfree_skb(skbo, FREE_READ); + kfree_skb(skbo); while ((skbo = skb_dequeue(&sk->protinfo.x25->fragment_queue)) != NULL) { skb_pull(skbo, (sk->protinfo.x25->neighbour->extended) ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN); memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); - kfree_skb(skbo, FREE_READ); + kfree_skb(skbo); } sk->protinfo.x25->fraglen = 0; diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 1742d802f..f27fa4f4a 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -225,7 +225,7 @@ void x25_transmit_clear_request(struct x25_neigh *neigh, unsigned int lci, unsig void x25_transmit_link(struct sk_buff *skb, struct x25_neigh *neigh) { if (call_fw_firewall(PF_X25, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return; } @@ -304,7 +304,7 @@ static void x25_remove_neigh(struct x25_neigh *x25_neigh) struct sk_buff *skb; while ((skb = skb_dequeue(&x25_neigh->queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); x25_stop_t20timer(x25_neigh); diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c index 5283092a1..24fdf4d47 100644 --- a/net/x25/x25_out.c +++ b/net/x25/x25_out.c @@ -98,7 +98,7 @@ void x25_output(struct sock 
*sk, struct sk_buff *skb) skb_queue_tail(&sk->write_queue, skbn); } - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } else { skb_queue_tail(&sk->write_queue, skb); } diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 52e5be0cb..8b055e40e 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -48,19 +48,19 @@ void x25_clear_queues(struct sock *sk) struct sk_buff *skb; while ((skb = skb_dequeue(&sk->write_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&sk->protinfo.x25->ack_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&sk->protinfo.x25->interrupt_in_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); while ((skb = skb_dequeue(&sk->protinfo.x25->interrupt_out_queue)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&sk->protinfo.x25->fragment_queue)) != NULL) - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } @@ -82,7 +82,7 @@ void x25_frames_acked(struct sock *sk, unsigned short nr) if (sk->protinfo.x25->va != nr) { while (skb_peek(&sk->protinfo.x25->ack_queue) != NULL && sk->protinfo.x25->va != nr) { skb = skb_dequeue(&sk->protinfo.x25->ack_queue); - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); sk->protinfo.x25->va = (sk->protinfo.x25->va + 1) % modulus; } } |