path: root/net/sched
author	Ralf Baechle <ralf@linux-mips.org>	1997-12-16 06:06:25 +0000
committer	Ralf Baechle <ralf@linux-mips.org>	1997-12-16 06:06:25 +0000
commit	aa944aa3453e47706685bc562711a9e87375941e (patch)
tree	8fb37a65f205a90412917ca2b91c429263ef1790 /net/sched
parent	967c65a99059fd459b956c1588ce0ba227912c4e (diff)
Merge with Linux 2.1.72, part 2.
The new signal code, with the exception of the code for the rt signals. The definitions in <asm/siginfo.h> and <asm/ucontext.h> are currently just stolen from the Alpha and will need to be overhauled.
Diffstat (limited to 'net/sched')
-rw-r--r--	net/sched/.cvsignore	1
-rw-r--r--	net/sched/Makefile	71
-rw-r--r--	net/sched/sch_cbq.c	839
-rw-r--r--	net/sched/sch_csz.c	832
-rw-r--r--	net/sched/sch_fifo.c	179
-rw-r--r--	net/sched/sch_generic.c	541
-rw-r--r--	net/sched/sch_prio.c	146
-rw-r--r--	net/sched/sch_red.c	303
-rw-r--r--	net/sched/sch_sfq.c	333
-rw-r--r--	net/sched/sch_tbf.c	252
10 files changed, 3497 insertions, 0 deletions
diff --git a/net/sched/.cvsignore b/net/sched/.cvsignore
new file mode 100644
index 000000000..4671378ae
--- /dev/null
+++ b/net/sched/.cvsignore
@@ -0,0 +1 @@
+.depend
diff --git a/net/sched/Makefile b/net/sched/Makefile
new file mode 100644
index 000000000..cbb6704c1
--- /dev/null
+++ b/net/sched/Makefile
@@ -0,0 +1,71 @@
+#
+# Makefile for the Linux Traffic Control Unit.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+# Note 2! The CFLAGS definition is now in the main makefile...
+
+O_TARGET := sched.o
+
+O_OBJS := sch_generic.o
+
+ifeq ($(CONFIG_NET_SCH_CBQ), y)
+O_OBJS += sch_cbq.o
+else
+ ifeq ($(CONFIG_NET_SCH_CBQ), m)
+ M_OBJS += sch_cbq.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_CSZ), y)
+O_OBJS += sch_csz.o
+else
+ ifeq ($(CONFIG_NET_SCH_CSZ), m)
+ M_OBJS += sch_csz.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_SFQ), y)
+O_OBJS += sch_sfq.o
+else
+ ifeq ($(CONFIG_NET_SCH_SFQ), m)
+ M_OBJS += sch_sfq.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_RED), y)
+O_OBJS += sch_red.o
+else
+ ifeq ($(CONFIG_NET_SCH_RED), m)
+ M_OBJS += sch_red.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_TBF), y)
+O_OBJS += sch_tbf.o
+else
+ ifeq ($(CONFIG_NET_SCH_TBF), m)
+ M_OBJS += sch_tbf.o
+ endif
+endif
+
+
+ifeq ($(CONFIG_NET_SCH_PFIFO), y)
+O_OBJS += sch_fifo.o
+else
+ ifeq ($(CONFIG_NET_SCH_PFIFO), m)
+ M_OBJS += sch_fifo.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_PRIO), y)
+O_OBJS += sch_prio.o
+else
+ ifeq ($(CONFIG_NET_SCH_PRIO), m)
+ M_OBJS += sch_prio.o
+ endif
+endif
+
+include $(TOPDIR)/Rules.make
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
new file mode 100644
index 000000000..626afe555
--- /dev/null
+++ b/net/sched/sch_cbq.c
@@ -0,0 +1,839 @@
+/*
+ * net/sched/sch_cbq.c Class-Based Queueing discipline.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+/* Class-Based Queueing (CBQ) algorithm.
+ =======================================
+
+ Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
+ Management Models for Packet Networks",
+ IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
+
+	    [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
+
+ [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
+ Parameters", 1996
+
+   Algorithm skeleton is taken from the NS simulator cbq.cc.
+
+ -----------------------------------------------------------------------
+
+ Differences from NS version.
+
+   --- The WRR algorithm is different. Our version looks more reasonable :-)
+   and fair when quanta are allowed to be less than MTU.
+
+   --- cl->avgidle is REALLY limited from below by cl->minidle.
+   It seems this was a bug in NS.
+
+ --- Purely lexical change: "depth" -> "level", "maxdepth" -> "toplevel".
+   When depth increases, we expect that the thing becomes lower, doesn't it? :-)
+ Besides that, "depth" word is semantically overloaded ---
+ "token bucket depth", "sfq depth"... Besides that, the algorithm
+ was called "top-LEVEL sharing".
+
+ PROBLEM.
+
+   --- Linux has no EOI event at the moment, so we cannot
+   estimate the true class idle time. Three workarounds are possible;
+   all of them have drawbacks:
+
+ 1. (as now) Consider the next dequeue event as sign that
+ previous packet is finished. It is wrong because of ping-pong
+ buffers, but on permanently loaded link it is true.
+   2. (NS approach) Use skb->len/"physical bandwidth" as the link busy
+   time estimate. Even more wrong: e.g. on Ethernet the real busy time is
+   much higher because of collisions.
+   3. (seems the most clever) Split net bh into two parts:
+   NETRX_BH (for received packets) and preserve NET_BH for the transmitter.
+   It will not require driver changes (the NETRX_BH flag will be set
+   in netif_rx), but will allow us to trace EOIs more precisely
+   and will save useless checks in net_bh. Besides that, we will
+   have to eliminate random calls to hard_start_xmit with the dev->tbusy
+   flag set (done) and to drop failure_q --- i.e. if !dev->tbusy then
+   hard_start_xmit MUST succeed; failed packets will be dropped on the floor.
+*/
+
+#define CBQ_TOPLEVEL_SHARING
+/* #define CBQ_NO_TRICKERY */
+
+#define CBQ_CLASSIFIER(skb, q) ((q)->fallback_class)
+
+struct cbq_class
+{
+/* Parameters */
+ int priority; /* priority */
+#ifdef CBQ_TOPLEVEL_SHARING
+ int level; /* level of the class in hierarchy:
+ 0 for leaf classes, and maximal
+					   level of children + 1 for nodes.
+ */
+#endif
+
+	long		maxidle;	/* Class parameters: see below. */
+ long minidle;
+ int filter_log;
+#ifndef CBQ_NO_TRICKERY
+ long extradelay;
+#endif
+
+ long quantum; /* Allotment per WRR round */
+ long rquantum; /* Relative allotment: see below */
+
+ int cell_log;
+ unsigned long L_tab[256];
+
+ struct Qdisc *qdisc; /* ptr to CBQ discipline */
+ struct cbq_class *root; /* Ptr to root class;
+					   the root is not necessarily unique.
+ */
+ struct cbq_class *parent; /* Ptr to parent in the class tree */
+ struct cbq_class *borrow; /* NULL if class is bandwidth limited;
+ parent otherwise */
+
+ struct Qdisc *q; /* Elementary queueing discipline */
+ struct cbq_class *next; /* next class in this priority band */
+
+ struct cbq_class *next_alive; /* next class with backlog in this priority band */
+
+/* Variables */
+ psched_time_t last;
+ psched_time_t undertime;
+ long avgidle;
+ long deficit; /* Saved deficit for WRR */
+ char awake; /* Class is in alive list */
+
+#if 0
+ void (*overlimit)(struct cbq_class *cl);
+#endif
+};
+
+#define L2T(cl,len) ((cl)->L_tab[(len)>>(cl)->cell_log])
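The L2T() macro above turns a packet length into a transmission time by indexing a 256-entry per-class table, with one cell per 2^cell_log bytes. The patch leaves filling L_tab to configuration, so the routine below is only an illustrative sketch; it assumes the table holds microseconds and that `rate` is given in bytes per second, neither of which is specified by this code.

/* Illustrative only: build a length -> transmission-time table so that
 * L2T(cl, len) yields roughly the microseconds needed to send len bytes
 * at `rate` bytes/sec.  Cell i covers lengths up to (i + 1) << cell_log
 * bytes; storing the cell's upper bound never under-charges a packet.
 * (32-bit overflow is ignored for brevity.)
 */
static void example_fill_L_tab(unsigned long *L_tab, int cell_log,
			       unsigned long rate)
{
	int i;

	for (i = 0; i < 256; i++) {
		unsigned long bytes = (unsigned long)(i + 1) << cell_log;

		L_tab[i] = (bytes * 1000000UL) / rate;
	}
}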
+
+struct cbq_sched_data
+{
+ struct cbq_class *classes[CBQ_MAXPRIO]; /* List of all classes */
+ int nclasses[CBQ_MAXPRIO];
+ unsigned quanta[CBQ_MAXPRIO];
+ unsigned mtu;
+ int cell_log;
+ unsigned long L_tab[256];
+ struct cbq_class *fallback_class;
+
+ unsigned activemask;
+ struct cbq_class *active[CBQ_MAXPRIO]; /* List of all classes
+ with backlog */
+ struct cbq_class *last_sent;
+ int last_sent_len;
+
+ psched_time_t now; /* Cached timestamp */
+
+	struct timer_list wd_timer;	/* Watchdog timer, started
+					   when CBQ has backlog
+					   but cannot transmit
+					   just now */
+ unsigned long wd_expires;
+#ifdef CBQ_TOPLEVEL_SHARING
+ struct cbq_class *borrowed;
+ int toplevel;
+#endif
+};
+
+/*
+ WRR quanta
+ ----------
+
+   cl->quantum is the amount added to the class allotment on every round.
+ cl->rquantum is "relative" quantum.
+
+ For real-time classes:
+
+ cl->quantum = (cl->rquantum*q->nclasses[prio]*q->mtu)/q->quanta[prio]
+
+ where q->quanta[prio] is sum of all rquanta for given priority.
+ cl->rquantum can be identified with absolute rate of the class
+ in arbitrary units (f.e. bytes/sec)
+
+ In this case, delay introduced by round-robin was estimated by
+ Sally Floyd [2] as:
+
+ D = q->nclasses*q->mtu/(bandwidth/2)
+
+   Note that D does not depend on the class rate (which is very bad),
+   but it is not much worse than the Gallager-Parekh estimate for CSZ,
+   C/R = q->mtu/rate, when the real-time classes have close rates.
+
+   For non-real-time classes this formula is not necessary,
+   so cl->quantum can be set to any reasonable nonzero value.
+   Apparently, it should be proportional to the class rate, if the
+   rate is not zero.
+*/
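A small worked instance of the quantum formula above, with arbitrary illustrative numbers:

/* Example: one priority band with three real-time classes whose rquantum
 * values are 10, 20 and 70 (arbitrary rate units), and q->mtu = 1500.
 * Then q->quanta[prio] = 100 and q->nclasses[prio] = 3, so
 *
 *	quantum = rquantum * 3 * 1500 / 100
 *
 * giving per-round allotments of 450, 900 and 3150 bytes.  The allotments
 * are proportional to the configured rates and sum to nclasses * mtu = 4500,
 * consistent with the round length used in the delay estimate D above.
 */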
+
+/*
+ maxidle, minidle, extradelay
+ ----------------------------
+
+   The CBQ estimator calculates the smoothed class idle time cl->avgidle,
+   considering the class as a virtual interface with the corresponding bandwidth.
+   When cl->avgidle wants to be less than zero, the class is overlimit.
+   When it is positive, the class is underlimit.
+
+   * maxidle bounds avgidle from above.
+     It controls the maximal length of burst in this class after
+     a long period of idle time. Burstiness of an active class
+     is controlled by the filter constant cl->filter_log,
+     but this number is related to burst length only indirectly.
+
+   * minidle is a negative number, normally set to zero.
+     Setting it to a nonzero value allows avgidle to drop
+     below zero, effectively penalizing the class when it is overlimit.
+     When the class load decreases, it will take some time for the
+     negative avgidle to rise back and put the class at its limit.
+     It should be set to zero for leaf classes.
+
+   * extradelay is a delay penalty applied when a class goes overlimit.
+     I believe this parameter is useless and confusing.
+     Setting it to a nonzero value forces the class to accumulate
+     its "idleness" for extradelay and then send a BURST of packets
+     until it goes overlimit again. Nonsense.
+
+ For details see [1] and [3].
+
+   Really, minidle and extradelay are irrelevant to the real scheduling
+   task. As I understand it, SF&VJ introduced them to experiment
+   with the CBQ simulator in attempts to fix the erratic behaviour
+   of the ancestor-only (and, partially, top-level) algorithm.
+
+ WARNING.
+
+   The user passes them measured in usecs, but cl->minidle,
+   cl->maxidle and cl->avgidle are scaled with cl->filter_log
+   inside the scheduler.
+*/
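The scaling described in the warning is what cbq_update() below implements; a minimal standalone sketch of the same update, with purely illustrative numbers (filter_log = 3, i.e. W = 1/8):

/* true_avgidle' = (1 - W) * true_avgidle + W * idle, with W = 2^-filter_log.
 * Storing avgidle = true_avgidle / W turns this into an integer-only update:
 */
static long example_avgidle_update(long avgidle, long idle, int filter_log)
{
	return avgidle + idle - (avgidle >> filter_log);
}

/* E.g. with filter_log = 3, avgidle = 800 (true value 100) and idle = 40:
 * new avgidle = 800 + 40 - 100 = 740, i.e. true value 92.5,
 * which is exactly (7/8) * 100 + (1/8) * 40.
 */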
+
+/*
+   A packet has just been enqueued to an empty class.
+   cbq_wakeup_class adds the class to the tail of the active class list
+   of its priority band.
+ */
+
+static __inline__ void cbq_wakeup_class(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
+ int prio = cl->priority;
+ struct cbq_class *cl_tail;
+
+ cl->awake = 1;
+
+ cl_tail = q->active[prio];
+ q->active[prio] = cl;
+
+ if (cl_tail != NULL) {
+ cl->next_alive = cl_tail->next_alive;
+ cl->deficit = 0;
+ } else {
+ cl->next_alive = cl;
+ q->activemask |= (1<<prio);
+ cl->deficit = cl->quantum;
+ }
+}
+
+static int
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct cbq_class *cl = CBQ_CLASSIFIER(skb, q);
+
+ if (cl->q->enqueue(skb, cl->q) == 1) {
+ sch->q.qlen++;
+
+#ifdef CBQ_TOPLEVEL_SHARING
+ if (q->toplevel > 0) {
+ psched_time_t now;
+ PSCHED_GET_TIME(now);
+ if (PSCHED_TLESS(cl->undertime, now))
+ q->toplevel = 0;
+ else if (q->toplevel > 1 && cl->borrow &&
+ PSCHED_TLESS(cl->borrow->undertime, now))
+ q->toplevel = 1;
+ }
+#endif
+ if (!cl->awake)
+ cbq_wakeup_class(cl);
+ return 1;
+ }
+ return 0;
+}
+
+static __inline__ void cbq_delay(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+ long delay;
+
+ delay = PSCHED_TDIFF(cl->undertime, q->now);
+ if (q->wd_expires == 0 || q->wd_expires - delay > 0)
+ q->wd_expires = delay;
+}
+
+static void cbq_watchdog(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc*)arg;
+ struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
+
+ q->wd_timer.expires = 0;
+ q->wd_timer.function = NULL;
+ qdisc_wakeup(sch->dev);
+}
+
+static __inline__ void
+cbq_update(struct cbq_sched_data *q)
+{
+ struct cbq_class *cl;
+
+ for (cl = q->last_sent; cl; cl = cl->parent) {
+ long avgidle = cl->avgidle;
+ long idle;
+
+ /*
+ (now - last) is total time between packet right edges.
+ (last_pktlen/rate) is "virtual" busy time, so that
+
+ idle = (now - last) - last_pktlen/rate
+ */
+
+ idle = PSCHED_TDIFF(q->now, cl->last)
+ - L2T(cl, q->last_sent_len);
+
+ /* true_avgidle := (1-W)*true_avgidle + W*idle,
+ where W=2^{-filter_log}. But cl->avgidle is scaled:
+ cl->avgidle == true_avgidle/W,
+ hence:
+ */
+ avgidle += idle - (avgidle>>cl->filter_log);
+
+ if (avgidle <= 0) {
+ /* Overlimit or at-limit */
+#ifdef CBQ_NO_TRICKERY
+ avgidle = 0;
+#else
+ if (avgidle < cl->minidle)
+ avgidle = cl->minidle;
+#endif
+
+ /* This line was missing in NS. */
+ cl->avgidle = avgidle;
+
+ /* Calculate expected time, when this class
+ will be allowed to send.
+ It will occur, when:
+ (1-W)*true_avgidle + W*delay = 0, i.e.
+ idle = (1/W - 1)*(-true_avgidle)
+ or
+ idle = (1 - W)*(-cl->avgidle);
+
+ That is not all.
+ We want to set undertime to the moment, when
+ the class is allowed to start next transmission i.e.
+ (undertime + next_pktlen/phys_bandwidth)
+ - now - next_pktlen/rate = idle
+ or
+ undertime = now + idle + next_pktlen/rate
+ - next_pktlen/phys_bandwidth
+
+ We do not know next packet length, but can
+ estimate it with average packet length
+ or current packet_length.
+ */
+
+ idle = (-avgidle) - ((-avgidle) >> cl->filter_log);
+			/* next_pktlen/rate - next_pktlen/phys_bandwidth,
+			   as in the derivation above. */
+			idle += L2T(cl, q->last_sent_len);
+			idle -= L2T(q, q->last_sent_len);
+ PSCHED_TADD2(q->now, idle, cl->undertime);
+#ifndef CBQ_NO_TRICKERY
+ /* Do not forget extra delay :-) */
+ PSCHED_TADD(cl->undertime, cl->extradelay);
+#endif
+ } else {
+ /* Underlimit */
+
+ PSCHED_SET_PASTPERFECT(cl->undertime);
+ if (avgidle > cl->maxidle)
+ cl->avgidle = cl->maxidle;
+ else
+ cl->avgidle = avgidle;
+ }
+ cl->last = q->now;
+ }
+
+#ifdef CBQ_TOPLEVEL_SHARING
+ cl = q->last_sent;
+
+ if (q->borrowed && q->toplevel >= q->borrowed->level) {
+ if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, q->borrowed->undertime))
+ q->toplevel = CBQ_MAXLEVEL;
+ else if (q->borrowed != cl)
+ q->toplevel = q->borrowed->level;
+ }
+#endif
+
+ q->last_sent = NULL;
+}
+
+static __inline__ int
+cbq_under_limit(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
+ struct cbq_class *this_cl = cl;
+
+ if (PSCHED_IS_PASTPERFECT(cl->undertime) || cl->parent == NULL)
+ return 1;
+
+ if (PSCHED_TLESS(cl->undertime, q->now)) {
+ q->borrowed = cl;
+ return 1;
+ }
+
+ while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
+ PSCHED_TLESS(q->now, cl->undertime)) {
+ cl = cl->borrow;
+ if (cl == NULL
+#ifdef CBQ_TOPLEVEL_SHARING
+ || cl->level > q->toplevel
+#endif
+ ) {
+#if 0
+ this_cl->overlimit(this_cl);
+#else
+ cbq_delay(q, this_cl);
+#endif
+ return 0;
+ }
+ }
+ q->borrowed = cl;
+ return 1;
+}
+
+static __inline__ struct sk_buff *
+cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct cbq_class *cl_tail, *cl_prev, *cl;
+ struct sk_buff *skb;
+ int deficit;
+
+ cl_tail = cl_prev = q->active[prio];
+ cl = cl_prev->next_alive;
+
+ do {
+ deficit = 0;
+
+ /* Start round */
+ do {
+ /* Class is empty */
+ if (cl->q->q.qlen == 0)
+ goto skip_class;
+
+ if (fallback) {
+ /* Fallback pass: all classes are overlimit;
+ we send from the first class that is allowed
+ to borrow.
+ */
+
+ if (cl->borrow == NULL)
+ goto skip_class;
+ } else {
+ /* Normal pass: check that class is under limit */
+ if (!cbq_under_limit(cl))
+ goto skip_class;
+ }
+
+ if (cl->deficit <= 0) {
+ /* Class exhausted its allotment per this
+ round.
+ */
+ deficit = 1;
+ goto next_class;
+ }
+
+ skb = cl->q->dequeue(cl->q);
+
+ /* Class did not give us any skb :-(
+ It could occur if cl->q == "tbf"
+ */
+ if (skb == NULL)
+ goto skip_class;
+
+ cl->deficit -= skb->len;
+ q->last_sent = cl;
+ q->last_sent_len = skb->len;
+
+ if (cl->deficit <= 0) {
+ q->active[prio] = cl;
+ cl = cl->next_alive;
+ cl->deficit += cl->quantum;
+ }
+ return skb;
+
+skip_class:
+ cl->deficit = 0;
+
+ if (cl->q->q.qlen == 0) {
+ /* Class is empty, declare it dead */
+ cl_prev->next_alive = cl->next_alive;
+ cl->awake = 0;
+
+ /* Did cl_tail point to it? */
+ if (cl == cl_tail) {
+ /* Repair it! */
+ cl_tail = cl_prev;
+
+ /* Was it the last class in this band? */
+ if (cl == cl_tail) {
+ /* Kill the band! */
+ q->active[prio] = NULL;
+ q->activemask &= ~(1<<prio);
+ return NULL;
+ }
+ }
+ }
+
+next_class:
+ cl_prev = cl;
+ cl = cl->next_alive;
+ cl->deficit += cl->quantum;
+ } while (cl_prev != cl_tail);
+ } while (deficit);
+
+ q->active[prio] = cl_prev;
+
+ return NULL;
+}
+
+static __inline__ struct sk_buff *
+cbq_dequeue_1(struct Qdisc *sch, int fallback)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct sk_buff *skb;
+ unsigned activemask;
+
+ activemask = q->activemask;
+ while (activemask) {
+ int prio = ffz(~activemask);
+ activemask &= ~(1<<prio);
+ skb = cbq_dequeue_prio(sch, prio, fallback);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+static struct sk_buff *
+cbq_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+
+ PSCHED_GET_TIME(q->now);
+
+ if (q->last_sent)
+ cbq_update(q);
+
+ q->wd_expires = 0;
+
+ skb = cbq_dequeue_1(sch, 0);
+ if (skb)
+ return skb;
+
+ /* All the classes are overlimit.
+ Search for overlimit class, which is allowed to borrow
+ and use it as fallback case.
+ */
+
+#ifdef CBQ_TOPLEVEL_SHARING
+ q->toplevel = CBQ_MAXLEVEL;
+#endif
+
+ skb = cbq_dequeue_1(sch, 1);
+ if (skb)
+ return skb;
+
+ /* No packets in scheduler or nobody wants to give them to us :-(
+ Sigh... start watchdog timer in the last case. */
+
+ if (sch->q.qlen && q->wd_expires) {
+ if (q->wd_timer.function)
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = cbq_watchdog;
+ q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires);
+ add_timer(&q->wd_timer);
+ }
+ return NULL;
+}
+
+/* CBQ class maintenance routines */
+
+static void cbq_adjust_levels(struct cbq_class *this)
+{
+ struct cbq_class *cl;
+
+ for (cl = this->parent; cl; cl = cl->parent) {
+ if (cl->level > this->level)
+ return;
+ cl->level = this->level + 1;
+ this = cl;
+ }
+}
+
+static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
+{
+ struct cbq_class *cl;
+
+ if (q->quanta[prio] == 0)
+ return;
+
+ for (cl = q->classes[prio]; cl; cl = cl->next) {
+ if (cl->rquantum)
+ cl->quantum = (cl->rquantum*q->mtu*q->nclasses[prio])/
+ q->quanta[prio];
+ }
+}
+
+static __inline__ int cbq_unlink_class(struct cbq_class *this)
+{
+ struct cbq_class *cl, **clp;
+ struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
+
+ for (clp = &q->classes[this->priority]; (cl = *clp) != NULL;
+ clp = &cl->next) {
+ if (cl == this) {
+ *clp = cl->next;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int cbq_prune(struct cbq_class *this)
+{
+ struct cbq_class *cl;
+ int prio = this->priority;
+ struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
+
+ qdisc_reset(this->q);
+
+ if (cbq_unlink_class(this))
+ return -ENOENT;
+
+ if (this->awake) {
+ struct cbq_class *cl_prev = q->active[prio];
+ do {
+ cl = cl_prev->next_alive;
+ if (cl == this) {
+ cl_prev->next_alive = cl->next_alive;
+
+				if (cl == q->active[prio]) {
+					/* Repoint the band tail to the previous
+					   class; if this was the only class,
+					   kill the band. */
+					q->active[prio] = cl_prev;
+					if (cl == cl_prev) {
+						q->active[prio] = NULL;
+						q->activemask &= ~(1<<prio);
+						break;
+					}
+				}
+
+ cl = cl->next_alive;
+ cl->deficit += cl->quantum;
+ break;
+ }
+ } while ((cl_prev = cl) != q->active[prio]);
+ }
+
+ --q->nclasses[prio];
+ if (this->rquantum) {
+ q->quanta[prio] -= this->rquantum;
+ cbq_normalize_quanta(q, prio);
+ }
+
+ if (q->fallback_class == this)
+ q->fallback_class = NULL;
+
+ this->parent = NULL;
+ this->borrow = NULL;
+ this->root = this;
+ this->qdisc = NULL;
+ return 0;
+}
+
+static int cbq_graft(struct cbq_class *this, struct cbq_class *parent)
+{
+ struct cbq_class *cl, **clp;
+ int prio = this->priority;
+ struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
+
+ qdisc_reset(this->q);
+
+
+ for (clp = &q->classes[prio]; (cl = *clp) != NULL; clp = &cl->next) {
+ if (cl == this)
+ return -EBUSY;
+ }
+
+	/* Link "this" (not the NULL scan cursor) into the class list. */
+	this->next = NULL;
+	*clp = this;
+
+	this->parent = parent;
+	this->borrow = parent;
+	this->root = parent ? parent->root : this;
+
+ ++q->nclasses[prio];
+ if (this->rquantum) {
+ q->quanta[prio] += this->rquantum;
+ cbq_normalize_quanta(q, prio);
+ }
+
+ cbq_adjust_levels(this);
+
+ return 0;
+}
+
+
+static void
+cbq_reset(struct Qdisc* sch)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct cbq_class *cl;
+ int prio;
+
+ q->activemask = 0;
+ q->last_sent = NULL;
+ if (q->wd_timer.function) {
+ del_timer(&q->wd_timer);
+ q->wd_timer.expires = 0;
+ q->wd_timer.function = NULL;
+ }
+#ifdef CBQ_TOPLEVEL_SHARING
+ q->toplevel = CBQ_MAXLEVEL;
+#endif
+
+ for (prio = 0; prio < CBQ_MAXPRIO; prio++) {
+ q->active[prio] = NULL;
+
+ for (cl = q->classes[prio]; cl; cl = cl->next) {
+ qdisc_reset(cl->q);
+
+ cl->next_alive = NULL;
+ PSCHED_SET_PASTPERFECT(cl->undertime);
+ cl->avgidle = 0;
+ cl->deficit = 0;
+ cl->awake = 0;
+ }
+ }
+}
+
+static void
+cbq_destroy(struct Qdisc* sch)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+	struct cbq_class *cl, *next;
+ int prio;
+
+ for (prio = 0; prio < CBQ_MAXPRIO; prio++) {
+		for (cl = q->classes[prio]; cl; cl = next) {
+			/* Remember the link before freeing the class. */
+			next = cl->next;
+			qdisc_destroy(cl->q);
+			kfree(cl);
+		}
+ }
+}
+
+static int cbq_control(struct Qdisc *sch, void *arg)
+{
+ struct cbq_sched_data *q;
+
+ q = (struct cbq_sched_data *)sch->data;
+
+ /* Do attachment here. It is the last thing to do. */
+
+ return -EINVAL;
+}
+
+static int cbq_init(struct Qdisc *sch, void *arg)
+{
+ struct cbq_sched_data *q;
+ struct cbqctl *ctl = (struct cbqctl*)arg;
+
+ q = (struct cbq_sched_data *)sch->data;
+ init_timer(&q->wd_timer);
+ q->wd_timer.data = (unsigned long)sch;
+#ifdef CBQ_TOPLEVEL_SHARING
+ q->toplevel = CBQ_MAXLEVEL;
+#endif
+
+ return 0;
+}
+
+
+struct Qdisc_ops cbq_ops =
+{
+ NULL,
+ "cbq",
+ 0,
+ sizeof(struct cbq_sched_data),
+ cbq_enqueue,
+ cbq_dequeue,
+ cbq_reset,
+ cbq_destroy,
+ cbq_init,
+ cbq_control,
+};
+
+#ifdef MODULE
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&cbq_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_csz.c b/net/sched/sch_csz.c
new file mode 100644
index 000000000..dbc05d31b
--- /dev/null
+++ b/net/sched/sch_csz.c
@@ -0,0 +1,832 @@
+/*
+ * net/sched/sch_csz.c Clark-Shenker-Zhang scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Clark-Shenker-Zhang algorithm.
+ =======================================
+
+ SOURCE.
+
+ David D. Clark, Scott Shenker and Lixia Zhang
+ "Supporting Real-Time Applications in an Integrated Services Packet
+ Network: Architecture and Mechanism".
+
+ CBQ presents a flexible universal algorithm for packet scheduling,
+ but it has pretty poor delay characteristics.
+ Round-robin scheduling and link-sharing goals
+	apparently contradict the minimization of network delay and jitter.
+ Moreover, correct handling of predicted flows seems to be
+ impossible in CBQ.
+
+	CSZ presents a more precise but less flexible and less efficient
+	approach. As I understand it, the main idea is to create
+	a WFQ flow for each guaranteed service and to allocate
+	the rest of the bandwidth to the dummy flow-0. Flow-0 comprises
+	the predicted services and the best effort traffic;
+	it is handled by a priority scheduler with the highest
+	priority band allocated for predicted services, and the rest ---
+	to the best effort packets.
+
+	Note that in CSZ flows are NOT limited to their bandwidth.
+	It is assumed that the flow has passed admission control at the edge
+	of the QoS network and needs no further shaping. Any attempt to improve
+	the flow or to shape it to a token bucket at intermediate hops
+	will introduce undesired delays and raise jitter.
+
+	At the moment CSZ is the only scheduler that provides
+	real guaranteed service. Other schemes (including CBQ)
+	do not provide guaranteed delay and randomize jitter.
+	There exists a claim (Sally Floyd) that delay
+	can be estimated by an IntServ-compliant formula.
+	This result is formally true, but it is wrong in principle.
+	First, it ignores delays introduced by link sharing.
+	Second (and mainly), it limits bandwidth,
+	which is a fatal flaw.
+
+ ALGORITHM.
+
+ --- Notations.
+
+ $B$ is link bandwidth (bits/sec).
+
+	$I$ is the set of all flows, including flow $0$.
+	Every flow $a \in I$ has an associated bandwidth slice $r_a < 1$ and
+ $\sum_{a \in I} r_a = 1$.
+
+ --- Flow model.
+
+	Let $m_a$ be the number of backlogged bits in flow $a$.
+	The flow is {\em active} if $m_a > 0$.
+	This number is a discontinuous function of time;
+	when a packet $i$ arrives:
+ \[
+ m_a(t_i+0) - m_a(t_i-0) = L^i,
+ \]
+ where $L^i$ is the length of arrived packet.
+ The flow queue is drained continuously until $m_a == 0$:
+ \[
+ {d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}.
+ \]
+	I.e. flow rates are their allocated rates, proportionally
+	scaled to take all the available link bandwidth. Apparently,
+	this is not the only possible policy. E.g. CBQ classes
+	without borrowing would be modelled by:
+	\[
+	{d m_a \over dt} = - B r_a .
+	\]
+	More complicated hierarchical bandwidth allocation
+	policies are possible, but, unfortunately, the basic
+	flow equations have a simple solution only for proportional
+	scaling.
+
+ --- Departure times.
+
+	We calculate the time until the last bit of the packet will be sent:
+	\[
+	E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a },
+	\]
+	where $\delta_a(t)$ is the number of bits drained since $t_i$.
+	We have to evaluate $E_a^i$ for all queued packets,
+	then find the packet with minimal $E_a^i$ and send it.
+
+	It sounds good, but a direct implementation of the algorithm
+	is absolutely infeasible. Luckily, if flow rates
+	are scaled proportionally, the equations have a simple solution.
+
+ The differential equation for $E_a^i$ is
+ \[
+ {d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} =
+ { B \over \sum_{b \in A} r_b}
+ \]
+ with initial condition
+ \[
+ E_a^i (t_i) = { m_a(t_i) \over r_a } .
+ \]
+
+ Let's introduce an auxiliary function $R(t)$:
+
+ --- Round number.
+
+ Consider the following model: we rotate over active flows,
+ sending $r_a B$ bits from every flow, so that we send
+ $B \sum_{a \in A} r_a$ bits per round, that takes
+ $\sum_{a \in A} r_a$ seconds.
+
+	Hence, $R(t)$ (the round number) is a monotonically increasing
+	linear function of time when $A$ does not change
+ \[
+ { d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a }
+ \]
+ and it is continuous when $A$ changes.
+
+	The central observation is that the quantity
+	$F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all!
+	$R(t)$ does not depend on the flow, so that $F_a^i$ can be
+	calculated only once on packet arrival, and we need no
+	recalculation of $E$ numbers and no resorting of queues.
+	The number $F_a^i$ is called the finish number of the packet.
+	It is just the value of $R(t)$ at which the last bit of the packet
+	will be sent out.
+
+	The maximal finish number in a flow is called the finish number of
+	the flow, and the minimal one is the "start number" of the flow.
+	Apparently, a flow is active if and only if $F_a \leq R$.
+
+	When a packet of length $L_i$ bits arrives at flow $a$ at time $t_i$,
+	we calculate the number $F_a^i$ as:
+
+ If flow was inactive ($F_a < R$):
+ $F_a^i = R(t) + {L_i \over B r_a}$
+ otherwise
+ $F_a^i = F_a + {L_i \over B r_a}$
+
+ These equations complete the algorithm specification.
+
+	It looks pretty hairy, but there exists a simple
+	procedure for solving these equations.
+	See the procedure csz_update(), which is a generalization of
+	the algorithm from Chapter 3 of S. Keshav's thesis,
+	"Efficient Implementation of Fair Queueing".
+
+ NOTES.
+
+	* We implement only the simplest variant of CSZ,
+	where flow-0 is an explicit 4-band priority FIFO.
+	It is bad, but we would need a "peek" operation in addition
+	to "dequeue" to implement complete CSZ.
+	I do not want to do that until it is absolutely
+	necessary.
+
+	* Primitive support for token bucket filtering
+	is present too. It directly contradicts CSZ, but
+	even though the Internet spans the globe ... :-)
+	"the edges of the network" really do exist.
+
+ BUGS.
+
+ * Fixed point arithmetic is overcomplicated, suboptimal and even
+ wrong. Check it later.
+*/
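A minimal floating-point sketch of the arrival rule derived above; the real code below does the same thing with the L2R() table and fixed-point round numbers, and all names here are illustrative only:

/* Finish-number update on packet arrival, in plain doubles.
 * R is the current round number, B the link bandwidth in bits/sec,
 * f->rate the flow's share r_a, and len the packet length in bits.
 */
struct example_flow {
	double finish;			/* finish number F_a of the flow */
	double rate;			/* bandwidth share r_a */
};

static void example_csz_arrival(struct example_flow *f, double R,
				double B, double len)
{
	/* If F_a < R the flow is inactive and restarts from R,
	 * otherwise the new packet queues behind the old finish number. */
	double base = (f->finish < R) ? R : f->finish;

	f->finish = base + len / (B * f->rate);
}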
+
+
+/* This number is arbitrary */
+
+#define CSZ_MAX_GUARANTEED 16
+
+#define CSZ_FLOW_ID(skb) (CSZ_MAX_GUARANTEED)
+
+struct csz_head
+{
+ struct csz_head *snext;
+ struct csz_head *sprev;
+ struct csz_head *fnext;
+ struct csz_head *fprev;
+};
+
+struct csz_flow
+{
+ struct csz_head *snext;
+ struct csz_head *sprev;
+ struct csz_head *fnext;
+ struct csz_head *fprev;
+
+/* Parameters */
+ unsigned long rate; /* Flow rate. Fixed point is at rate_log */
+ unsigned long *L_tab; /* Lookup table for L/(B*r_a) values */
+ unsigned long max_bytes; /* Maximal length of queue */
+#ifdef CSZ_PLUS_TBF
+ unsigned long depth; /* Depth of token bucket, normalized
+ as L/(B*r_a) */
+#endif
+
+/* Variables */
+#ifdef CSZ_PLUS_TBF
+ unsigned long tokens; /* Tokens number: usecs */
+ psched_time_t t_tbf;
+ unsigned long R_tbf;
+ int throttled;
+#endif
+ unsigned peeked;
+ unsigned long start; /* Finish number of the first skb */
+ unsigned long finish; /* Finish number of the flow */
+
+ struct sk_buff_head q; /* FIFO queue */
+};
+
+#define L2R(q,f,L) ((f)->L_tab[(L)>>(q)->cell_log])
+
+struct csz_sched_data
+{
+/* Parameters */
+ unsigned char cell_log; /* 1<<cell_log is quantum of packet size */
+ unsigned char rate_log; /* fixed point position for rate;
+					 * we do not really need it */
+ unsigned char R_log; /* fixed point position for round number */
+ unsigned char delta_log; /* 1<<delta_log is maximal timeout in usecs;
+ * 21 <-> 2.1sec is MAXIMAL value */
+
+/* Variables */
+#ifdef CSZ_PLUS_TBF
+ struct timer_list wd_timer;
+ long wd_expires;
+#endif
+ psched_time_t t_c; /* Time check-point */
+ unsigned long R_c; /* R-number check-point */
+ unsigned long rate; /* Current sum of rates of active flows */
+ struct csz_head s; /* Flows sorted by "start" */
+ struct csz_head f; /* Flows sorted by "finish" */
+
+ struct sk_buff_head other[4];/* Predicted (0) and the best efforts
+ classes (1,2,3) */
+ struct csz_flow flow[CSZ_MAX_GUARANTEED]; /* Array of flows */
+};
+
+/* These routines (csz_insert_finish and csz_insert_start) are
+ the most time consuming part of all the algorithm.
+
+   We insert into a sorted list, so the time
+   is linear with respect to the number of active flows in the worst case.
+   Note that we do not have a very large number of guaranteed flows,
+   so logarithmic algorithms (heaps etc.) are useless:
+   they are slower than the linear one when the list length is <= 32.
+
+   A heap would make sense if we used WFQ for the best-effort
+   flows, but SFQ is a better choice in this case.
+ */
+
+
+/* Insert flow "this" to the list "b" before
+ flow with greater finish number.
+ */
+
+#if 0
+/* Scan forward */
+extern __inline__ void csz_insert_finish(struct csz_head *b,
+ struct csz_flow *this)
+{
+ struct csz_head *f = b->fnext;
+ unsigned long finish = this->finish;
+
+ while (f != b) {
+ if (((struct csz_flow*)f)->finish - finish > 0)
+ break;
+ f = f->fnext;
+ }
+ this->fnext = f;
+ this->fprev = f->fprev;
+ this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
+}
+#else
+/* Scan backward */
+extern __inline__ void csz_insert_finish(struct csz_head *b,
+ struct csz_flow *this)
+{
+ struct csz_head *f = b->fprev;
+ unsigned long finish = this->finish;
+
+ while (f != b) {
+ if (((struct csz_flow*)f)->finish - finish <= 0)
+ break;
+ f = f->fprev;
+ }
+ this->fnext = f->fnext;
+ this->fprev = f;
+ this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
+}
+#endif
+
+/* Insert flow "this" to the list "b" before
+ flow with greater start number.
+ */
+
+extern __inline__ void csz_insert_start(struct csz_head *b,
+ struct csz_flow *this)
+{
+ struct csz_head *f = b->snext;
+ unsigned long start = this->start;
+
+ while (f != b) {
+ if (((struct csz_flow*)f)->start - start > 0)
+ break;
+ f = f->snext;
+ }
+ this->snext = f;
+ this->sprev = f->sprev;
+ this->snext->sprev = this->sprev->snext = (struct csz_head*)this;
+}
+
+
+/* Calculate and return current round number.
+ It is another time consuming part, but
+ it is impossible to avoid it.
+
+ Fixed point arithmetic is not ... does not ... Well, it is just CRAP.
+ */
+
+static unsigned long csz_update(struct Qdisc *sch)
+{
+ struct csz_sched_data *q = (struct csz_sched_data*)sch->data;
+ struct csz_flow *a;
+ unsigned long F;
+ unsigned long tmp;
+ psched_time_t now;
+ unsigned long delay;
+ unsigned long R_c;
+
+ PSCHED_GET_TIME(now);
+ delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset);
+
+ if (delay>>q->delta_log) {
+do_reset:
+ /* Delta is too large.
+ It is possible if MTU/BW > 1<<q->delta_log
+ (i.e. configuration error) or because of hardware
+ fault. We have no choice...
+ */
+ qdisc_reset(sch);
+ return 0;
+ }
+
+ q->t_c = now;
+
+ for (;;) {
+ a = (struct csz_flow*)q->f.fnext;
+
+ /* No more active flows. Reset R and exit. */
+ if (a == (struct csz_flow*)&q->f) {
+#ifdef CSZ_DEBUG
+ if (q->rate) {
+ printk("csz_update: rate!=0 on inactive csz\n");
+ q->rate = 0;
+ }
+#endif
+ q->R_c = 0;
+ return 0;
+ }
+
+ F = a->finish;
+
+#ifdef CSZ_DEBUG
+ if (q->rate == 0) {
+ printk("csz_update: rate=0 on active csz\n");
+ goto do_reset;
+ }
+#endif
+
+ /*
+ * tmp = (t - q->t_c)/q->rate;
+ */
+
+ tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log);
+
+ tmp += q->R_c;
+
+ /* OK, this flow (and all flows with greater
+ finish numbers) is still active */
+ if (F - tmp > 0)
+ break;
+
+		/* It is no longer active */
+
+ a->fprev->fnext = a->fnext;
+ a->fnext->fprev = a->fprev;
+
+ /*
+ * q->t_c += (F - q->R_c)*q->rate
+ */
+
+ tmp = ((F-q->R_c)*q->rate)<<q->R_log;
+ R_c = F;
+ q->rate -= a->rate;
+
+ if (delay - tmp >= 0) {
+ delay -= tmp;
+ continue;
+ }
+ delay = 0;
+ }
+
+ q->R_c = tmp;
+ return tmp;
+}
+
+static int
+csz_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ unsigned flow_id = CSZ_FLOW_ID(skb);
+ unsigned long R;
+ int prio;
+ struct csz_flow *this;
+
+ if (flow_id >= CSZ_MAX_GUARANTEED) {
+ prio = flow_id - CSZ_MAX_GUARANTEED;
+ flow_id = 0;
+ }
+
+ this = &q->flow[flow_id];
+ if (this->q.qlen >= this->max_bytes || this->L_tab == NULL) {
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+ }
+
+ R = csz_update(sch);
+
+ if (this->finish - R >= 0) {
+ /* It was active */
+ this->finish += L2R(q,this,skb->len);
+ } else {
+ /* It is inactive; activate it */
+ this->finish = R + L2R(q,this,skb->len);
+ q->rate += this->rate;
+ csz_insert_finish(&q->f, this);
+ }
+
+ /* If this flow was empty, remember start number
+ and insert it into start queue */
+ if (this->q.qlen == 0) {
+ this->start = this->finish;
+ csz_insert_start(&q->s, this);
+ }
+ if (flow_id)
+ skb_queue_tail(&this->q, skb);
+ else
+ skb_queue_tail(&q->other[prio], skb);
+ sch->q.qlen++;
+ return 1;
+}
+
+static __inline__ struct sk_buff *
+skb_dequeue_best(struct csz_sched_data * q)
+{
+ int i;
+ struct sk_buff *skb;
+
+ for (i=0; i<4; i++) {
+ skb = skb_dequeue(&q->other[i]);
+ if (skb) {
+ q->flow[0].q.qlen--;
+ return skb;
+ }
+ }
+ return NULL;
+}
+
+static __inline__ struct sk_buff *
+skb_peek_best(struct csz_sched_data * q)
+{
+ int i;
+ struct sk_buff *skb;
+
+ for (i=0; i<4; i++) {
+ skb = skb_peek(&q->other[i]);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+#ifdef CSZ_PLUS_TBF
+
+static void csz_watchdog(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc*)arg;
+ struct csz_sched_data *q = (struct csz_sched_data*)sch->data;
+
+ q->wd_timer.expires = 0;
+ q->wd_timer.function = NULL;
+
+ qdisc_wakeup(sch->dev);
+}
+
+static __inline__ void
+csz_move_queue(struct csz_flow *this, long delta)
+{
+ this->fprev->fnext = this->fnext;
+ this->fnext->fprev = this->fprev;
+
+ this->start += delta;
+ this->finish += delta;
+
+ csz_insert_finish(this);
+}
+
+static __inline__ int csz_enough_tokens(struct csz_sched_data *q,
+ struct csz_flow *this,
+ struct sk_buff *skb)
+{
+ long toks;
+ long shift;
+ psched_time_t now;
+
+ PSCHED_GET_TIME(now);
+
+	toks = PSCHED_TDIFF(now, this->t_tbf) + this->tokens - L2R(q,this,skb->len);
+
+ shift = 0;
+ if (this->throttled) {
+		/* Remember the a posteriori delay */
+
+ unsigned long R = csz_update(q);
+ shift = R - this->R_tbf;
+ this->R_tbf = R;
+ }
+
+ if (toks >= 0) {
+ /* Now we have enough tokens to proceed */
+
+		this->tokens = toks <= this->depth ? toks : this->depth;
+ this->t_tbf = now;
+
+ if (!this->throttled)
+ return 1;
+
+ /* Flow was throttled. Update its start&finish numbers
+		   with the delay calculated a posteriori.
+ */
+
+ this->throttled = 0;
+ if (shift > 0)
+ csz_move_queue(this, shift);
+ return 1;
+ }
+
+ if (!this->throttled) {
+ /* Flow has just been throttled; remember
+		   current round number to calculate the a posteriori delay
+ */
+ this->throttled = 1;
+ this->R_tbf = csz_update(q);
+ }
+
+	/* Move the whole queue to the time when it will be allowed to send.
+	   We should translate time to a round number, but that is impossible,
+	   so we make the most conservative estimate, i.e. we suppose
+	   that only this flow is active and, hence, R = t.
+	   Really toks <= R <= toks/r_a.
+
+	   This a priori shift in R will be adjusted later to reflect
+	   the real delay. We cannot avoid it because:
+	   - a throttled flow continues to be active from the viewpoint
+	     of CSZ, so it would acquire the highest priority
+	     if the start numbers were not adjusted;
+	   - eventually the finish number would become less than the round
+	     number and the flow would be declared inactive.
+	 */
+
+ toks = -toks;
+
+	/* Remember that we should start the watchdog */
+ if (toks < q->wd_expires)
+ q->wd_expires = toks;
+
+ toks >>= q->R_log;
+ shift += toks;
+ if (shift > 0) {
+ this->R_tbf += toks;
+ csz_move_queue(this, shift);
+ }
+ csz_insert_start(this);
+ return 0;
+}
+#endif
+
+
+static struct sk_buff *
+csz_dequeue(struct Qdisc* sch)
+{
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ struct sk_buff *skb;
+ struct csz_flow *this;
+
+#ifdef CSZ_PLUS_TBF
+ q->wd_expires = 0;
+#endif
+ this = (struct csz_flow*)q->s.snext;
+
+ while (this != (struct csz_flow*)&q->s) {
+
+ /* First of all: unlink from start list */
+ this->sprev->snext = this->snext;
+ this->snext->sprev = this->sprev;
+
+ if (this != &q->flow[0]) { /* Guaranteed flow */
+ skb = __skb_dequeue(&this->q);
+ if (skb) {
+#ifdef CSZ_PLUS_TBF
+ if (this->depth) {
+ if (!csz_enough_tokens(q, this, skb))
+ continue;
+ }
+#endif
+ if (this->q.qlen) {
+ struct sk_buff *nskb = skb_peek(&this->q);
+ this->start += L2R(q,this,nskb->len);
+ csz_insert_start(&q->s, this);
+ }
+ sch->q.qlen--;
+ return skb;
+ }
+ } else { /* Predicted or best effort flow */
+ skb = skb_dequeue_best(q);
+ if (skb) {
+ unsigned peeked = this->peeked;
+ this->peeked = 0;
+
+ if (--this->q.qlen) {
+ struct sk_buff *nskb;
+ unsigned dequeued = L2R(q,this,skb->len);
+
+				/* We dequeued something other than what we
+				   peeked at earlier; adjust the start number
+				 */
+ if (peeked != dequeued && peeked)
+ this->start += dequeued - peeked;
+
+ nskb = skb_peek_best(q);
+ peeked = L2R(q,this,nskb->len);
+ this->start += peeked;
+ this->peeked = peeked;
+ csz_insert_start(&q->s, this);
+ }
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ }
+#ifdef CSZ_PLUS_TBF
+ /* We are about to return no skb.
+	   Schedule the watchdog timer if this happened because of shaping.
+ */
+ if (q->wd_expires) {
+ if (q->wd_timer.function)
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = csz_watchdog;
+ q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires);
+ add_timer(&q->wd_timer);
+ }
+#endif
+ return NULL;
+}
+
+static void
+csz_reset(struct Qdisc* sch)
+{
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ struct sk_buff *skb;
+ int i;
+
+ for (i=0; i<4; i++)
+ while ((skb=skb_dequeue(&q->other[i])) != NULL)
+ kfree_skb(skb, 0);
+
+ for (i=0; i<CSZ_MAX_GUARANTEED; i++) {
+ struct csz_flow *this = q->flow + i;
+ while ((skb = skb_dequeue(&this->q)) != NULL)
+ kfree_skb(skb, FREE_WRITE);
+ this->snext = this->sprev =
+ this->fnext = this->fprev = (struct csz_head*)this;
+ this->start = this->finish = 0;
+ }
+ q->s.snext = q->s.sprev = &q->s;
+ q->f.fnext = q->f.fprev = &q->f;
+ q->R_c = 0;
+#ifdef CSZ_PLUS_TBF
+ PSCHED_GET_TIME(&q->t_tbf);
+ q->tokens = q->depth;
+ if (q->wd_timer.function) {
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = NULL;
+ }
+#endif
+ sch->q.qlen = 0;
+}
+
+static void
+csz_destroy(struct Qdisc* sch)
+{
+/*
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ int i;
+
+ for (i=0; i<4; i++)
+ qdisc_destroy(q->other[i]);
+ */
+}
+
+static int csz_init(struct Qdisc *sch, void *arg)
+{
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ struct cszinitctl *ctl = (struct cszinitctl*)arg;
+ int i;
+
+ for (i=0; i<4; i++)
+ skb_queue_head_init(&q->other[i]);
+
+ for (i=0; i<CSZ_MAX_GUARANTEED; i++) {
+ struct csz_flow *this = q->flow + i;
+ skb_queue_head_init(&this->q);
+ this->snext = this->sprev =
+ this->fnext = this->fprev = (struct csz_head*)this;
+ this->start = this->finish = 0;
+ }
+ q->s.snext = q->s.sprev = &q->s;
+ q->f.fnext = q->f.fprev = &q->f;
+ q->R_c = 0;
+#ifdef CSZ_PLUS_TBF
+ init_timer(&q->wd_timer);
+ q->wd_timer.data = (unsigned long)sch;
+#endif
+ if (ctl) {
+ if (ctl->flows != CSZ_MAX_GUARANTEED)
+ return -EINVAL;
+ q->cell_log = ctl->cell_log;
+ }
+ return 0;
+}
+
+static int csz_control(struct Qdisc *sch, struct pschedctl *gctl)
+{
+/*
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ struct cszctl *ctl = (struct cszctl*)gctl->arg;
+ struct sk_buff *skb;
+ int i;
+
+ if (op == PSCHED_TC_ATTACH) {
+
+ }
+*/
+ return 0;
+}
+
+
+
+
+struct Qdisc_ops csz_ops =
+{
+ NULL,
+ "csz",
+ 0,
+ sizeof(struct csz_sched_data),
+ csz_enqueue,
+ csz_dequeue,
+ csz_reset,
+ csz_destroy,
+ csz_init,
+ csz_control,
+};
+
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&csz_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
new file mode 100644
index 000000000..8134baf16
--- /dev/null
+++ b/net/sched/sch_fifo.c
@@ -0,0 +1,179 @@
+/*
+ * net/sched/sch_fifo.c Simple FIFO "scheduler"
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+/* 1 band FIFO pseudo-"scheduler" */
+
+struct fifo_sched_data
+{
+ int qmaxbytes;
+ int qmaxlen;
+ int qbytes;
+};
+
+static int
+bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
+
+ if (q->qbytes <= q->qmaxbytes) {
+ skb_queue_tail(&sch->q, skb);
+ q->qbytes += skb->len;
+ return 0;
+ }
+ kfree_skb(skb, FREE_WRITE);
+ return 1;
+}
+
+static struct sk_buff *
+bfifo_dequeue(struct Qdisc* sch)
+{
+ struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ skb = skb_dequeue(&sch->q);
+ if (skb)
+ q->qbytes -= skb->len;
+ return skb;
+}
+
+static void
+bfifo_reset(struct Qdisc* sch)
+{
+ struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ while((skb=skb_dequeue(&sch->q)) != NULL) {
+ q->qbytes -= skb->len;
+ kfree_skb(skb,FREE_WRITE);
+ }
+ if (q->qbytes) {
+ printk("fifo_reset: qbytes=%d\n", q->qbytes);
+ q->qbytes = 0;
+ }
+}
+
+static int
+pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
+
+ if (sch->q.qlen <= q->qmaxlen) {
+ skb_queue_tail(&sch->q, skb);
+ return 0;
+ }
+ kfree_skb(skb, FREE_WRITE);
+ return 1;
+}
+
+static struct sk_buff *
+pfifo_dequeue(struct Qdisc* sch)
+{
+ return skb_dequeue(&sch->q);
+}
+
+static void
+pfifo_reset(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+
+ while((skb=skb_dequeue(&sch->q))!=NULL)
+ kfree_skb(skb,FREE_WRITE);
+}
+
+
+static int fifo_init(struct Qdisc *sch, void *arg /* int bytes, int pkts */)
+{
+ struct fifo_sched_data *q;
+/*
+ struct device *dev = sch->dev;
+ */
+
+ q = (struct fifo_sched_data *)sch->data;
+/*
+ if (pkts<0)
+ pkts = dev->tx_queue_len;
+ if (bytes<0)
+ bytes = pkts*dev->mtu;
+ q->qmaxbytes = bytes;
+ q->qmaxlen = pkts;
+ */
+ return 0;
+}
+
+struct Qdisc_ops pfifo_ops =
+{
+ NULL,
+ "pfifo",
+ 0,
+ sizeof(struct fifo_sched_data),
+ pfifo_enqueue,
+ pfifo_dequeue,
+ pfifo_reset,
+ NULL,
+ fifo_init,
+};
+
+struct Qdisc_ops bfifo_ops =
+{
+ NULL,
+	"bfifo",
+ 0,
+ sizeof(struct fifo_sched_data),
+ bfifo_enqueue,
+ bfifo_dequeue,
+ bfifo_reset,
+ NULL,
+ fifo_init,
+};
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&pfifo_ops);
+ if (err == 0) {
+ err = register_qdisc(&bfifo_ops);
+ if (err)
+ unregister_qdisc(&pfifo_ops);
+ }
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
new file mode 100644
index 000000000..83aa8d10e
--- /dev/null
+++ b/net/sched/sch_generic.c
@@ -0,0 +1,541 @@
+/*
+ * net/sched/sch_generic.c Generic packet scheduler routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+struct Qdisc_head qdisc_head = { &qdisc_head };
+
+static struct Qdisc_ops *qdisc_base = NULL;
+
+/* NOTES.
+
+ Every discipline has two major routines: enqueue and dequeue.
+
+ ---dequeue
+
+   dequeue usually returns an skb to send. It is allowed to return NULL,
+   but that does not mean the queue is empty, it just means that the
+   discipline does not want to send anything this time.
+   The queue is really empty if q->q.qlen == 0.
+   For complicated disciplines with multiple queues, q->q is not a
+   real packet queue, but q->q.qlen must nonetheless be valid.
+
+ ---enqueue
+
+   enqueue returns the number of enqueued packets, i.e. this number is 1
+   if the packet was enqueued successfully and <1 if something (not
+   necessarily THIS packet) was dropped.
+
+ */
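As a compact illustration of this contract, a single-band FIFO could look like the sketch below; it is purely illustrative (the real single-band FIFOs live in sch_fifo.c and in pfifo_fast_* later in this file):

static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (sch->q.qlen < skb->dev->tx_queue_len) {
		skb_queue_tail(&sch->q, skb);	/* keeps q.qlen valid */
		return 1;			/* 1: enqueued successfully */
	}
	kfree_skb(skb, FREE_WRITE);
	return 0;				/* <1: something was dropped */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* A shaping discipline may return NULL with q.qlen != 0;
	 * here the queue is real, so NULL simply means empty. */
	return skb_dequeue(&sch->q);
}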
+
+int register_qdisc(struct Qdisc_ops *qops)
+{
+ struct Qdisc_ops *q, **qp;
+ for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
+ if (strcmp(qops->id, q->id) == 0)
+ return -EEXIST;
+ qops->next = NULL;
+ qops->refcnt = 0;
+ *qp = qops;
+ return 0;
+}
+
+int unregister_qdisc(struct Qdisc_ops *qops)
+{
+ struct Qdisc_ops *q, **qp;
+ for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
+ if (q == qops)
+ break;
+ if (!q)
+ return -ENOENT;
+ *qp = q->next;
+ return 0;
+}
+
+struct Qdisc *qdisc_lookup(int handle)
+{
+ return NULL;
+}
+
+
+/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
+ in all curcumstances. It is difficult to invent anything more
+ fast or cheap.
+ */
+
+static int
+noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
+{
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+}
+
+static struct sk_buff *
+noop_dequeue(struct Qdisc * qdisc)
+{
+ return NULL;
+}
+
+struct Qdisc noop_qdisc =
+{
+ { NULL },
+ noop_enqueue,
+ noop_dequeue,
+};
+
+struct Qdisc noqueue_qdisc =
+{
+ { NULL },
+ NULL,
+ NULL,
+};
+
+
+/* 3-band FIFO queue: old style, but should be a bit faster (several CPU insns) */
+
+static int
+pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+{
+ const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 };
+ struct sk_buff_head *list;
+
+ list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7];
+
+ if (list->qlen <= skb->dev->tx_queue_len) {
+ skb_queue_tail(list, skb);
+ return 1;
+ }
+ qdisc->dropped++;
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+}
+
+static struct sk_buff *
+pfifo_fast_dequeue(struct Qdisc* qdisc)
+{
+ int prio;
+ struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
+ struct sk_buff *skb;
+
+ for (prio = 0; prio < 3; prio++, list++) {
+ skb = skb_dequeue(list);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+static void
+pfifo_fast_reset(struct Qdisc* qdisc)
+{
+ int prio;
+ struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
+
+ for (prio=0; prio < 3; prio++)
+ skb_queue_purge(list+prio);
+}
+
+static int pfifo_fast_init(struct Qdisc *qdisc, void *arg)
+{
+ int i;
+ struct sk_buff_head *list;
+
+ list = ((struct sk_buff_head*)qdisc->data);
+
+ for(i=0; i<3; i++)
+ skb_queue_head_init(list+i);
+
+ return 0;
+}
+
+static struct Qdisc_ops pfifo_fast_ops =
+{
+ NULL,
+ "pfifo_fast",
+ 1,
+ 3 * sizeof(struct sk_buff_head),
+ pfifo_fast_enqueue,
+ pfifo_fast_dequeue,
+ pfifo_fast_reset,
+ NULL,
+ pfifo_fast_init
+};
+
+static struct Qdisc *
+qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg)
+{
+ struct Qdisc *sch;
+ int size = sizeof(*sch) + ops->priv_size;
+
+ sch = kmalloc(size, GFP_KERNEL);
+ if (!sch)
+ return NULL;
+ memset(sch, 0, size);
+
+ skb_queue_head_init(&sch->q);
+ skb_queue_head_init(&sch->failure_q);
+ sch->ops = ops;
+ sch->enqueue = ops->enqueue;
+ sch->dequeue = ops->dequeue;
+ sch->dev = dev;
+	if (ops->init && ops->init(sch, arg)) {
+		/* Do not leak the allocation when init fails. */
+		kfree(sch);
+		return NULL;
+	}
+ ops->refcnt++;
+ return sch;
+}
+
+void qdisc_reset(struct Qdisc *qdisc)
+{
+ struct Qdisc_ops *ops = qdisc->ops;
+ if (ops) {
+ start_bh_atomic();
+ if (ops->reset)
+ ops->reset(qdisc);
+ skb_queue_purge(&qdisc->failure_q);
+ end_bh_atomic();
+ }
+}
+
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ struct Qdisc_ops *ops = qdisc->ops;
+ if (ops) {
+ start_bh_atomic();
+ if (ops->reset)
+ ops->reset(qdisc);
+ if (ops->destroy)
+ ops->destroy(qdisc);
+ skb_queue_purge(&qdisc->failure_q);
+ ops->refcnt--;
+ end_bh_atomic();
+ kfree(qdisc);
+ }
+}
+
+static void dev_do_watchdog(unsigned long dummy);
+
+static struct timer_list dev_watchdog =
+ { NULL, NULL, 0L, 0L, &dev_do_watchdog };
+
+static void dev_do_watchdog(unsigned long dummy)
+{
+ struct Qdisc_head *h;
+
+ for (h = qdisc_head.forw; h != &qdisc_head; h = h->forw) {
+ struct Qdisc *q = (struct Qdisc*)h;
+ struct device *dev = q->dev;
+ if (dev->tbusy && jiffies - q->tx_last > q->tx_timeo) {
+ qdisc_restart(dev);
+ }
+ }
+ dev_watchdog.expires = jiffies + 5*HZ;
+ add_timer(&dev_watchdog);
+}
+
+
+void dev_activate(struct device *dev)
+{
+	/* No queueing discipline is attached to the device;
+	   create a default one, i.e. pfifo_fast for devices
+	   which need queueing and noqueue_qdisc for
+	   virtual interfaces.
+	 */
+
+ if (dev->qdisc_sleeping == &noop_qdisc) {
+ if (dev->tx_queue_len) {
+ struct Qdisc *qdisc;
+ qdisc = qdisc_alloc(dev, &pfifo_fast_ops, NULL);
+ if (qdisc == NULL)
+ return;
+ dev->qdisc_sleeping = qdisc;
+ } else
+ dev->qdisc_sleeping = &noqueue_qdisc;
+ }
+
+ start_bh_atomic();
+ if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) {
+ dev->qdisc->tx_timeo = 5*HZ;
+ dev->qdisc->tx_last = jiffies - dev->qdisc->tx_timeo;
+ if (!dev_watchdog.expires) {
+ dev_watchdog.expires = jiffies + 5*HZ;
+ add_timer(&dev_watchdog);
+ }
+ }
+ end_bh_atomic();
+}
+
+void dev_deactivate(struct device *dev)
+{
+ struct Qdisc *qdisc;
+
+ start_bh_atomic();
+
+ qdisc = dev->qdisc;
+ dev->qdisc = &noop_qdisc;
+
+ qdisc_reset(qdisc);
+
+ if (qdisc->h.forw) {
+ struct Qdisc_head **hp, *h;
+
+ for (hp = &qdisc_head.forw; (h = *hp) != &qdisc_head; hp = &h->forw) {
+ if (h == &qdisc->h) {
+ *hp = h->forw;
+ break;
+ }
+ }
+ }
+
+ end_bh_atomic();
+}
+
+void dev_init_scheduler(struct device *dev)
+{
+ dev->qdisc = &noop_qdisc;
+ dev->qdisc_sleeping = &noop_qdisc;
+}
+
+void dev_shutdown(struct device *dev)
+{
+ struct Qdisc *qdisc;
+
+ start_bh_atomic();
+ qdisc = dev->qdisc_sleeping;
+ dev->qdisc_sleeping = &noop_qdisc;
+ qdisc_destroy(qdisc);
+ end_bh_atomic();
+}
+
+void dev_set_scheduler(struct device *dev, struct Qdisc *qdisc)
+{
+ struct Qdisc *oqdisc;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ start_bh_atomic();
+ oqdisc = dev->qdisc_sleeping;
+
+ /* Destroy old scheduler */
+ if (oqdisc)
+ qdisc_destroy(oqdisc);
+
+ /* ... and attach new one */
+ dev->qdisc_sleeping = qdisc;
+ dev->qdisc = &noop_qdisc;
+ end_bh_atomic();
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+}
+
+/* Kick the queue "q".
+   Note that this procedure is called by the watchdog timer, so
+   we do not check the dev->tbusy flag here.
+
+ Returns: 0 - queue is empty.
+ >0 - queue is not empty, but throttled.
+ <0 - queue is not empty. Device is throttled, if dev->tbusy != 0.
+
+ NOTE: Called only from NET BH
+*/
+
+
+int qdisc_restart(struct device *dev)
+{
+ struct Qdisc *q = dev->qdisc;
+ struct sk_buff *skb;
+
+ skb = skb_dequeue(&q->failure_q);
+ if (!skb) {
+ skb = q->dequeue(q);
+ if (netdev_nit && skb)
+ dev_queue_xmit_nit(skb,dev);
+ }
+ if (skb) {
+ if (dev->hard_start_xmit(skb, dev) == 0) {
+ q->tx_last = jiffies;
+ return -1;
+ }
+#if 0
+ if (net_ratelimit())
+ printk(KERN_DEBUG "netdevice %s defers output.\n", dev->name);
+#endif
+ skb_queue_head(&q->failure_q, skb);
+ return -1;
+ }
+ return q->q.qlen;
+}
+
+void qdisc_run_queues(void)
+{
+ struct Qdisc_head **hp, *h;
+
+ hp = &qdisc_head.forw;
+ while ((h = *hp) != &qdisc_head) {
+ int res = -1;
+ struct Qdisc *q = (struct Qdisc*)h;
+ struct device *dev = q->dev;
+
+ while (!dev->tbusy && (res = qdisc_restart(dev)) < 0)
+ /* NOTHING */;
+
+		/* An explanation is necessary here.
+		   qdisc_restart called dev->hard_start_xmit;
+		   if the device is virtual, that could trigger one more
+		   dev_queue_xmit and a new device could appear
+		   in the active chain. In this case we cannot unlink
+		   the empty queue, because we have lost the back pointer.
+		   No problem, we will unlink it during the next round.
+		 */
+
+ if (res == 0 && *hp == h) {
+ *hp = h->forw;
+ h->forw = NULL;
+ continue;
+ }
+ hp = &h->forw;
+ }
+}
+
+
+int tc_init(struct pschedctl *pctl)
+{
+ struct Qdisc *q;
+ struct Qdisc_ops *qops;
+
+ if (pctl->handle) {
+ q = qdisc_lookup(pctl->handle);
+ if (q == NULL)
+ return -ENOENT;
+ qops = q->ops;
+ if (pctl->ifindex && q->dev->ifindex != pctl->ifindex)
+ return -EINVAL;
+ }
+ return -EINVAL;
+}
+
+int tc_destroy(struct pschedctl *pctl)
+{
+ return -EINVAL;
+}
+
+int tc_attach(struct pschedctl *pctl)
+{
+ return -EINVAL;
+}
+
+int tc_detach(struct pschedctl *pctl)
+{
+ return -EINVAL;
+}
+
+
+int psched_ioctl(void *arg)
+{
+ struct pschedctl ctl;
+ struct pschedctl *pctl = &ctl;
+ int err;
+
+ if (copy_from_user(&ctl, arg, sizeof(ctl)))
+ return -EFAULT;
+
+ if (ctl.arglen > 0) {
+ pctl = kmalloc(sizeof(ctl) + ctl.arglen, GFP_KERNEL);
+ if (pctl == NULL)
+ return -ENOBUFS;
+ memcpy(pctl, &ctl, sizeof(ctl));
+ if (copy_from_user(pctl->args, ((struct pschedctl*)arg)->args, ctl.arglen)) {
+ kfree(pctl);
+ return -EFAULT;
+ }
+ }
+
+ rtnl_lock();
+
+ switch (ctl.command) {
+ case PSCHED_TC_INIT:
+ err = tc_init(pctl);
+ break;
+ case PSCHED_TC_DESTROY:
+ err = tc_destroy(pctl);
+ break;
+ case PSCHED_TC_ATTACH:
+ err = tc_attach(pctl);
+ break;
+ case PSCHED_TC_DETACH:
+ err = tc_detach(pctl);
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+ rtnl_unlock();
+
+ if (pctl != &ctl)
+ kfree(pctl);
+ return err;
+}
+
+__initfunc(int pktsched_init(void))
+{
+#define INIT_QDISC(name) { \
+ extern struct Qdisc_ops name##_ops; \
+	register_qdisc(&name##_ops); \
+ }
+
+ skb_queue_head_init(&noop_qdisc.failure_q);
+ skb_queue_head_init(&noqueue_qdisc.failure_q);
+
+ register_qdisc(&pfifo_fast_ops);
+#ifdef CONFIG_NET_SCH_CBQ
+ INIT_QDISC(cbq);
+#endif
+#ifdef CONFIG_NET_SCH_CSZ
+ INIT_QDISC(csz);
+#endif
+#ifdef CONFIG_NET_SCH_RED
+ INIT_QDISC(red);
+#endif
+#ifdef CONFIG_NET_SCH_SFQ
+ INIT_QDISC(sfq);
+#endif
+#ifdef CONFIG_NET_SCH_TBF
+ INIT_QDISC(tbf);
+#endif
+#ifdef CONFIG_NET_SCH_PFIFO
+ INIT_QDISC(pfifo);
+ INIT_QDISC(bfifo);
+#endif
+#ifdef CONFIG_NET_SCH_PRIO
+ INIT_QDISC(prio);
+#endif
+ return 0;
+}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
new file mode 100644
index 000000000..a3806eda4
--- /dev/null
+++ b/net/sched/sch_prio.c
@@ -0,0 +1,146 @@
+/*
+ * net/sched/sch_prio.c Simple 3-band priority "scheduler".
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+/* New N-band generic scheduler */
+
+struct prio_sched_data
+{
+ int qbytes;
+ int bands;
+ u8 prio2band[8];
+ struct Qdisc *queues[8];
+};
+
+static int
+prio_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+ int prio = q->prio2band[skb->priority&7];
+ struct Qdisc *qdisc;
+
+ qdisc = q->queues[prio];
+ if (qdisc->enqueue(skb, qdisc) == 0) {
+ q->qbytes += skb->len;
+ sch->q.qlen++;
+ return 0;
+ }
+ return 1;
+}
+
+static struct sk_buff *
+prio_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+ int prio;
+ struct Qdisc *qdisc;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ q->qbytes -= skb->len;
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ return NULL;
+
+}
+
+static void
+prio_reset(struct Qdisc* sch)
+{
+ int prio;
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+
+ for (prio=0; prio<q->bands; prio++)
+ qdisc_reset(q->queues[prio]);
+ q->qbytes = 0;
+}
+
+static void
+prio_destroy(struct Qdisc* sch)
+{
+ int prio;
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+
+ for (prio=0; prio<q->bands; prio++) {
+ qdisc_destroy(q->queues[prio]);
+ q->queues[prio] = &noop_qdisc;
+ }
+}
+
+static int prio_init(struct Qdisc *sch, void *arg)
+{
+	static const u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 };
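+	/* Band 0 is dequeued first (see prio_dequeue), so with this default
+	   map priorities 6 and 7 land in the highest band, 0 and 4 in the
+	   middle one, and 1, 2, 3, 5 in the lowest. */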
+ struct prio_sched_data *q;
+ int i;
+
+ q = (struct prio_sched_data *)sch->data;
+ q->bands = 3;
+ memcpy(q->prio2band, prio2band, sizeof(prio2band));
+ for (i=0; i<q->bands; i++)
+ q->queues[i] = &noop_qdisc;
+ return 0;
+}
+
+struct Qdisc_ops prio_ops =
+{
+ NULL,
+ "prio",
+ 0,
+ sizeof(struct prio_sched_data),
+ prio_enqueue,
+ prio_dequeue,
+ prio_reset,
+ prio_destroy,
+ prio_init,
+};
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&prio_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
new file mode 100644
index 000000000..fd3ee43ac
--- /dev/null
+++ b/net/sched/sch_red.c
@@ -0,0 +1,303 @@
+/*
+ * net/sched/sch_red.c Random Early Detection scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Random Early Detection (RED) algorithm.
+ =======================================
+
+ Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
+ for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
+
+	This file codes a "divisionless" version of the RED algorithm,
+	as written down in Fig. 17 of the paper.
+
+Short description.
+------------------
+
+	When a new packet arrives we calculate the average queue length:
+
+	avg = (1-W)*avg + W*current_queue_len,
+
+	W is the filter time constant (chosen as 2^(-Wlog)), controlling
+	the inertia of the algorithm. To allow larger bursts, W should be
+	decreased.
+
+ if (avg > th_max) -> packet marked (dropped).
+ if (avg < th_min) -> packet passes.
+ if (th_min < avg < th_max) we calculate probability:
+
+ Pb = max_P * (avg - th_min)/(th_max-th_min)
+
+	and mark (drop) the packet with this probability.
+	Pb changes from 0 (at avg==th_min) to max_P (at avg==th_max).
+	max_P should be small (not 1!).
+
+	NB. SF&VJ assumed that Pb[avg] is a linear function. I think that
+	is wrong. I would make:
+	P[th_min] = 0, P[th_max] = 1;
+	dP/davg[th_min] = 0, dP/davg[th_max] = infinity, or a large number.
+
+	I choose max_P as a number between 0.01 and 0.1, so that
+	C1 = max_P/(th_max-th_min) is a power of two: C1 = 2^(-C1log)
+
+ Parameters, settable by user (with default values):
+
+	qmaxbytes=256K - hard limit on queue length; should be chosen > qth_max
+			 to allow packet bursts. This parameter does not
+			 affect algorithm behaviour and can be chosen
+			 arbitrarily high (well, less than RAM size).
+			 Really, this limit will never be reached
+			 if RED works correctly.
+ qth_min=32K
+ qth_max=128K - qth_max should be at least 2*qth_min
+ Wlog=8 - log(1/W).
+ Alog=Wlog - fixed point position in th_min and th_max.
+ Rlog=10
+ C1log=24 - C1log = trueC1log+Alog-Rlog
+ so that trueC1log=22 and max_P~0.02
+
+
+NOTES:
+
+Upper bound on W.
+-----------------
+
+ If you want to allow bursts of L packets of size S,
+ you should choose W:
+
+ L + 1 -th_min/S < (1-(1-W)^L)/W
+
+ For th_min/S = 32
+
+ log(W) L
+ -1 33
+ -2 35
+ -3 39
+ -4 46
+ -5 57
+ -6 75
+ -7 101
+ -8 135
+ -9 190
+ etc.
+ */
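+/* A worked example with the default parameters above (illustrative only):
+   th_min=32K, th_max=128K and max_P~0.02 give
+   C1 = max_P/(th_max-th_min) ~ 0.02/96K ~ 2^-22, i.e. trueC1log=22 as
+   stated. For an average backlog of avg=80K:
+
+	Pb = max_P*(80K - 32K)/(128K - 32K) = 0.02*0.5 = 0.01,
+
+   so roughly one packet in a hundred is marked (dropped) while the average
+   stays between the two thresholds. */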
+
+struct red_sched_data
+{
+/* Parameters */
+ unsigned long qmaxbytes; /* HARD maximal queue length */
+ unsigned long qth_min; /* Min average length threshold: A scaled */
+ unsigned long qth_max; /* Max average length threshold: A scaled */
+ char Alog; /* Point position in average lengths */
+ char Wlog; /* log(W) */
+ char Rlog; /* random number bits */
+ char C1log; /* log(1/C1) */
+ char Slog;
+ char Stab[256];
+
+/* Variables */
+ unsigned long qbytes; /* Queue length in bytes */
+ unsigned long qave; /* Average queue length: A scaled */
+ int qcount; /* Packets since last random number generation */
+	unsigned	qR;		/* Cached random number [0..1<<Rlog) */
+ psched_time_t qidlestart; /* Start of idle period */
+};
+
+/* Stolen from igmp.c. */
+
+static __inline__ unsigned red_random(int log)
+{
+ static unsigned long seed=152L;
+ seed=seed*69069L+1;
+ return (seed^jiffies)&((1<<log)-1);
+}
+
+static int
+red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct red_sched_data *q = (struct red_sched_data *)sch->data;
+
+ psched_time_t now;
+
+ if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
+ long us_idle;
+		PSCHED_GET_TIME(now);
+		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, (256<<q->Slog)-1, 0);
+		PSCHED_SET_PASTPERFECT(q->qidlestart);
+
+/* This is not quite right, but I do not think that the SF+VJ proposal is
+   reasonable either, and I have not invented anything more clever 8)
+
+   The problem: ideally, the average queue length recalculation should
+   be done over constant clock intervals. That is too expensive, so the
+   calculation is driven by outgoing packets.
+   When the queue is idle we have to model this clock by hand.
+
+   SF+VJ proposed to "generate" m = (idletime/bandwidth)*average_pkt_size
+   dummy packets as a burst after the idle time, i.e.
+
+	q->qave *= (1-W)^m
+
+   This is an apparently overcomplicated solution (f.e. we would have to
+   precompute a table to make this calculation in reasonable time).
+   I believe that a simpler model may be used here,
+   but it is a field for experiments.
+*/
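+/* Reading of the decay below (an interpretation, not from the original
+   comment): Stab[] maps the quantized idle time us_idle>>Slog to a number
+   of right shifts of qave, so a suitably filled table approximates the
+   exponential decay (1-W)^m by the nearest power of 1/2. */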
+ q->qave >>= q->Stab[(us_idle>>q->Slog)&0xFF];
+ }
+
+ q->qave += ((q->qbytes<<q->Alog) - q->qave) >> q->Wlog;
+
+ if (q->qave < q->qth_min) {
+enqueue:
+ q->qcount = -1;
+ if (q->qbytes <= q->qmaxbytes) {
+ skb_queue_tail(&sch->q, skb);
+ q->qbytes += skb->len;
+ return 1;
+ }
+drop:
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+ }
+ if (q->qave >= q->qth_max) {
+ q->qcount = -1;
+ goto drop;
+ }
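+	/* Between the thresholds: mark with probability proportional to
+	   (qave - qth_min). The test below is the divisionless form from the
+	   RED paper: qR is uniform in [0..2^Rlog) and C1log folds the
+	   max_P/(th_max-th_min) slope into a shift, so no per-packet
+	   division is needed; qcount spaces the marks roughly evenly. */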
+	if (++q->qcount) {
+ if ((((q->qave - q->qth_min)*q->qcount)>>q->C1log) < q->qR)
+ goto enqueue;
+ q->qcount = 0;
+ q->qR = red_random(q->Rlog);
+ goto drop;
+ }
+ q->qR = red_random(q->Rlog);
+ goto enqueue;
+}
+
+static struct sk_buff *
+red_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct red_sched_data *q = (struct red_sched_data *)sch->data;
+
+ skb = skb_dequeue(&sch->q);
+ if (skb) {
+ q->qbytes -= skb->len;
+ return skb;
+ }
+ PSCHED_GET_TIME(q->qidlestart);
+ return NULL;
+}
+
+static void
+red_reset(struct Qdisc* sch)
+{
+ struct red_sched_data *q = (struct red_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ while((skb=skb_dequeue(&sch->q))!=NULL) {
+ q->qbytes -= skb->len;
+ kfree_skb(skb,FREE_WRITE);
+ }
+ if (q->qbytes) {
+ printk("red_reset: qbytes=%lu\n", q->qbytes);
+ q->qbytes = 0;
+ }
+ PSCHED_SET_PASTPERFECT(q->qidlestart);
+ q->qave = 0;
+ q->qcount = -1;
+}
+
+static int red_init(struct Qdisc *sch, struct pschedctl *pctl)
+{
+ struct red_sched_data *q;
+ struct redctl *ctl = (struct redctl*)pctl->args;
+
+ q = (struct red_sched_data *)sch->data;
+
+ if (pctl->arglen < sizeof(struct redctl))
+ return -EINVAL;
+
+ q->Wlog = ctl->Wlog;
+ q->Alog = ctl->Alog;
+ q->Rlog = ctl->Rlog;
+ q->C1log = ctl->C1log;
+ q->Slog = ctl->Slog;
+ q->qth_min = ctl->qth_min;
+ q->qth_max = ctl->qth_max;
+ q->qmaxbytes = ctl->qmaxbytes;
+ memcpy(q->Stab, ctl->Stab, 256);
+
+ q->qcount = -1;
+ PSCHED_SET_PASTPERFECT(q->qidlestart);
+ return 0;
+}
+
+struct Qdisc_ops red_ops =
+{
+ NULL,
+ "red",
+ 0,
+ sizeof(struct red_sched_data),
+ red_enqueue,
+ red_dequeue,
+ red_reset,
+ NULL,
+ red_init,
+ NULL
+};
+
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&red_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
new file mode 100644
index 000000000..65c3906b4
--- /dev/null
+++ b/net/sched/sch_sfq.c
@@ -0,0 +1,333 @@
+/*
+ * net/sched/sch_sfq.c Stochastic Fairness Queueing scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Stochastic Fairness Queuing algorithm.
+ =======================================
+
+ Source:
+ Paul E. McKenney "Stochastic Fairness Queuing",
+ IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
+
+ Paul E. McKenney "Stochastic Fairness Queuing",
+ "Interworking: Research and Experience", v.2, 1991, p.113-131.
+
+
+ See also:
+ M. Shreedhar and George Varghese "Efficient Fair
+ Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
+
+
+	It is not the thing that is usually called (W)FQ nowadays. It does not
+ use any timestamp mechanism, but instead processes queues
+ in round-robin order.
+
+ ADVANTAGE:
+
+ - It is very cheap. Both CPU and memory requirements are minimal.
+
+ DRAWBACKS:
+
+	- "Stochastic" -> It is not 100% fair.
+	When hash collisions occur, several flows are considered as one.
+
+	- "Round-robin" -> It introduces larger delays than virtual clock
+	based schemes, and should not be used for isolating interactive
+	traffic from non-interactive traffic. This means that this scheduler
+	should be used as a leaf of CBQ or P3, which put interactive traffic
+	into the higher priority band.
+
+ We still need true WFQ for top level CSZ, but using WFQ
+ for the best effort traffic is absolutely pointless:
+ SFQ is superior for this purpose.
+
+	IMPLEMENTATION:
+	This implementation limits the maximal queue length to 128;
+	the maximal mtu to 2^15-1; and the number of hash buckets to 1024.
+	The only goal of these restrictions was that all the data
+	fit into one 4K page :-). Struct sfq_sched_data is
+	organized in an anti-cache manner: all the data for a bucket
+	are scattered over different locations. It is not good,
+	but it allowed everything to be put into 4K.
+
+	It is easy to increase these values.
+*/
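+/* Dequeue side in brief (see sfq_dequeue below): active slots form a ring
+   through next[]; service starts at the slot after tail. A newly activated
+   slot gets one quantum of allotment; a slot keeps its turn until the
+   allotment is exhausted by packet lengths, then it receives another
+   quantum and the turn passes on -- essentially the deficit round robin
+   scheme referenced above. */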
+
+#define SFQ_DEPTH 128
+#define SFQ_HASH_DIVISOR 1024
+
+#define SFQ_HASH(a) 0
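+/* NB: the hash above is a stub -- every packet falls into bucket 0, so all
+   traffic currently shares one slot. A real classifier would presumably
+   hash flow identifiers (addresses/ports) modulo SFQ_HASH_DIVISOR; none is
+   wired up in this snapshot. */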
+
+/* This type should contain at least SFQ_DEPTH*2 values */
+typedef unsigned char sfq_index;
+
+struct sfq_head
+{
+ sfq_index next;
+ sfq_index prev;
+};
+
+struct sfq_sched_data
+{
+/* Parameters */
+ unsigned quantum; /* Allotment per round: MUST BE >= MTU */
+
+/* Variables */
+ sfq_index tail; /* Index of current slot in round */
+ sfq_index max_depth; /* Maximal depth */
+
+ sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
+ sfq_index next[SFQ_DEPTH]; /* Active slots link */
+ short allot[SFQ_DEPTH]; /* Current allotment per slot */
+ unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
+ struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
+ struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */
+};
+
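+/* Layout of dep[] (used by sfq_link/sfq_inc/sfq_dec below): entries
+   0..SFQ_DEPTH-1 belong to the slots themselves, while entry SFQ_DEPTH+d
+   heads the doubly linked list of all slots whose queue currently holds
+   d packets; sfq_drop() walks the deepest list to pick a victim. */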
+extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+	int d = q->qs[x].qlen + SFQ_DEPTH;
+
+ p = d;
+ n = q->dep[d].next;
+ q->dep[x].next = n;
+ q->dep[x].prev = p;
+ q->dep[p].next = q->dep[n].prev = x;
+}
+
+extern __inline__ void sfq_dec(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+
+ n = q->dep[x].next;
+ p = q->dep[x].prev;
+ q->dep[p].next = n;
+ q->dep[n].prev = p;
+
+ if (n == p && q->max_depth == q->qs[x].qlen + 1)
+ q->max_depth--;
+
+ sfq_link(q, x);
+}
+
+extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+ int d;
+
+ n = q->dep[x].next;
+ p = q->dep[x].prev;
+ q->dep[p].next = n;
+ q->dep[n].prev = p;
+ d = q->qs[x].qlen;
+ if (q->max_depth < d)
+ q->max_depth = d;
+
+ sfq_link(q, x);
+}
+
+static __inline__ void sfq_drop(struct sfq_sched_data *q)
+{
+ struct sk_buff *skb;
+ sfq_index d = q->max_depth;
+
+ /* Queue is full! Find the longest slot and
+ drop a packet from it */
+
+ if (d != 1) {
+		sfq_index x = q->dep[d + SFQ_DEPTH].next;
+ skb = q->qs[x].prev;
+ __skb_unlink(skb, &q->qs[x]);
+ kfree_skb(skb, FREE_WRITE);
+ sfq_dec(q, x);
+/*
+ sch->q.qlen--;
+ */
+ return;
+ }
+
+ /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
+
+ d = q->next[q->tail];
+ q->next[q->tail] = q->next[d];
+ q->allot[q->next[d]] += q->quantum;
+ skb = q->qs[d].prev;
+ __skb_unlink(skb, &q->qs[d]);
+ kfree_skb(skb, FREE_WRITE);
+ sfq_dec(q, d);
+/*
+ sch->q.qlen--;
+ */
+ q->ht[q->hash[d]] = SFQ_DEPTH;
+ return;
+}
+
+static int
+sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
+ unsigned hash = SFQ_HASH(skb);
+ sfq_index x;
+
+ x = q->ht[hash];
+ if (x == SFQ_DEPTH) {
+ q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
+ q->hash[x] = hash;
+ }
+ __skb_queue_tail(&q->qs[x], skb);
+ sfq_inc(q, x);
+ if (q->qs[x].qlen == 1) { /* The flow is new */
+ if (q->tail == SFQ_DEPTH) { /* It is the first flow */
+ q->tail = x;
+ q->next[x] = x;
+ q->allot[x] = q->quantum;
+ } else {
+ q->next[x] = q->next[q->tail];
+ q->next[q->tail] = x;
+ q->tail = x;
+ }
+ }
+ if (++sch->q.qlen < SFQ_DEPTH-1)
+ return 1;
+
+ sfq_drop(q);
+ return 0;
+}
+
+static struct sk_buff *
+sfq_dequeue(struct Qdisc* sch)
+{
+ struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
+ struct sk_buff *skb;
+ sfq_index a, old_a;
+
+ /* No active slots */
+ if (q->tail == SFQ_DEPTH)
+ return NULL;
+
+ a = old_a = q->next[q->tail];
+
+ /* Grab packet */
+ skb = __skb_dequeue(&q->qs[a]);
+ sfq_dec(q, a);
+ sch->q.qlen--;
+
+ /* Is the slot empty? */
+ if (q->qs[a].qlen == 0) {
+ a = q->next[a];
+ if (a == old_a) {
+ q->tail = SFQ_DEPTH;
+ return skb;
+ }
+ q->next[q->tail] = a;
+ q->allot[a] += q->quantum;
+ } else if ((q->allot[a] -= skb->len) <= 0) {
+ q->tail = a;
+ a = q->next[a];
+ q->allot[a] += q->quantum;
+ }
+ return skb;
+}
+
+static void
+sfq_reset(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = sfq_dequeue(sch)) != NULL)
+ kfree_skb(skb, FREE_WRITE);
+}
+
+
+static int sfq_open(struct Qdisc *sch, void *arg)
+{
+ struct sfq_sched_data *q;
+ int i;
+
+ q = (struct sfq_sched_data *)sch->data;
+
+ for (i=0; i<SFQ_HASH_DIVISOR; i++)
+ q->ht[i] = SFQ_DEPTH;
+ for (i=0; i<SFQ_DEPTH; i++) {
+ skb_queue_head_init(&q->qs[i]);
+ q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
+ q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
+ }
+ q->max_depth = 0;
+ q->tail = SFQ_DEPTH;
+ q->quantum = sch->dev->mtu;
+ if (sch->dev->hard_header)
+ q->quantum += sch->dev->hard_header_len;
+ for (i=0; i<SFQ_DEPTH; i++)
+ sfq_link(q, i);
+ return 0;
+}
+
+
+struct Qdisc_ops sfq_ops =
+{
+ NULL,
+ "sfq",
+ 0,
+ sizeof(struct sfq_sched_data),
+ sfq_enqueue,
+ sfq_dequeue,
+ sfq_reset,
+ NULL,
+ sfq_open,
+};
+
+#ifdef MODULE
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&sfq_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
new file mode 100644
index 000000000..9869af1d3
--- /dev/null
+++ b/net/sched/sch_tbf.c
@@ -0,0 +1,252 @@
+/*
+ * net/sched/sch_tbf.c Token Bucket Filter.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Simple Token Bucket Filter.
+ =======================================
+
+ SOURCE.
+
+ None.
+
+ ALGORITHM.
+
+	A sequence of packets satisfies a token bucket filter with
+	rate $r$ and depth $b$, if all the numbers defined by:
+	\begin{eqnarray*}
+	n_0 &=& b, \\
+	n_i &=& {\rm min} ( b, n_{i-1} + r*(t_i-t_{i-1}) - L_i ),
+	\end{eqnarray*}
+	where $t_i$ is the departure time of the $i$-th packet and
+	$L_i$ is its length, are never less than zero.
+
+	It is convenient to rescale $n_i$ by the factor $1/r$, so
+	that the sequence has the "canonical" form:
+	\[
+	n_0 = b/r,
+	n_i = min { b/r, n_{i-1} + t_i - t_{i-1} - L_i/r },
+	\]
+
+	If a packet yields n_i < 0, we throttle the filter
+	by $-n_i$ usecs.
+
+ NOTES.
+
+	If TBF throttles, it starts a watchdog timer, which will wake it up
+	after 0...10 msec.
+	If no new packets arrive during this period,
+	or the device is not woken by the EOI for the previous packet,
+	tbf could stop its activity for 10 msec.
+
+	It means that tbf will sometimes introduce pathological
+	10 msec delays to a flow, corresponding to rate*10msec bytes.
+	For a 10Mbit/sec flow that is about 12Kb, for 100Mbit/sec -- ~100Kb.
+	This number puts a reasonable lower bound on the token bucket depth,
+	but even if the depth is larger, traffic is erratic at large rates.
+
+	This problem is not specific to THIS implementation. Really,
+	there is a statement that any attempt to shape traffic
+	in transit will increase delays and jitter much more than
+	one would naively expect.
+
+	In particular, it means that delay/jitter sensitive traffic
+	MUST NOT be shaped. Cf. the CBQ (wrong) and CSZ (correct) approaches.
+*/
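+/* A worked example of the token arithmetic in tbf_dequeue() below
+   (illustrative numbers, not defaults). With everything rescaled to usecs
+   of transmission time, let depth = 10000 and tokens = 0. If the next
+   packet needs L2T = 3000 and only 2000 usecs have elapsed since t_c, then
+   toks = 2000 + 0 - 3000 = -1000 < 0: the packet is put back and the
+   watchdog sleeps ~1000 usecs. Had 4000 usecs elapsed, toks = 1000 >= 0
+   and the packet is sent with 1000 tokens left (capped at the depth). */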
+
+struct tbf_sched_data
+{
+/* Parameters */
+ int cell_log; /* 1<<cell_log is quantum of packet size */
+ unsigned long L_tab[256]; /* Lookup table for L/B values */
+ unsigned long depth; /* Token bucket depth/B: MUST BE >= MTU/B */
+ unsigned long max_bytes; /* Maximal length of backlog: bytes */
+
+/* Variables */
+ unsigned long bytes; /* Current length of backlog */
+ unsigned long tokens; /* Current number of tokens */
+ psched_time_t t_c; /* Time check-point */
+ struct timer_list wd_timer; /* Watchdog timer */
+};
+
+#define L2T(q,L) ((q)->L_tab[(L)>>(q)->cell_log])
+
+static int
+tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+ __skb_queue_tail(&sch->q, skb);
+ if ((q->bytes += skb->len) <= q->max_bytes)
+ return 1;
+
+ /* Drop action: undo the things that we just made,
+ * i.e. make tail drop
+ */
+
+ __skb_unlink(skb, &sch->q);
+ q->bytes -= skb->len;
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+}
+
+static void tbf_watchdog(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc*)arg;
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+ q->wd_timer.function = NULL;
+
+ qdisc_wakeup(sch->dev);
+}
+
+
+static struct sk_buff *
+tbf_dequeue(struct Qdisc* sch)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ skb = __skb_dequeue(&sch->q);
+
+ if (skb) {
+ psched_time_t now;
+ long toks;
+
+ PSCHED_GET_TIME(now);
+
+ toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->depth, 0)
+ + q->tokens - L2T(q,skb->len);
+
+ if (toks >= 0) {
+ q->t_c = now;
+ q->tokens = toks <= q->depth ? toks : q->depth;
+ q->bytes -= skb->len;
+ return skb;
+ }
+
+		/* Maybe we have a shorter packet in the queue,
+		   which could be sent now. It sounds cool,
+		   but it is nevertheless wrong in principle.
+		   We MUST NOT reorder packets in these circumstances.
+
+		   Really, if we split the flow into independent
+		   subflows, it would be a very good solution.
+		   Look at sch_csz.c.
+ */
+ __skb_queue_head(&sch->q, skb);
+
+ if (!sch->dev->tbusy) {
+ if (q->wd_timer.function)
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = tbf_watchdog;
+ q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(-toks);
+ add_timer(&q->wd_timer);
+ }
+ }
+ return NULL;
+}
+
+
+static void
+tbf_reset(struct Qdisc* sch)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sch->q)) != NULL)
+ kfree_skb(skb, FREE_WRITE);
+ q->bytes = 0;
+ PSCHED_GET_TIME(q->t_c);
+ q->tokens = q->depth;
+ if (q->wd_timer.function) {
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = NULL;
+ }
+}
+
+static int tbf_init(struct Qdisc* sch, void *arg)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+ struct tbfctl *ctl = (struct tbfctl*)arg;
+
+ PSCHED_GET_TIME(q->t_c);
+ init_timer(&q->wd_timer);
+ q->wd_timer.function = NULL;
+ q->wd_timer.data = (unsigned long)sch;
+ if (ctl) {
+ q->max_bytes = ctl->bytes;
+ q->depth = ctl->depth;
+		q->tokens = q->depth;
+ q->cell_log = ctl->cell_log;
+ memcpy(q->L_tab, ctl->L_tab, 256*sizeof(unsigned long));
+ }
+ return 0;
+}
+
+struct Qdisc_ops tbf_ops =
+{
+ NULL,
+ "tbf",
+ 0,
+ sizeof(struct tbf_sched_data),
+ tbf_enqueue,
+ tbf_dequeue,
+ tbf_reset,
+ NULL,
+ tbf_init,
+ NULL,
+};
+
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&tbf_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif