path: root/net/sched
author	Ralf Baechle <ralf@linux-mips.org>	1997-12-16 06:06:25 +0000
committer	Ralf Baechle <ralf@linux-mips.org>	1997-12-16 06:06:25 +0000
commit	aa944aa3453e47706685bc562711a9e87375941e (patch)
tree	8fb37a65f205a90412917ca2b91c429263ef1790 /net/sched
parent	967c65a99059fd459b956c1588ce0ba227912c4e (diff)
Merge with Linux 2.1.72, part 2.
The new signal code, with the exception of the code for the rt signals. The definitions in <asm/siginfo.h> and <asm/ucontext.h> are currently just stolen from the Alpha and will need to be overhauled.
Diffstat (limited to 'net/sched')
-rw-r--r--	net/sched/.cvsignore	1
-rw-r--r--	net/sched/Makefile	71
-rw-r--r--	net/sched/sch_cbq.c	839
-rw-r--r--	net/sched/sch_csz.c	832
-rw-r--r--	net/sched/sch_fifo.c	179
-rw-r--r--	net/sched/sch_generic.c	541
-rw-r--r--	net/sched/sch_prio.c	146
-rw-r--r--	net/sched/sch_red.c	303
-rw-r--r--	net/sched/sch_sfq.c	333
-rw-r--r--	net/sched/sch_tbf.c	252
10 files changed, 3497 insertions, 0 deletions
diff --git a/net/sched/.cvsignore b/net/sched/.cvsignore
new file mode 100644
index 000000000..4671378ae
--- /dev/null
+++ b/net/sched/.cvsignore
@@ -0,0 +1 @@
+.depend
diff --git a/net/sched/Makefile b/net/sched/Makefile
new file mode 100644
index 000000000..cbb6704c1
--- /dev/null
+++ b/net/sched/Makefile
@@ -0,0 +1,71 @@
+#
+# Makefile for the Linux Traffic Control Unit.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+# Note 2! The CFLAGS definition is now in the main makefile...
+
+O_TARGET := sched.o
+
+O_OBJS := sch_generic.o
+
+ifeq ($(CONFIG_NET_SCH_CBQ), y)
+O_OBJS += sch_cbq.o
+else
+ ifeq ($(CONFIG_NET_SCH_CBQ), m)
+ M_OBJS += sch_cbq.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_CSZ), y)
+O_OBJS += sch_csz.o
+else
+ ifeq ($(CONFIG_NET_SCH_CSZ), m)
+ M_OBJS += sch_csz.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_SFQ), y)
+O_OBJS += sch_sfq.o
+else
+ ifeq ($(CONFIG_NET_SCH_SFQ), m)
+ M_OBJS += sch_sfq.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_RED), y)
+O_OBJS += sch_red.o
+else
+ ifeq ($(CONFIG_NET_SCH_RED), m)
+ M_OBJS += sch_red.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_TBF), y)
+O_OBJS += sch_tbf.o
+else
+ ifeq ($(CONFIG_NET_SCH_TBF), m)
+ M_OBJS += sch_tbf.o
+ endif
+endif
+
+
+ifeq ($(CONFIG_NET_SCH_PFIFO), y)
+O_OBJS += sch_fifo.o
+else
+ ifeq ($(CONFIG_NET_SCH_PFIFO), m)
+ M_OBJS += sch_fifo.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_SCH_PRIO), y)
+O_OBJS += sch_prio.o
+else
+ ifeq ($(CONFIG_NET_SCH_PRIO), m)
+ M_OBJS += sch_prio.o
+ endif
+endif
+
+include $(TOPDIR)/Rules.make
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
new file mode 100644
index 000000000..626afe555
--- /dev/null
+++ b/net/sched/sch_cbq.c
@@ -0,0 +1,839 @@
+/*
+ * net/sched/sch_cbq.c Class-Based Queueing discipline.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+/* Class-Based Queueing (CBQ) algorithm.
+ =======================================
+
+ Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
+ Management Models for Packet Networks",
+ IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
+
+	    [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
+
+ [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
+ Parameters", 1996
+
+   Algorithm skeleton is taken from the NS simulator cbq.cc.
+
+ -----------------------------------------------------------------------
+
+ Differences from NS version.
+
+   --- The WRR algorithm is different. Our version looks more reasonable :-)
+   and fair when quanta are allowed to be less than MTU.
+
+   --- cl->avgidle is REALLY limited from below by cl->minidle.
+   It seems this was a bug in NS.
+
+ --- Purely lexical change: "depth" -> "level", "maxdepth" -> "toplevel".
+   When depth increases, we expect that the thing becomes lower, doesn't it? :-)
+ Besides that, "depth" word is semantically overloaded ---
+ "token bucket depth", "sfq depth"... Besides that, the algorithm
+ was called "top-LEVEL sharing".
+
+ PROBLEM.
+
+   --- Linux has no EOI event at the moment, so we cannot
+   estimate the true class idle time. Three workarounds are possible;
+   all of them have drawbacks:
+
+ 1. (as now) Consider the next dequeue event as sign that
+ previous packet is finished. It is wrong because of ping-pong
+ buffers, but on permanently loaded link it is true.
+   2. (NS approach) Use skb->len/"physical bandwidth" as the link busy
+   time estimate. Even more wrong: e.g. on Ethernet the real busy time is
+   much higher because of collisions.
+   3. (seems the most clever) Split net bh into two parts:
+   NETRX_BH (for received packets) and preserve NET_BH for the transmitter.
+   It will not require driver changes (the NETRX_BH flag will be set
+   in netif_rx), but will allow us to trace EOIs more precisely
+   and will save useless checks in net_bh. Besides that, we will
+   have to eliminate random calls to hard_start_xmit with the dev->tbusy
+   flag set (done) and to drop failure_q --- i.e. if !dev->tbusy then
+   hard_start_xmit MUST succeed; failed packets will be dropped on the floor.
+*/
+
+#define CBQ_TOPLEVEL_SHARING
+/* #define CBQ_NO_TRICKERY */
+
+#define CBQ_CLASSIFIER(skb, q) ((q)->fallback_class)
+
+struct cbq_class
+{
+/* Parameters */
+ int priority; /* priority */
+#ifdef CBQ_TOPLEVEL_SHARING
+ int level; /* level of the class in hierarchy:
+ 0 for leaf classes, and maximal
+					   level of children + 1 for nodes.
+ */
+#endif
+
+	long		maxidle;	/* Class parameters: see below. */
+ long minidle;
+ int filter_log;
+#ifndef CBQ_NO_TRICKERY
+ long extradelay;
+#endif
+
+ long quantum; /* Allotment per WRR round */
+ long rquantum; /* Relative allotment: see below */
+
+ int cell_log;
+ unsigned long L_tab[256];
+
+ struct Qdisc *qdisc; /* ptr to CBQ discipline */
+ struct cbq_class *root; /* Ptr to root class;
+					   the root is not necessarily unique.
+ */
+ struct cbq_class *parent; /* Ptr to parent in the class tree */
+ struct cbq_class *borrow; /* NULL if class is bandwidth limited;
+ parent otherwise */
+
+ struct Qdisc *q; /* Elementary queueing discipline */
+ struct cbq_class *next; /* next class in this priority band */
+
+ struct cbq_class *next_alive; /* next class with backlog in this priority band */
+
+/* Variables */
+ psched_time_t last;
+ psched_time_t undertime;
+ long avgidle;
+ long deficit; /* Saved deficit for WRR */
+ char awake; /* Class is in alive list */
+
+#if 0
+ void (*overlimit)(struct cbq_class *cl);
+#endif
+};
+
+#define L2T(cl,len) ((cl)->L_tab[(len)>>(cl)->cell_log])
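The L2T() macro above turns a packet length into a transmission time by indexing a 256-entry per-class table, with one cell per 2^cell_log bytes. The patch leaves filling L_tab to configuration, so the routine below is only an illustrative sketch; it assumes the table holds microseconds and that `rate` is given in bytes per second, neither of which is specified by this code.

/* Illustrative only: build a length -> transmission-time table so that
 * L2T(cl, len) yields roughly the microseconds needed to send len bytes
 * at `rate` bytes/sec.  Cell i covers lengths up to (i + 1) << cell_log
 * bytes; storing the cell's upper bound never under-charges a packet.
 * (32-bit overflow is ignored for brevity.)
 */
static void example_fill_L_tab(unsigned long *L_tab, int cell_log,
			       unsigned long rate)
{
	int i;

	for (i = 0; i < 256; i++) {
		unsigned long bytes = (unsigned long)(i + 1) << cell_log;

		L_tab[i] = (bytes * 1000000UL) / rate;
	}
}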
+
+struct cbq_sched_data
+{
+ struct cbq_class *classes[CBQ_MAXPRIO]; /* List of all classes */
+ int nclasses[CBQ_MAXPRIO];
+ unsigned quanta[CBQ_MAXPRIO];
+ unsigned mtu;
+ int cell_log;
+ unsigned long L_tab[256];
+ struct cbq_class *fallback_class;
+
+ unsigned activemask;
+ struct cbq_class *active[CBQ_MAXPRIO]; /* List of all classes
+ with backlog */
+ struct cbq_class *last_sent;
+ int last_sent_len;
+
+ psched_time_t now; /* Cached timestamp */
+
+	struct timer_list wd_timer;	/* Watchdog timer, started
+					   when CBQ has backlog
+					   but cannot transmit
+					   just now */
+ unsigned long wd_expires;
+#ifdef CBQ_TOPLEVEL_SHARING
+ struct cbq_class *borrowed;
+ int toplevel;
+#endif
+};
+
+/*
+ WRR quanta
+ ----------
+
+   cl->quantum is the amount added to the class allotment on every round.
+ cl->rquantum is "relative" quantum.
+
+ For real-time classes:
+
+ cl->quantum = (cl->rquantum*q->nclasses[prio]*q->mtu)/q->quanta[prio]
+
+ where q->quanta[prio] is sum of all rquanta for given priority.
+ cl->rquantum can be identified with absolute rate of the class
+ in arbitrary units (f.e. bytes/sec)
+
+ In this case, delay introduced by round-robin was estimated by
+ Sally Floyd [2] as:
+
+ D = q->nclasses*q->mtu/(bandwidth/2)
+
+   Note that D does not depend on the class rate (which is very bad),
+   but it is not much worse than the Gallager-Parekh estimate for CSZ,
+   C/R = q->mtu/rate, when the real-time classes have close rates.
+
+   For non-real-time classes this formula is not necessary,
+   so cl->quantum can be set to any reasonable nonzero value.
+   Apparently, it should be proportional to the class rate, if the
+   rate is not zero.
+*/
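A small worked instance of the quantum formula above, with arbitrary illustrative numbers:

/* Example: one priority band with three real-time classes whose rquantum
 * values are 10, 20 and 70 (arbitrary rate units), and q->mtu = 1500.
 * Then q->quanta[prio] = 100 and q->nclasses[prio] = 3, so
 *
 *	quantum = rquantum * 3 * 1500 / 100
 *
 * giving per-round allotments of 450, 900 and 3150 bytes.  The allotments
 * are proportional to the configured rates and sum to nclasses * mtu = 4500,
 * consistent with the round length used in the delay estimate D above.
 */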
+
+/*
+ maxidle, minidle, extradelay
+ ----------------------------
+
+   The CBQ estimator calculates the smoothed class idle time cl->avgidle,
+   considering the class as a virtual interface with the corresponding bandwidth.
+   When cl->avgidle wants to be less than zero, the class is overlimit.
+   When it is positive, the class is underlimit.
+
+   * maxidle bounds avgidle from above.
+     It controls the maximal length of burst in this class after
+     a long period of idle time. Burstiness of an active class
+     is controlled by the filter constant cl->filter_log,
+     but this number is related to burst length only indirectly.
+
+   * minidle is a negative number, normally set to zero.
+     Setting it to a nonzero value allows avgidle to drop
+     below zero, effectively penalizing the class when it is overlimit.
+     When the class load decreases, it will take some time for the
+     negative avgidle to rise back and put the class at its limit.
+     It should be set to zero for leaf classes.
+
+   * extradelay is a delay penalty applied when a class goes overlimit.
+     I believe this parameter is useless and confusing.
+     Setting it to a nonzero value forces the class to accumulate
+     its "idleness" for extradelay and then send a BURST of packets
+     until it goes overlimit again. Nonsense.
+
+ For details see [1] and [3].
+
+   Really, minidle and extradelay are irrelevant to the real scheduling
+   task. As I understand it, SF&VJ introduced them to experiment
+   with the CBQ simulator in attempts to fix the erratic behaviour
+   of the ancestor-only (and, partially, top-level) algorithm.
+
+ WARNING.
+
+   The user passes them measured in usecs, but cl->minidle,
+   cl->maxidle and cl->avgidle are scaled with cl->filter_log
+   inside the scheduler.
+*/
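The scaling described in the warning is what cbq_update() below implements; a minimal standalone sketch of the same update, with purely illustrative numbers (filter_log = 3, i.e. W = 1/8):

/* true_avgidle' = (1 - W) * true_avgidle + W * idle, with W = 2^-filter_log.
 * Storing avgidle = true_avgidle / W turns this into an integer-only update:
 */
static long example_avgidle_update(long avgidle, long idle, int filter_log)
{
	return avgidle + idle - (avgidle >> filter_log);
}

/* E.g. with filter_log = 3, avgidle = 800 (true value 100) and idle = 40:
 * new avgidle = 800 + 40 - 100 = 740, i.e. true value 92.5,
 * which is exactly (7/8) * 100 + (1/8) * 40.
 */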
+
+/*
+   A packet has just been enqueued to an empty class.
+   cbq_wakeup_class adds the class to the tail of the active class list
+   of its priority band.
+ */
+
+static __inline__ void cbq_wakeup_class(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
+ int prio = cl->priority;
+ struct cbq_class *cl_tail;
+
+ cl->awake = 1;
+
+ cl_tail = q->active[prio];
+ q->active[prio] = cl;
+
+ if (cl_tail != NULL) {
+ cl->next_alive = cl_tail->next_alive;
+ cl->deficit = 0;
+ } else {
+ cl->next_alive = cl;
+ q->activemask |= (1<<prio);
+ cl->deficit = cl->quantum;
+ }
+}
+
+static int
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct cbq_class *cl = CBQ_CLASSIFIER(skb, q);
+
+ if (cl->q->enqueue(skb, cl->q) == 1) {
+ sch->q.qlen++;
+
+#ifdef CBQ_TOPLEVEL_SHARING
+ if (q->toplevel > 0) {
+ psched_time_t now;
+ PSCHED_GET_TIME(now);
+ if (PSCHED_TLESS(cl->undertime, now))
+ q->toplevel = 0;
+ else if (q->toplevel > 1 && cl->borrow &&
+ PSCHED_TLESS(cl->borrow->undertime, now))
+ q->toplevel = 1;
+ }
+#endif
+ if (!cl->awake)
+ cbq_wakeup_class(cl);
+ return 1;
+ }
+ return 0;
+}
+
+static __inline__ void cbq_delay(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+ long delay;
+
+ delay = PSCHED_TDIFF(cl->undertime, q->now);
+ if (q->wd_expires == 0 || q->wd_expires - delay > 0)
+ q->wd_expires = delay;
+}
+
+static void cbq_watchdog(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc*)arg;
+ struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
+
+ q->wd_timer.expires = 0;
+ q->wd_timer.function = NULL;
+ qdisc_wakeup(sch->dev);
+}
+
+static __inline__ void
+cbq_update(struct cbq_sched_data *q)
+{
+ struct cbq_class *cl;
+
+ for (cl = q->last_sent; cl; cl = cl->parent) {
+ long avgidle = cl->avgidle;
+ long idle;
+
+ /*
+ (now - last) is total time between packet right edges.
+ (last_pktlen/rate) is "virtual" busy time, so that
+
+ idle = (now - last) - last_pktlen/rate
+ */
+
+ idle = PSCHED_TDIFF(q->now, cl->last)
+ - L2T(cl, q->last_sent_len);
+
+ /* true_avgidle := (1-W)*true_avgidle + W*idle,
+ where W=2^{-filter_log}. But cl->avgidle is scaled:
+ cl->avgidle == true_avgidle/W,
+ hence:
+ */
+ avgidle += idle - (avgidle>>cl->filter_log);
+
+ if (avgidle <= 0) {
+ /* Overlimit or at-limit */
+#ifdef CBQ_NO_TRICKERY
+ avgidle = 0;
+#else
+ if (avgidle < cl->minidle)
+ avgidle = cl->minidle;
+#endif
+
+ /* This line was missing in NS. */
+ cl->avgidle = avgidle;
+
+ /* Calculate expected time, when this class
+ will be allowed to send.
+ It will occur, when:
+ (1-W)*true_avgidle + W*delay = 0, i.e.
+ idle = (1/W - 1)*(-true_avgidle)
+ or
+ idle = (1 - W)*(-cl->avgidle);
+
+ That is not all.
+ We want to set undertime to the moment, when
+ the class is allowed to start next transmission i.e.
+ (undertime + next_pktlen/phys_bandwidth)
+ - now - next_pktlen/rate = idle
+ or
+ undertime = now + idle + next_pktlen/rate
+ - next_pktlen/phys_bandwidth
+
+ We do not know next packet length, but can
+ estimate it with average packet length
+ or current packet_length.
+ */
+
+ idle = (-avgidle) - ((-avgidle) >> cl->filter_log);
+			/* next_pktlen/rate - next_pktlen/phys_bandwidth,
+			   as in the derivation above. */
+			idle += L2T(cl, q->last_sent_len);
+			idle -= L2T(q, q->last_sent_len);
+ PSCHED_TADD2(q->now, idle, cl->undertime);
+#ifndef CBQ_NO_TRICKERY
+ /* Do not forget extra delay :-) */
+ PSCHED_TADD(cl->undertime, cl->extradelay);
+#endif
+ } else {
+ /* Underlimit */
+
+ PSCHED_SET_PASTPERFECT(cl->undertime);
+ if (avgidle > cl->maxidle)
+ cl->avgidle = cl->maxidle;
+ else
+ cl->avgidle = avgidle;
+ }
+ cl->last = q->now;
+ }
+
+#ifdef CBQ_TOPLEVEL_SHARING
+ cl = q->last_sent;
+
+ if (q->borrowed && q->toplevel >= q->borrowed->level) {
+ if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, q->borrowed->undertime))
+ q->toplevel = CBQ_MAXLEVEL;
+ else if (q->borrowed != cl)
+ q->toplevel = q->borrowed->level;
+ }
+#endif
+
+ q->last_sent = NULL;
+}
+
+static __inline__ int
+cbq_under_limit(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
+ struct cbq_class *this_cl = cl;
+
+ if (PSCHED_IS_PASTPERFECT(cl->undertime) || cl->parent == NULL)
+ return 1;
+
+ if (PSCHED_TLESS(cl->undertime, q->now)) {
+ q->borrowed = cl;
+ return 1;
+ }
+
+ while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
+ PSCHED_TLESS(q->now, cl->undertime)) {
+ cl = cl->borrow;
+ if (cl == NULL
+#ifdef CBQ_TOPLEVEL_SHARING
+ || cl->level > q->toplevel
+#endif
+ ) {
+#if 0
+ this_cl->overlimit(this_cl);
+#else
+ cbq_delay(q, this_cl);
+#endif
+ return 0;
+ }
+ }
+ q->borrowed = cl;
+ return 1;
+}
+
+static __inline__ struct sk_buff *
+cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct cbq_class *cl_tail, *cl_prev, *cl;
+ struct sk_buff *skb;
+ int deficit;
+
+ cl_tail = cl_prev = q->active[prio];
+ cl = cl_prev->next_alive;
+
+ do {
+ deficit = 0;
+
+ /* Start round */
+ do {
+ /* Class is empty */
+ if (cl->q->q.qlen == 0)
+ goto skip_class;
+
+ if (fallback) {
+ /* Fallback pass: all classes are overlimit;
+ we send from the first class that is allowed
+ to borrow.
+ */
+
+ if (cl->borrow == NULL)
+ goto skip_class;
+ } else {
+ /* Normal pass: check that class is under limit */
+ if (!cbq_under_limit(cl))
+ goto skip_class;
+ }
+
+ if (cl->deficit <= 0) {
+ /* Class exhausted its allotment per this
+ round.
+ */
+ deficit = 1;
+ goto next_class;
+ }
+
+ skb = cl->q->dequeue(cl->q);
+
+ /* Class did not give us any skb :-(
+ It could occur if cl->q == "tbf"
+ */
+ if (skb == NULL)
+ goto skip_class;
+
+ cl->deficit -= skb->len;
+ q->last_sent = cl;
+ q->last_sent_len = skb->len;
+
+ if (cl->deficit <= 0) {
+ q->active[prio] = cl;
+ cl = cl->next_alive;
+ cl->deficit += cl->quantum;
+ }
+ return skb;
+
+skip_class:
+ cl->deficit = 0;
+
+ if (cl->q->q.qlen == 0) {
+ /* Class is empty, declare it dead */
+ cl_prev->next_alive = cl->next_alive;
+ cl->awake = 0;
+
+ /* Did cl_tail point to it? */
+ if (cl == cl_tail) {
+ /* Repair it! */
+ cl_tail = cl_prev;
+
+ /* Was it the last class in this band? */
+ if (cl == cl_tail) {
+ /* Kill the band! */
+ q->active[prio] = NULL;
+ q->activemask &= ~(1<<prio);
+ return NULL;
+ }
+ }
+ }
+
+next_class:
+ cl_prev = cl;
+ cl = cl->next_alive;
+ cl->deficit += cl->quantum;
+ } while (cl_prev != cl_tail);
+ } while (deficit);
+
+ q->active[prio] = cl_prev;
+
+ return NULL;
+}
+
+static __inline__ struct sk_buff *
+cbq_dequeue_1(struct Qdisc *sch, int fallback)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct sk_buff *skb;
+ unsigned activemask;
+
+ activemask = q->activemask;
+ while (activemask) {
+ int prio = ffz(~activemask);
+ activemask &= ~(1<<prio);
+ skb = cbq_dequeue_prio(sch, prio, fallback);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+static struct sk_buff *
+cbq_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+
+ PSCHED_GET_TIME(q->now);
+
+ if (q->last_sent)
+ cbq_update(q);
+
+ q->wd_expires = 0;
+
+ skb = cbq_dequeue_1(sch, 0);
+ if (skb)
+ return skb;
+
+ /* All the classes are overlimit.
+ Search for overlimit class, which is allowed to borrow
+ and use it as fallback case.
+ */
+
+#ifdef CBQ_TOPLEVEL_SHARING
+ q->toplevel = CBQ_MAXLEVEL;
+#endif
+
+ skb = cbq_dequeue_1(sch, 1);
+ if (skb)
+ return skb;
+
+ /* No packets in scheduler or nobody wants to give them to us :-(
+ Sigh... start watchdog timer in the last case. */
+
+ if (sch->q.qlen && q->wd_expires) {
+ if (q->wd_timer.function)
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = cbq_watchdog;
+ q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires);
+ add_timer(&q->wd_timer);
+ }
+ return NULL;
+}
+
+/* CBQ class maintenance routines */
+
+static void cbq_adjust_levels(struct cbq_class *this)
+{
+ struct cbq_class *cl;
+
+ for (cl = this->parent; cl; cl = cl->parent) {
+ if (cl->level > this->level)
+ return;
+ cl->level = this->level + 1;
+ this = cl;
+ }
+}
+
+static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
+{
+ struct cbq_class *cl;
+
+ if (q->quanta[prio] == 0)
+ return;
+
+ for (cl = q->classes[prio]; cl; cl = cl->next) {
+ if (cl->rquantum)
+ cl->quantum = (cl->rquantum*q->mtu*q->nclasses[prio])/
+ q->quanta[prio];
+ }
+}
+
+static __inline__ int cbq_unlink_class(struct cbq_class *this)
+{
+ struct cbq_class *cl, **clp;
+ struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
+
+ for (clp = &q->classes[this->priority]; (cl = *clp) != NULL;
+ clp = &cl->next) {
+ if (cl == this) {
+ *clp = cl->next;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int cbq_prune(struct cbq_class *this)
+{
+ struct cbq_class *cl;
+ int prio = this->priority;
+ struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
+
+ qdisc_reset(this->q);
+
+ if (cbq_unlink_class(this))
+ return -ENOENT;
+
+ if (this->awake) {
+ struct cbq_class *cl_prev = q->active[prio];
+ do {
+ cl = cl_prev->next_alive;
+ if (cl == this) {
+ cl_prev->next_alive = cl->next_alive;
+
+				if (cl == q->active[prio]) {
+					/* Repoint the band tail to the previous
+					   class; if this was the only class,
+					   kill the band. */
+					q->active[prio] = cl_prev;
+					if (cl == cl_prev) {
+						q->active[prio] = NULL;
+						q->activemask &= ~(1<<prio);
+						break;
+					}
+				}
+
+ cl = cl->next_alive;
+ cl->deficit += cl->quantum;
+ break;
+ }
+ } while ((cl_prev = cl) != q->active[prio]);
+ }
+
+ --q->nclasses[prio];
+ if (this->rquantum) {
+ q->quanta[prio] -= this->rquantum;
+ cbq_normalize_quanta(q, prio);
+ }
+
+ if (q->fallback_class == this)
+ q->fallback_class = NULL;
+
+ this->parent = NULL;
+ this->borrow = NULL;
+ this->root = this;
+ this->qdisc = NULL;
+ return 0;
+}
+
+static int cbq_graft(struct cbq_class *this, struct cbq_class *parent)
+{
+ struct cbq_class *cl, **clp;
+ int prio = this->priority;
+ struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
+
+ qdisc_reset(this->q);
+
+
+ for (clp = &q->classes[prio]; (cl = *clp) != NULL; clp = &cl->next) {
+ if (cl == this)
+ return -EBUSY;
+ }
+
+	/* Link "this" (not the NULL scan cursor) into the class list. */
+	this->next = NULL;
+	*clp = this;
+
+	this->parent = parent;
+	this->borrow = parent;
+	this->root = parent ? parent->root : this;
+
+ ++q->nclasses[prio];
+ if (this->rquantum) {
+ q->quanta[prio] += this->rquantum;
+ cbq_normalize_quanta(q, prio);
+ }
+
+ cbq_adjust_levels(this);
+
+ return 0;
+}
+
+
+static void
+cbq_reset(struct Qdisc* sch)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct cbq_class *cl;
+ int prio;
+
+ q->activemask = 0;
+ q->last_sent = NULL;
+ if (q->wd_timer.function) {
+ del_timer(&q->wd_timer);
+ q->wd_timer.expires = 0;
+ q->wd_timer.function = NULL;
+ }
+#ifdef CBQ_TOPLEVEL_SHARING
+ q->toplevel = CBQ_MAXLEVEL;
+#endif
+
+ for (prio = 0; prio < CBQ_MAXPRIO; prio++) {
+ q->active[prio] = NULL;
+
+ for (cl = q->classes[prio]; cl; cl = cl->next) {
+ qdisc_reset(cl->q);
+
+ cl->next_alive = NULL;
+ PSCHED_SET_PASTPERFECT(cl->undertime);
+ cl->avgidle = 0;
+ cl->deficit = 0;
+ cl->awake = 0;
+ }
+ }
+}
+
+static void
+cbq_destroy(struct Qdisc* sch)
+{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+	struct cbq_class *cl, *next;
+ int prio;
+
+ for (prio = 0; prio < CBQ_MAXPRIO; prio++) {
+		for (cl = q->classes[prio]; cl; cl = next) {
+			/* Remember the link before freeing the class. */
+			next = cl->next;
+			qdisc_destroy(cl->q);
+			kfree(cl);
+		}
+ }
+}
+
+static int cbq_control(struct Qdisc *sch, void *arg)
+{
+ struct cbq_sched_data *q;
+
+ q = (struct cbq_sched_data *)sch->data;
+
+ /* Do attachment here. It is the last thing to do. */
+
+ return -EINVAL;
+}
+
+static int cbq_init(struct Qdisc *sch, void *arg)
+{
+ struct cbq_sched_data *q;
+ struct cbqctl *ctl = (struct cbqctl*)arg;
+
+ q = (struct cbq_sched_data *)sch->data;
+ init_timer(&q->wd_timer);
+ q->wd_timer.data = (unsigned long)sch;
+#ifdef CBQ_TOPLEVEL_SHARING
+ q->toplevel = CBQ_MAXLEVEL;
+#endif
+
+ return 0;
+}
+
+
+struct Qdisc_ops cbq_ops =
+{
+ NULL,
+ "cbq",
+ 0,
+ sizeof(struct cbq_sched_data),
+ cbq_enqueue,
+ cbq_dequeue,
+ cbq_reset,
+ cbq_destroy,
+ cbq_init,
+ cbq_control,
+};
+
+#ifdef MODULE
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&cbq_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_csz.c b/net/sched/sch_csz.c
new file mode 100644
index 000000000..dbc05d31b
--- /dev/null
+++ b/net/sched/sch_csz.c
@@ -0,0 +1,832 @@
+/*
+ * net/sched/sch_csz.c Clark-Shenker-Zhang scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Clark-Shenker-Zhang algorithm.
+ =======================================
+
+ SOURCE.
+
+ David D. Clark, Scott Shenker and Lixia Zhang
+ "Supporting Real-Time Applications in an Integrated Services Packet
+ Network: Architecture and Mechanism".
+
+ CBQ presents a flexible universal algorithm for packet scheduling,
+ but it has pretty poor delay characteristics.
+ Round-robin scheduling and link-sharing goals
+	apparently contradict the minimization of network delay and jitter.
+ Moreover, correct handling of predicted flows seems to be
+ impossible in CBQ.
+
+	CSZ presents a more precise but less flexible and less efficient
+	approach. As I understand it, the main idea is to create
+	a WFQ flow for each guaranteed service and to allocate
+	the rest of the bandwidth to the dummy flow-0. Flow-0 comprises
+	the predicted services and the best effort traffic;
+	it is handled by a priority scheduler with the highest
+	priority band allocated for predicted services, and the rest ---
+	to the best effort packets.
+
+	Note that in CSZ flows are NOT limited to their bandwidth.
+	It is assumed that the flow has passed admission control at the edge
+	of the QoS network and needs no further shaping. Any attempt to improve
+	the flow or to shape it to a token bucket at intermediate hops
+	will introduce undesired delays and raise jitter.
+
+	At the moment CSZ is the only scheduler that provides
+	real guaranteed service. Other schemes (including CBQ)
+	do not provide guaranteed delay and randomize jitter.
+	There exists a claim (Sally Floyd) that delay
+	can be estimated by an IntServ-compliant formula.
+	This result is formally true, but it is wrong in principle.
+	First, it ignores delays introduced by link sharing.
+	Second (and mainly), it limits bandwidth,
+	which is a fatal flaw.
+
+ ALGORITHM.
+
+ --- Notations.
+
+ $B$ is link bandwidth (bits/sec).
+
+	$I$ is the set of all flows, including flow $0$.
+	Every flow $a \in I$ has an associated bandwidth slice $r_a < 1$ and
+ $\sum_{a \in I} r_a = 1$.
+
+ --- Flow model.
+
+	Let $m_a$ be the number of backlogged bits in flow $a$.
+	The flow is {\em active} if $m_a > 0$.
+	This number is a discontinuous function of time;
+	when a packet $i$ arrives:
+ \[
+ m_a(t_i+0) - m_a(t_i-0) = L^i,
+ \]
+ where $L^i$ is the length of arrived packet.
+ The flow queue is drained continuously until $m_a == 0$:
+ \[
+ {d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}.
+ \]
+	I.e. flow rates are their allocated rates, proportionally
+	scaled to take all the available link bandwidth. Apparently,
+	this is not the only possible policy. E.g. CBQ classes
+	without borrowing would be modelled by:
+	\[
+	{d m_a \over dt} = - B r_a .
+	\]
+	More complicated hierarchical bandwidth allocation
+	policies are possible, but, unfortunately, the basic
+	flow equations have a simple solution only for proportional
+	scaling.
+
+ --- Departure times.
+
+	We calculate the time until the last bit of the packet will be sent:
+	\[
+	E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a },
+	\]
+	where $\delta_a(t)$ is the number of bits drained since $t_i$.
+	We have to evaluate $E_a^i$ for all queued packets,
+	then find the packet with minimal $E_a^i$ and send it.
+
+	It sounds good, but a direct implementation of the algorithm
+	is absolutely infeasible. Luckily, if flow rates
+	are scaled proportionally, the equations have a simple solution.
+
+ The differential equation for $E_a^i$ is
+ \[
+ {d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} =
+ { B \over \sum_{b \in A} r_b}
+ \]
+ with initial condition
+ \[
+ E_a^i (t_i) = { m_a(t_i) \over r_a } .
+ \]
+
+ Let's introduce an auxiliary function $R(t)$:
+
+ --- Round number.
+
+ Consider the following model: we rotate over active flows,
+ sending $r_a B$ bits from every flow, so that we send
+ $B \sum_{a \in A} r_a$ bits per round, that takes
+ $\sum_{a \in A} r_a$ seconds.
+
+	Hence, $R(t)$ (the round number) is a monotonically increasing
+	linear function of time when $A$ does not change
+ \[
+ { d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a }
+ \]
+ and it is continuous when $A$ changes.
+
+	The central observation is that the quantity
+	$F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all!
+	$R(t)$ does not depend on the flow, so that $F_a^i$ can be
+	calculated only once on packet arrival, and we need no
+	recalculation of $E$ numbers and no resorting of queues.
+	The number $F_a^i$ is called the finish number of the packet.
+	It is just the value of $R(t)$ at which the last bit of the packet
+	will be sent out.
+
+	The maximal finish number in a flow is called the finish number of
+	the flow, and the minimal one is the "start number" of the flow.
+	Apparently, a flow is active if and only if $F_a \leq R$.
+
+	When a packet of length $L_i$ bits arrives at flow $a$ at time $t_i$,
+	we calculate the number $F_a^i$ as:
+
+ If flow was inactive ($F_a < R$):
+ $F_a^i = R(t) + {L_i \over B r_a}$
+ otherwise
+ $F_a^i = F_a + {L_i \over B r_a}$
+
+ These equations complete the algorithm specification.
+
+	It looks pretty hairy, but there exists a simple
+	procedure for solving these equations.
+	See the procedure csz_update(), which is a generalization of
+	the algorithm from Chapter 3 of S. Keshav's thesis,
+	"Efficient Implementation of Fair Queueing".
+
+ NOTES.
+
+	* We implement only the simplest variant of CSZ,
+	where flow-0 is an explicit 4-band priority FIFO.
+	It is bad, but we would need a "peek" operation in addition
+	to "dequeue" to implement complete CSZ.
+	I do not want to do that until it is absolutely
+	necessary.
+
+	* Primitive support for token bucket filtering
+	is present too. It directly contradicts CSZ, but
+	even though the Internet spans the globe ... :-)
+	"the edges of the network" really do exist.
+
+ BUGS.
+
+ * Fixed point arithmetic is overcomplicated, suboptimal and even
+ wrong. Check it later.
+*/
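A minimal floating-point sketch of the arrival rule derived above; the real code below does the same thing with the L2R() table and fixed-point round numbers, and all names here are illustrative only:

/* Finish-number update on packet arrival, in plain doubles.
 * R is the current round number, B the link bandwidth in bits/sec,
 * f->rate the flow's share r_a, and len the packet length in bits.
 */
struct example_flow {
	double finish;			/* finish number F_a of the flow */
	double rate;			/* bandwidth share r_a */
};

static void example_csz_arrival(struct example_flow *f, double R,
				double B, double len)
{
	/* If F_a < R the flow is inactive and restarts from R,
	 * otherwise the new packet queues behind the old finish number. */
	double base = (f->finish < R) ? R : f->finish;

	f->finish = base + len / (B * f->rate);
}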
+
+
+/* This number is arbitrary */
+
+#define CSZ_MAX_GUARANTEED 16
+
+#define CSZ_FLOW_ID(skb) (CSZ_MAX_GUARANTEED)
+
+struct csz_head
+{
+ struct csz_head *snext;
+ struct csz_head *sprev;
+ struct csz_head *fnext;
+ struct csz_head *fprev;
+};
+
+struct csz_flow
+{
+ struct csz_head *snext;
+ struct csz_head *sprev;
+ struct csz_head *fnext;
+ struct csz_head *fprev;
+
+/* Parameters */
+ unsigned long rate; /* Flow rate. Fixed point is at rate_log */
+ unsigned long *L_tab; /* Lookup table for L/(B*r_a) values */
+ unsigned long max_bytes; /* Maximal length of queue */
+#ifdef CSZ_PLUS_TBF
+ unsigned long depth; /* Depth of token bucket, normalized
+ as L/(B*r_a) */
+#endif
+
+/* Variables */
+#ifdef CSZ_PLUS_TBF
+ unsigned long tokens; /* Tokens number: usecs */
+ psched_time_t t_tbf;
+ unsigned long R_tbf;
+ int throttled;
+#endif
+ unsigned peeked;
+ unsigned long start; /* Finish number of the first skb */
+ unsigned long finish; /* Finish number of the flow */
+
+ struct sk_buff_head q; /* FIFO queue */
+};
+
+#define L2R(q,f,L) ((f)->L_tab[(L)>>(q)->cell_log])
+
+struct csz_sched_data
+{
+/* Parameters */
+ unsigned char cell_log; /* 1<<cell_log is quantum of packet size */
+ unsigned char rate_log; /* fixed point position for rate;
+					 * we do not really need it */
+ unsigned char R_log; /* fixed point position for round number */
+ unsigned char delta_log; /* 1<<delta_log is maximal timeout in usecs;
+ * 21 <-> 2.1sec is MAXIMAL value */
+
+/* Variables */
+#ifdef CSZ_PLUS_TBF
+ struct timer_list wd_timer;
+ long wd_expires;
+#endif
+ psched_time_t t_c; /* Time check-point */
+ unsigned long R_c; /* R-number check-point */
+ unsigned long rate; /* Current sum of rates of active flows */
+ struct csz_head s; /* Flows sorted by "start" */
+ struct csz_head f; /* Flows sorted by "finish" */
+
+ struct sk_buff_head other[4];/* Predicted (0) and the best efforts
+ classes (1,2,3) */
+ struct csz_flow flow[CSZ_MAX_GUARANTEED]; /* Array of flows */
+};
+
+/* These routines (csz_insert_finish and csz_insert_start) are
+ the most time consuming part of all the algorithm.
+
+   We insert into a sorted list, so the time
+   is linear with respect to the number of active flows in the worst case.
+   Note that we do not have a very large number of guaranteed flows,
+   so logarithmic algorithms (heaps etc.) are useless:
+   they are slower than the linear one when the list length is <= 32.
+
+   A heap would make sense if we used WFQ for the best-effort
+   flows, but SFQ is a better choice in this case.
+ */
+
+
+/* Insert flow "this" to the list "b" before
+ flow with greater finish number.
+ */
+
+#if 0
+/* Scan forward */
+extern __inline__ void csz_insert_finish(struct csz_head *b,
+ struct csz_flow *this)
+{
+ struct csz_head *f = b->fnext;
+ unsigned long finish = this->finish;
+
+ while (f != b) {
+ if (((struct csz_flow*)f)->finish - finish > 0)
+ break;
+ f = f->fnext;
+ }
+ this->fnext = f;
+ this->fprev = f->fprev;
+ this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
+}
+#else
+/* Scan backward */
+extern __inline__ void csz_insert_finish(struct csz_head *b,
+ struct csz_flow *this)
+{
+ struct csz_head *f = b->fprev;
+ unsigned long finish = this->finish;
+
+ while (f != b) {
+ if (((struct csz_flow*)f)->finish - finish <= 0)
+ break;
+ f = f->fprev;
+ }
+ this->fnext = f->fnext;
+ this->fprev = f;
+ this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
+}
+#endif
+
+/* Insert flow "this" to the list "b" before
+ flow with greater start number.
+ */
+
+extern __inline__ void csz_insert_start(struct csz_head *b,
+ struct csz_flow *this)
+{
+ struct csz_head *f = b->snext;
+ unsigned long start = this->start;
+
+ while (f != b) {
+ if (((struct csz_flow*)f)->start - start > 0)
+ break;
+ f = f->snext;
+ }
+ this->snext = f;
+ this->sprev = f->sprev;
+ this->snext->sprev = this->sprev->snext = (struct csz_head*)this;
+}
+
+
+/* Calculate and return current round number.
+ It is another time consuming part, but
+ it is impossible to avoid it.
+
+ Fixed point arithmetic is not ... does not ... Well, it is just CRAP.
+ */
+
+static unsigned long csz_update(struct Qdisc *sch)
+{
+ struct csz_sched_data *q = (struct csz_sched_data*)sch->data;
+ struct csz_flow *a;
+ unsigned long F;
+ unsigned long tmp;
+ psched_time_t now;
+ unsigned long delay;
+ unsigned long R_c;
+
+ PSCHED_GET_TIME(now);
+ delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset);
+
+ if (delay>>q->delta_log) {
+do_reset:
+ /* Delta is too large.
+ It is possible if MTU/BW > 1<<q->delta_log
+ (i.e. configuration error) or because of hardware
+ fault. We have no choice...
+ */
+ qdisc_reset(sch);
+ return 0;
+ }
+
+ q->t_c = now;
+
+ for (;;) {
+ a = (struct csz_flow*)q->f.fnext;
+
+ /* No more active flows. Reset R and exit. */
+ if (a == (struct csz_flow*)&q->f) {
+#ifdef CSZ_DEBUG
+ if (q->rate) {
+ printk("csz_update: rate!=0 on inactive csz\n");
+ q->rate = 0;
+ }
+#endif
+ q->R_c = 0;
+ return 0;
+ }
+
+ F = a->finish;
+
+#ifdef CSZ_DEBUG
+ if (q->rate == 0) {
+ printk("csz_update: rate=0 on active csz\n");
+ goto do_reset;
+ }
+#endif
+
+ /*
+ * tmp = (t - q->t_c)/q->rate;
+ */
+
+ tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log);
+
+ tmp += q->R_c;
+
+ /* OK, this flow (and all flows with greater
+ finish numbers) is still active */
+ if (F - tmp > 0)
+ break;
+
+		/* It is no longer active */
+
+ a->fprev->fnext = a->fnext;
+ a->fnext->fprev = a->fprev;
+
+ /*
+ * q->t_c += (F - q->R_c)*q->rate
+ */
+
+ tmp = ((F-q->R_c)*q->rate)<<q->R_log;
+ R_c = F;
+ q->rate -= a->rate;
+
+ if (delay - tmp >= 0) {
+ delay -= tmp;
+ continue;
+ }
+ delay = 0;
+ }
+
+ q->R_c = tmp;
+ return tmp;
+}
+
+static int
+csz_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ unsigned flow_id = CSZ_FLOW_ID(skb);
+ unsigned long R;
+ int prio;
+ struct csz_flow *this;
+
+ if (flow_id >= CSZ_MAX_GUARANTEED) {
+ prio = flow_id - CSZ_MAX_GUARANTEED;
+ flow_id = 0;
+ }
+
+ this = &q->flow[flow_id];
+ if (this->q.qlen >= this->max_bytes || this->L_tab == NULL) {
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+ }
+
+ R = csz_update(sch);
+
+ if (this->finish - R >= 0) {
+ /* It was active */
+ this->finish += L2R(q,this,skb->len);
+ } else {
+ /* It is inactive; activate it */
+ this->finish = R + L2R(q,this,skb->len);
+ q->rate += this->rate;
+ csz_insert_finish(&q->f, this);
+ }
+
+ /* If this flow was empty, remember start number
+ and insert it into start queue */
+ if (this->q.qlen == 0) {
+ this->start = this->finish;
+ csz_insert_start(&q->s, this);
+ }
+ if (flow_id)
+ skb_queue_tail(&this->q, skb);
+ else
+ skb_queue_tail(&q->other[prio], skb);
+ sch->q.qlen++;
+ return 1;
+}
+
+static __inline__ struct sk_buff *
+skb_dequeue_best(struct csz_sched_data * q)
+{
+ int i;
+ struct sk_buff *skb;
+
+ for (i=0; i<4; i++) {
+ skb = skb_dequeue(&q->other[i]);
+ if (skb) {
+ q->flow[0].q.qlen--;
+ return skb;
+ }
+ }
+ return NULL;
+}
+
+static __inline__ struct sk_buff *
+skb_peek_best(struct csz_sched_data * q)
+{
+ int i;
+ struct sk_buff *skb;
+
+ for (i=0; i<4; i++) {
+ skb = skb_peek(&q->other[i]);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+#ifdef CSZ_PLUS_TBF
+
+static void csz_watchdog(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc*)arg;
+ struct csz_sched_data *q = (struct csz_sched_data*)sch->data;
+
+ q->wd_timer.expires = 0;
+ q->wd_timer.function = NULL;
+
+ qdisc_wakeup(sch->dev);
+}
+
+static __inline__ void
+csz_move_queue(struct csz_flow *this, long delta)
+{
+ this->fprev->fnext = this->fnext;
+ this->fnext->fprev = this->fprev;
+
+ this->start += delta;
+ this->finish += delta;
+
+ csz_insert_finish(this);
+}
+
+static __inline__ int csz_enough_tokens(struct csz_sched_data *q,
+ struct csz_flow *this,
+ struct sk_buff *skb)
+{
+ long toks;
+ long shift;
+ psched_time_t now;
+
+ PSCHED_GET_TIME(now);
+
+	toks = PSCHED_TDIFF(now, this->t_tbf) + this->tokens - L2R(q,this,skb->len);
+
+ shift = 0;
+ if (this->throttled) {
+		/* Remember the a posteriori delay */
+
+ unsigned long R = csz_update(q);
+ shift = R - this->R_tbf;
+ this->R_tbf = R;
+ }
+
+ if (toks >= 0) {
+ /* Now we have enough tokens to proceed */
+
+		this->tokens = toks <= this->depth ? toks : this->depth;
+ this->t_tbf = now;
+
+ if (!this->throttled)
+ return 1;
+
+ /* Flow was throttled. Update its start&finish numbers
+		   with the delay calculated a posteriori.
+ */
+
+ this->throttled = 0;
+ if (shift > 0)
+ csz_move_queue(this, shift);
+ return 1;
+ }
+
+ if (!this->throttled) {
+ /* Flow has just been throttled; remember
+		   current round number to calculate the a posteriori delay
+ */
+ this->throttled = 1;
+ this->R_tbf = csz_update(q);
+ }
+
+	/* Move the whole queue to the time when it will be allowed to send.
+	   We should translate time to a round number, but that is impossible,
+	   so we make the most conservative estimate, i.e. we suppose
+	   that only this flow is active and, hence, R = t.
+	   Really toks <= R <= toks/r_a.
+
+	   This a priori shift in R will be adjusted later to reflect
+	   the real delay. We cannot avoid it because:
+	   - a throttled flow continues to be active from the viewpoint
+	     of CSZ, so it would acquire the highest priority
+	     if the start numbers were not adjusted;
+	   - eventually the finish number would become less than the round
+	     number and the flow would be declared inactive.
+	 */
+
+ toks = -toks;
+
+	/* Remember that we should start the watchdog */
+ if (toks < q->wd_expires)
+ q->wd_expires = toks;
+
+ toks >>= q->R_log;
+ shift += toks;
+ if (shift > 0) {
+ this->R_tbf += toks;
+ csz_move_queue(this, shift);
+ }
+ csz_insert_start(this);
+ return 0;
+}
+#endif
+
+
+static struct sk_buff *
+csz_dequeue(struct Qdisc* sch)
+{
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ struct sk_buff *skb;
+ struct csz_flow *this;
+
+#ifdef CSZ_PLUS_TBF
+ q->wd_expires = 0;
+#endif
+ this = (struct csz_flow*)q->s.snext;
+
+ while (this != (struct csz_flow*)&q->s) {
+
+ /* First of all: unlink from start list */
+ this->sprev->snext = this->snext;
+ this->snext->sprev = this->sprev;
+
+ if (this != &q->flow[0]) { /* Guaranteed flow */
+ skb = __skb_dequeue(&this->q);
+ if (skb) {
+#ifdef CSZ_PLUS_TBF
+ if (this->depth) {
+ if (!csz_enough_tokens(q, this, skb))
+ continue;
+ }
+#endif
+ if (this->q.qlen) {
+ struct sk_buff *nskb = skb_peek(&this->q);
+ this->start += L2R(q,this,nskb->len);
+ csz_insert_start(&q->s, this);
+ }
+ sch->q.qlen--;
+ return skb;
+ }
+ } else { /* Predicted or best effort flow */
+ skb = skb_dequeue_best(q);
+ if (skb) {
+ unsigned peeked = this->peeked;
+ this->peeked = 0;
+
+ if (--this->q.qlen) {
+ struct sk_buff *nskb;
+ unsigned dequeued = L2R(q,this,skb->len);
+
+				/* We dequeued something other than what we
+				   peeked at earlier; adjust the start number
+				 */
+ if (peeked != dequeued && peeked)
+ this->start += dequeued - peeked;
+
+ nskb = skb_peek_best(q);
+ peeked = L2R(q,this,nskb->len);
+ this->start += peeked;
+ this->peeked = peeked;
+ csz_insert_start(&q->s, this);
+ }
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ }
+#ifdef CSZ_PLUS_TBF
+ /* We are about to return no skb.
+	   Schedule the watchdog timer if this happened because of shaping.
+ */
+ if (q->wd_expires) {
+ if (q->wd_timer.function)
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = csz_watchdog;
+ q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires);
+ add_timer(&q->wd_timer);
+ }
+#endif
+ return NULL;
+}
+
+static void
+csz_reset(struct Qdisc* sch)
+{
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ struct sk_buff *skb;
+ int i;
+
+ for (i=0; i<4; i++)
+ while ((skb=skb_dequeue(&q->other[i])) != NULL)
+ kfree_skb(skb, 0);
+
+ for (i=0; i<CSZ_MAX_GUARANTEED; i++) {
+ struct csz_flow *this = q->flow + i;
+ while ((skb = skb_dequeue(&this->q)) != NULL)
+ kfree_skb(skb, FREE_WRITE);
+ this->snext = this->sprev =
+ this->fnext = this->fprev = (struct csz_head*)this;
+ this->start = this->finish = 0;
+ }
+ q->s.snext = q->s.sprev = &q->s;
+ q->f.fnext = q->f.fprev = &q->f;
+ q->R_c = 0;
+#ifdef CSZ_PLUS_TBF
+ PSCHED_GET_TIME(&q->t_tbf);
+ q->tokens = q->depth;
+ if (q->wd_timer.function) {
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = NULL;
+ }
+#endif
+ sch->q.qlen = 0;
+}
+
+static void
+csz_destroy(struct Qdisc* sch)
+{
+/*
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ int i;
+
+ for (i=0; i<4; i++)
+ qdisc_destroy(q->other[i]);
+ */
+}
+
+static int csz_init(struct Qdisc *sch, void *arg)
+{
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ struct cszinitctl *ctl = (struct cszinitctl*)arg;
+ int i;
+
+ for (i=0; i<4; i++)
+ skb_queue_head_init(&q->other[i]);
+
+ for (i=0; i<CSZ_MAX_GUARANTEED; i++) {
+ struct csz_flow *this = q->flow + i;
+ skb_queue_head_init(&this->q);
+ this->snext = this->sprev =
+ this->fnext = this->fprev = (struct csz_head*)this;
+ this->start = this->finish = 0;
+ }
+ q->s.snext = q->s.sprev = &q->s;
+ q->f.fnext = q->f.fprev = &q->f;
+ q->R_c = 0;
+#ifdef CSZ_PLUS_TBF
+ init_timer(&q->wd_timer);
+ q->wd_timer.data = (unsigned long)sch;
+#endif
+ if (ctl) {
+ if (ctl->flows != CSZ_MAX_GUARANTEED)
+ return -EINVAL;
+ q->cell_log = ctl->cell_log;
+ }
+ return 0;
+}
+
+static int csz_control(struct Qdisc *sch, struct pschedctl *gctl)
+{
+/*
+ struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
+ struct cszctl *ctl = (struct cszctl*)gctl->arg;
+ struct sk_buff *skb;
+ int i;
+
+ if (op == PSCHED_TC_ATTACH) {
+
+ }
+*/
+ return 0;
+}
+
+
+
+
+struct Qdisc_ops csz_ops =
+{
+ NULL,
+ "csz",
+ 0,
+ sizeof(struct csz_sched_data),
+ csz_enqueue,
+ csz_dequeue,
+ csz_reset,
+ csz_destroy,
+ csz_init,
+ csz_control,
+};
+
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&csz_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
new file mode 100644
index 000000000..8134baf16
--- /dev/null
+++ b/net/sched/sch_fifo.c
@@ -0,0 +1,179 @@
+/*
+ * net/sched/sch_fifo.c Simple FIFO "scheduler"
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+/* 1 band FIFO pseudo-"scheduler" */
+
+struct fifo_sched_data
+{
+ int qmaxbytes;
+ int qmaxlen;
+ int qbytes;
+};
+
+static int
+bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
+
+ if (q->qbytes <= q->qmaxbytes) {
+ skb_queue_tail(&sch->q, skb);
+ q->qbytes += skb->len;
+ return 0;
+ }
+ kfree_skb(skb, FREE_WRITE);
+ return 1;
+}
+
+static struct sk_buff *
+bfifo_dequeue(struct Qdisc* sch)
+{
+ struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ skb = skb_dequeue(&sch->q);
+ if (skb)
+ q->qbytes -= skb->len;
+ return skb;
+}
+
+static void
+bfifo_reset(struct Qdisc* sch)
+{
+ struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ while((skb=skb_dequeue(&sch->q)) != NULL) {
+ q->qbytes -= skb->len;
+ kfree_skb(skb,FREE_WRITE);
+ }
+ if (q->qbytes) {
+ printk("fifo_reset: qbytes=%d\n", q->qbytes);
+ q->qbytes = 0;
+ }
+}
+
+static int
+pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
+
+ if (sch->q.qlen <= q->qmaxlen) {
+ skb_queue_tail(&sch->q, skb);
+ return 0;
+ }
+ kfree_skb(skb, FREE_WRITE);
+ return 1;
+}
+
+static struct sk_buff *
+pfifo_dequeue(struct Qdisc* sch)
+{
+ return skb_dequeue(&sch->q);
+}
+
+static void
+pfifo_reset(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+
+ while((skb=skb_dequeue(&sch->q))!=NULL)
+ kfree_skb(skb,FREE_WRITE);
+}
+
+
+static int fifo_init(struct Qdisc *sch, void *arg /* int bytes, int pkts */)
+{
+ struct fifo_sched_data *q;
+/*
+ struct device *dev = sch->dev;
+ */
+
+ q = (struct fifo_sched_data *)sch->data;
+/*
+ if (pkts<0)
+ pkts = dev->tx_queue_len;
+ if (bytes<0)
+ bytes = pkts*dev->mtu;
+ q->qmaxbytes = bytes;
+ q->qmaxlen = pkts;
+ */
+ return 0;
+}
+
+struct Qdisc_ops pfifo_ops =
+{
+ NULL,
+ "pfifo",
+ 0,
+ sizeof(struct fifo_sched_data),
+ pfifo_enqueue,
+ pfifo_dequeue,
+ pfifo_reset,
+ NULL,
+ fifo_init,
+};
+
+struct Qdisc_ops bfifo_ops =
+{
+ NULL,
+	"bfifo",
+ 0,
+ sizeof(struct fifo_sched_data),
+ bfifo_enqueue,
+ bfifo_dequeue,
+ bfifo_reset,
+ NULL,
+ fifo_init,
+};
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&pfifo_ops);
+ if (err == 0) {
+ err = register_qdisc(&bfifo_ops);
+ if (err)
+ unregister_qdisc(&pfifo_ops);
+ }
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
new file mode 100644
index 000000000..83aa8d10e
--- /dev/null
+++ b/net/sched/sch_generic.c
@@ -0,0 +1,541 @@
+/*
+ * net/sched/sch_generic.c Generic packet scheduler routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+struct Qdisc_head qdisc_head = { &qdisc_head };
+
+static struct Qdisc_ops *qdisc_base = NULL;
+
+/* NOTES.
+
+ Every discipline has two major routines: enqueue and dequeue.
+
+ ---dequeue
+
+   dequeue usually returns an skb to send. It is allowed to return NULL,
+   but that does not mean the queue is empty, it just means that the
+   discipline does not want to send anything this time.
+   The queue is really empty if q->q.qlen == 0.
+   For complicated disciplines with multiple queues, q->q is not a
+   real packet queue, but q->q.qlen must nonetheless be valid.
+
+ ---enqueue
+
+   enqueue returns the number of enqueued packets, i.e. this number is 1
+   if the packet was enqueued successfully and <1 if something (not
+   necessarily THIS packet) was dropped.
+
+ */
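As a compact illustration of this contract, a single-band FIFO could look like the sketch below; it is purely illustrative (the real single-band FIFOs live in sch_fifo.c and in pfifo_fast_* later in this file):

static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (sch->q.qlen < skb->dev->tx_queue_len) {
		skb_queue_tail(&sch->q, skb);	/* keeps q.qlen valid */
		return 1;			/* 1: enqueued successfully */
	}
	kfree_skb(skb, FREE_WRITE);
	return 0;				/* <1: something was dropped */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* A shaping discipline may return NULL with q.qlen != 0;
	 * here the queue is real, so NULL simply means empty. */
	return skb_dequeue(&sch->q);
}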
+
+int register_qdisc(struct Qdisc_ops *qops)
+{
+ struct Qdisc_ops *q, **qp;
+ for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
+ if (strcmp(qops->id, q->id) == 0)
+ return -EEXIST;
+ qops->next = NULL;
+ qops->refcnt = 0;
+ *qp = qops;
+ return 0;
+}
+
+int unregister_qdisc(struct Qdisc_ops *qops)
+{
+ struct Qdisc_ops *q, **qp;
+ for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
+ if (q == qops)
+ break;
+ if (!q)
+ return -ENOENT;
+ *qp = q->next;
+ return 0;
+}
+
+struct Qdisc *qdisc_lookup(int handle)
+{
+ return NULL;
+}
+
+
+/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
+ in all curcumstances. It is difficult to invent anything more
+ fast or cheap.
+ */
+
+static int
+noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
+{
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+}
+
+static struct sk_buff *
+noop_dequeue(struct Qdisc * qdisc)
+{
+ return NULL;
+}
+
+struct Qdisc noop_qdisc =
+{
+ { NULL },
+ noop_enqueue,
+ noop_dequeue,
+};
+
+struct Qdisc noqueue_qdisc =
+{
+ { NULL },
+ NULL,
+ NULL,
+};
+
+
+/* 3-band FIFO queue: old style, but should be a bit faster (several CPU insns) */
+
+static int
+pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+{
+ const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 };
+ struct sk_buff_head *list;
+
+ list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7];
+
+ if (list->qlen <= skb->dev->tx_queue_len) {
+ skb_queue_tail(list, skb);
+ return 1;
+ }
+ qdisc->dropped++;
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+}
+
+static struct sk_buff *
+pfifo_fast_dequeue(struct Qdisc* qdisc)
+{
+ int prio;
+ struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
+ struct sk_buff *skb;
+
+ for (prio = 0; prio < 3; prio++, list++) {
+ skb = skb_dequeue(list);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+static void
+pfifo_fast_reset(struct Qdisc* qdisc)
+{
+ int prio;
+ struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
+
+ for (prio=0; prio < 3; prio++)
+ skb_queue_purge(list+prio);
+}
+
+static int pfifo_fast_init(struct Qdisc *qdisc, void *arg)
+{
+ int i;
+ struct sk_buff_head *list;
+
+ list = ((struct sk_buff_head*)qdisc->data);
+
+ for(i=0; i<3; i++)
+ skb_queue_head_init(list+i);
+
+ return 0;
+}
+
+static struct Qdisc_ops pfifo_fast_ops =
+{
+ NULL,
+ "pfifo_fast",
+ 1,
+ 3 * sizeof(struct sk_buff_head),
+ pfifo_fast_enqueue,
+ pfifo_fast_dequeue,
+ pfifo_fast_reset,
+ NULL,
+ pfifo_fast_init
+};
+
+static struct Qdisc *
+qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg)
+{
+ struct Qdisc *sch;
+ int size = sizeof(*sch) + ops->priv_size;
+
+ sch = kmalloc(size, GFP_KERNEL);
+ if (!sch)
+ return NULL;
+ memset(sch, 0, size);
+
+ skb_queue_head_init(&sch->q);
+ skb_queue_head_init(&sch->failure_q);
+ sch->ops = ops;
+ sch->enqueue = ops->enqueue;
+ sch->dequeue = ops->dequeue;
+ sch->dev = dev;
+	if (ops->init && ops->init(sch, arg)) {
+		/* Do not leak the allocation when init fails. */
+		kfree(sch);
+		return NULL;
+	}
+ ops->refcnt++;
+ return sch;
+}
+
+void qdisc_reset(struct Qdisc *qdisc)
+{
+ struct Qdisc_ops *ops = qdisc->ops;
+ if (ops) {
+ start_bh_atomic();
+ if (ops->reset)
+ ops->reset(qdisc);
+ skb_queue_purge(&qdisc->failure_q);
+ end_bh_atomic();
+ }
+}
+
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ struct Qdisc_ops *ops = qdisc->ops;
+ if (ops) {
+ start_bh_atomic();
+ if (ops->reset)
+ ops->reset(qdisc);
+ if (ops->destroy)
+ ops->destroy(qdisc);
+ skb_queue_purge(&qdisc->failure_q);
+ ops->refcnt--;
+ end_bh_atomic();
+ kfree(qdisc);
+ }
+}
+
+static void dev_do_watchdog(unsigned long dummy);
+
+static struct timer_list dev_watchdog =
+ { NULL, NULL, 0L, 0L, &dev_do_watchdog };
+
+static void dev_do_watchdog(unsigned long dummy)
+{
+ struct Qdisc_head *h;
+
+ for (h = qdisc_head.forw; h != &qdisc_head; h = h->forw) {
+ struct Qdisc *q = (struct Qdisc*)h;
+ struct device *dev = q->dev;
+ if (dev->tbusy && jiffies - q->tx_last > q->tx_timeo) {
+ qdisc_restart(dev);
+ }
+ }
+ dev_watchdog.expires = jiffies + 5*HZ;
+ add_timer(&dev_watchdog);
+}
+
+
+void dev_activate(struct device *dev)
+{
+	/* No queueing discipline is attached to the device;
+	   create a default one, i.e. pfifo_fast for devices
+	   which need queueing and noqueue_qdisc for
+	   virtual interfaces.
+	 */
+
+ if (dev->qdisc_sleeping == &noop_qdisc) {
+ if (dev->tx_queue_len) {
+ struct Qdisc *qdisc;
+ qdisc = qdisc_alloc(dev, &pfifo_fast_ops, NULL);
+ if (qdisc == NULL)
+ return;
+ dev->qdisc_sleeping = qdisc;
+ } else
+ dev->qdisc_sleeping = &noqueue_qdisc;
+ }
+
+ start_bh_atomic();
+ if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) {
+ dev->qdisc->tx_timeo = 5*HZ;
+ dev->qdisc->tx_last = jiffies - dev->qdisc->tx_timeo;
+ if (!dev_watchdog.expires) {
+ dev_watchdog.expires = jiffies + 5*HZ;
+ add_timer(&dev_watchdog);
+ }
+ }
+ end_bh_atomic();
+}
+
+void dev_deactivate(struct device *dev)
+{
+ struct Qdisc *qdisc;
+
+ start_bh_atomic();
+
+ qdisc = dev->qdisc;
+ dev->qdisc = &noop_qdisc;
+
+ qdisc_reset(qdisc);
+
+ if (qdisc->h.forw) {
+ struct Qdisc_head **hp, *h;
+
+ for (hp = &qdisc_head.forw; (h = *hp) != &qdisc_head; hp = &h->forw) {
+ if (h == &qdisc->h) {
+ *hp = h->forw;
+ break;
+ }
+ }
+ }
+
+ end_bh_atomic();
+}
+
+void dev_init_scheduler(struct device *dev)
+{
+ dev->qdisc = &noop_qdisc;
+ dev->qdisc_sleeping = &noop_qdisc;
+}
+
+void dev_shutdown(struct device *dev)
+{
+ struct Qdisc *qdisc;
+
+ start_bh_atomic();
+ qdisc = dev->qdisc_sleeping;
+ dev->qdisc_sleeping = &noop_qdisc;
+ qdisc_destroy(qdisc);
+ end_bh_atomic();
+}
+
+void dev_set_scheduler(struct device *dev, struct Qdisc *qdisc)
+{
+ struct Qdisc *oqdisc;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ start_bh_atomic();
+ oqdisc = dev->qdisc_sleeping;
+
+ /* Destroy old scheduler */
+ if (oqdisc)
+ qdisc_destroy(oqdisc);
+
+ /* ... and attach new one */
+ dev->qdisc_sleeping = qdisc;
+ dev->qdisc = &noop_qdisc;
+ end_bh_atomic();
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+}
+
+/* Kick the queue "q".
+   Note that this procedure is called by the watchdog timer, so
+   we do not check the dev->tbusy flag here.
+
+ Returns: 0 - queue is empty.
+ >0 - queue is not empty, but throttled.
+ <0 - queue is not empty. Device is throttled, if dev->tbusy != 0.
+
+ NOTE: Called only from NET BH
+*/
+
+
+int qdisc_restart(struct device *dev)
+{
+ struct Qdisc *q = dev->qdisc;
+ struct sk_buff *skb;
+
+ skb = skb_dequeue(&q->failure_q);
+ if (!skb) {
+ skb = q->dequeue(q);
+ if (netdev_nit && skb)
+ dev_queue_xmit_nit(skb,dev);
+ }
+ if (skb) {
+ if (dev->hard_start_xmit(skb, dev) == 0) {
+ q->tx_last = jiffies;
+ return -1;
+ }
+#if 0
+ if (net_ratelimit())
+ printk(KERN_DEBUG "netdevice %s defers output.\n", dev->name);
+#endif
+ skb_queue_head(&q->failure_q, skb);
+ return -1;
+ }
+ return q->q.qlen;
+}
+
+void qdisc_run_queues(void)
+{
+ struct Qdisc_head **hp, *h;
+
+ hp = &qdisc_head.forw;
+ while ((h = *hp) != &qdisc_head) {
+ int res = -1;
+ struct Qdisc *q = (struct Qdisc*)h;
+ struct device *dev = q->dev;
+
+ while (!dev->tbusy && (res = qdisc_restart(dev)) < 0)
+ /* NOTHING */;
+
+		/* An explanation is necessary here.
+		   qdisc_restart called dev->hard_start_xmit;
+		   if the device is virtual, that could trigger one more
+		   dev_queue_xmit and a new device could appear
+		   in the active chain. In this case we cannot unlink
+		   the empty queue, because we have lost the back pointer.
+		   No problem, we will unlink it during the next round.
+		 */
+
+ if (res == 0 && *hp == h) {
+ *hp = h->forw;
+ h->forw = NULL;
+ continue;
+ }
+ hp = &h->forw;
+ }
+}
+
+
+int tc_init(struct pschedctl *pctl)
+{
+ struct Qdisc *q;
+ struct Qdisc_ops *qops;
+
+ if (pctl->handle) {
+ q = qdisc_lookup(pctl->handle);
+ if (q == NULL)
+ return -ENOENT;
+ qops = q->ops;
+ if (pctl->ifindex && q->dev->ifindex != pctl->ifindex)
+ return -EINVAL;
+ }
+ return -EINVAL;
+}
+
+int tc_destroy(struct pschedctl *pctl)
+{
+ return -EINVAL;
+}
+
+int tc_attach(struct pschedctl *pctl)
+{
+ return -EINVAL;
+}
+
+int tc_detach(struct pschedctl *pctl)
+{
+ return -EINVAL;
+}
+
+
+int psched_ioctl(void *arg)
+{
+ struct pschedctl ctl;
+ struct pschedctl *pctl = &ctl;
+ int err;
+
+ if (copy_from_user(&ctl, arg, sizeof(ctl)))
+ return -EFAULT;
+
+ if (ctl.arglen > 0) {
+ pctl = kmalloc(sizeof(ctl) + ctl.arglen, GFP_KERNEL);
+ if (pctl == NULL)
+ return -ENOBUFS;
+ memcpy(pctl, &ctl, sizeof(ctl));
+ if (copy_from_user(pctl->args, ((struct pschedctl*)arg)->args, ctl.arglen)) {
+ kfree(pctl);
+ return -EFAULT;
+ }
+ }
+
+ rtnl_lock();
+
+ switch (ctl.command) {
+ case PSCHED_TC_INIT:
+ err = tc_init(pctl);
+ break;
+ case PSCHED_TC_DESTROY:
+ err = tc_destroy(pctl);
+ break;
+ case PSCHED_TC_ATTACH:
+ err = tc_attach(pctl);
+ break;
+ case PSCHED_TC_DETACH:
+ err = tc_detach(pctl);
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+ rtnl_unlock();
+
+ if (pctl != &ctl)
+ kfree(pctl);
+ return err;
+}
+
+__initfunc(int pktsched_init(void))
+{
+#define INIT_QDISC(name) { \
+ extern struct Qdisc_ops name##_ops; \
+	register_qdisc(&name##_ops); \
+ }
+
+ skb_queue_head_init(&noop_qdisc.failure_q);
+ skb_queue_head_init(&noqueue_qdisc.failure_q);
+
+ register_qdisc(&pfifo_fast_ops);
+#ifdef CONFIG_NET_SCH_CBQ
+ INIT_QDISC(cbq);
+#endif
+#ifdef CONFIG_NET_SCH_CSZ
+ INIT_QDISC(csz);
+#endif
+#ifdef CONFIG_NET_SCH_RED
+ INIT_QDISC(red);
+#endif
+#ifdef CONFIG_NET_SCH_SFQ
+ INIT_QDISC(sfq);
+#endif
+#ifdef CONFIG_NET_SCH_TBF
+ INIT_QDISC(tbf);
+#endif
+#ifdef CONFIG_NET_SCH_PFIFO
+ INIT_QDISC(pfifo);
+ INIT_QDISC(bfifo);
+#endif
+#ifdef CONFIG_NET_SCH_PRIO
+ INIT_QDISC(prio);
+#endif
+ return 0;
+}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
new file mode 100644
index 000000000..a3806eda4
--- /dev/null
+++ b/net/sched/sch_prio.c
@@ -0,0 +1,146 @@
+/*
+ * net/sched/sch_prio.c Simple 3-band priority "scheduler".
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+/* New N-band generic scheduler */
+
+struct prio_sched_data
+{
+ int qbytes;
+ int bands;
+ u8 prio2band[8];
+ struct Qdisc *queues[8];
+};
+
+static int
+prio_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+ int prio = q->prio2band[skb->priority&7];
+ struct Qdisc *qdisc;
+
+ qdisc = q->queues[prio];
+ if (qdisc->enqueue(skb, qdisc) == 0) {
+ q->qbytes += skb->len;
+ sch->q.qlen++;
+ return 0;
+ }
+ return 1;
+}
+
+static struct sk_buff *
+prio_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+ int prio;
+ struct Qdisc *qdisc;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ q->qbytes -= skb->len;
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ return NULL;
+
+}
+
+static void
+prio_reset(struct Qdisc* sch)
+{
+ int prio;
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+
+ for (prio=0; prio<q->bands; prio++)
+ qdisc_reset(q->queues[prio]);
+ q->qbytes = 0;
+}
+
+static void
+prio_destroy(struct Qdisc* sch)
+{
+ int prio;
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+
+ for (prio=0; prio<q->bands; prio++) {
+ qdisc_destroy(q->queues[prio]);
+ q->queues[prio] = &noop_qdisc;
+ }
+}
+
+static int prio_init(struct Qdisc *sch, void *arg)
+{
+	static const u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 };
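+	/* Band 0 is dequeued first (see prio_dequeue), so with this default
+	   map priorities 6 and 7 land in the highest band, 0 and 4 in the
+	   middle one, and 1, 2, 3, 5 in the lowest. */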
+ struct prio_sched_data *q;
+ int i;
+
+ q = (struct prio_sched_data *)sch->data;
+ q->bands = 3;
+ memcpy(q->prio2band, prio2band, sizeof(prio2band));
+ for (i=0; i<q->bands; i++)
+ q->queues[i] = &noop_qdisc;
+ return 0;
+}
+
+struct Qdisc_ops prio_ops =
+{
+ NULL,
+ "prio",
+ 0,
+ sizeof(struct prio_sched_data),
+ prio_enqueue,
+ prio_dequeue,
+ prio_reset,
+ prio_destroy,
+ prio_init,
+};
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&prio_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
new file mode 100644
index 000000000..fd3ee43ac
--- /dev/null
+++ b/net/sched/sch_red.c
@@ -0,0 +1,303 @@
+/*
+ * net/sched/sch_red.c Random Early Detection scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Random Early Detection (RED) algorithm.
+ =======================================
+
+ Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
+ for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
+
+	This file codes a "divisionless" version of the RED algorithm,
+	as written down in Fig. 17 of the paper.
+
+Short description.
+------------------
+
+	When a new packet arrives we calculate the average queue length:
+
+	avg = (1-W)*avg + W*current_queue_len,
+
+	W is the filter time constant (chosen as 2^(-Wlog)), controlling
+	the inertia of the algorithm. To allow larger bursts, W should be
+	decreased.
+
+ if (avg > th_max) -> packet marked (dropped).
+ if (avg < th_min) -> packet passes.
+ if (th_min < avg < th_max) we calculate probability:
+
+ Pb = max_P * (avg - th_min)/(th_max-th_min)
+
+	and mark (drop) the packet with this probability.
+	Pb changes from 0 (at avg==th_min) to max_P (at avg==th_max).
+	max_P should be small (not 1!).
+
+	NB. SF&VJ assumed that Pb[avg] is a linear function. I think that
+	is wrong. I would make:
+	P[th_min] = 0, P[th_max] = 1;
+	dP/davg[th_min] = 0, dP/davg[th_max] = infinity, or a large number.
+
+	I choose max_P as a number between 0.01 and 0.1, so that
+	C1 = max_P/(th_max-th_min) is a power of two: C1 = 2^(-C1log)
+
+ Parameters, settable by user (with default values):
+
+	qmaxbytes=256K - hard limit on queue length; should be chosen > qth_max
+			 to allow packet bursts. This parameter does not
+			 affect algorithm behaviour and can be chosen
+			 arbitrarily high (well, less than RAM size).
+			 Really, this limit will never be reached
+			 if RED works correctly.
+ qth_min=32K
+ qth_max=128K - qth_max should be at least 2*qth_min
+ Wlog=8 - log(1/W).
+ Alog=Wlog - fixed point position in th_min and th_max.
+ Rlog=10
+ C1log=24 - C1log = trueC1log+Alog-Rlog
+ so that trueC1log=22 and max_P~0.02
+
+
+NOTES:
+
+Upper bound on W.
+-----------------
+
+ If you want to allow bursts of L packets of size S,
+ you should choose W:
+
+ L + 1 -th_min/S < (1-(1-W)^L)/W
+
+ For th_min/S = 32
+
+ log(W) L
+ -1 33
+ -2 35
+ -3 39
+ -4 46
+ -5 57
+ -6 75
+ -7 101
+ -8 135
+ -9 190
+ etc.
+ */
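+/* A worked example with the default parameters above (illustrative only):
+   th_min=32K, th_max=128K and max_P~0.02 give
+   C1 = max_P/(th_max-th_min) ~ 0.02/96K ~ 2^-22, i.e. trueC1log=22 as
+   stated. For an average backlog of avg=80K:
+
+	Pb = max_P*(80K - 32K)/(128K - 32K) = 0.02*0.5 = 0.01,
+
+   so roughly one packet in a hundred is marked (dropped) while the average
+   stays between the two thresholds. */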
+
+struct red_sched_data
+{
+/* Parameters */
+ unsigned long qmaxbytes; /* HARD maximal queue length */
+ unsigned long qth_min; /* Min average length threshold: A scaled */
+ unsigned long qth_max; /* Max average length threshold: A scaled */
+ char Alog; /* Point position in average lengths */
+ char Wlog; /* log(W) */
+ char Rlog; /* random number bits */
+ char C1log; /* log(1/C1) */
+ char Slog;
+ char Stab[256];
+
+/* Variables */
+ unsigned long qbytes; /* Queue length in bytes */
+ unsigned long qave; /* Average queue length: A scaled */
+ int qcount; /* Packets since last random number generation */
+	unsigned	qR;		/* Cached random number [0..1<<Rlog) */
+ psched_time_t qidlestart; /* Start of idle period */
+};
+
+/* Stolen from igmp.c. */
+
+static __inline__ unsigned red_random(int log)
+{
+ static unsigned long seed=152L;
+ seed=seed*69069L+1;
+ return (seed^jiffies)&((1<<log)-1);
+}
+
+static int
+red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct red_sched_data *q = (struct red_sched_data *)sch->data;
+
+ psched_time_t now;
+
+ if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
+ long us_idle;
+		PSCHED_GET_TIME(now);
+		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, (256<<q->Slog)-1, 0);
+		PSCHED_SET_PASTPERFECT(q->qidlestart);
+
+/* This is not quite right, but I do not think that the SF+VJ proposal is
+   reasonable either, and I have not invented anything more clever 8)
+
+   The problem: ideally, the average queue length recalculation should
+   be done over constant clock intervals. That is too expensive, so the
+   calculation is driven by outgoing packets.
+   When the queue is idle we have to model this clock by hand.
+
+   SF+VJ proposed to "generate" m = (idletime/bandwidth)*average_pkt_size
+   dummy packets as a burst after the idle time, i.e.
+
+	q->qave *= (1-W)^m
+
+   This is an apparently overcomplicated solution (f.e. we would have to
+   precompute a table to make this calculation in reasonable time).
+   I believe that a simpler model may be used here,
+   but it is a field for experiments.
+*/
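+/* Reading of the decay below (an interpretation, not from the original
+   comment): Stab[] maps the quantized idle time us_idle>>Slog to a number
+   of right shifts of qave, so a suitably filled table approximates the
+   exponential decay (1-W)^m by the nearest power of 1/2. */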
+ q->qave >>= q->Stab[(us_idle>>q->Slog)&0xFF];
+ }
+
+ q->qave += ((q->qbytes<<q->Alog) - q->qave) >> q->Wlog;
+
+ if (q->qave < q->qth_min) {
+enqueue:
+ q->qcount = -1;
+ if (q->qbytes <= q->qmaxbytes) {
+ skb_queue_tail(&sch->q, skb);
+ q->qbytes += skb->len;
+ return 1;
+ }
+drop:
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+ }
+ if (q->qave >= q->qth_max) {
+ q->qcount = -1;
+ goto drop;
+ }
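+	/* Between the thresholds: mark with probability proportional to
+	   (qave - qth_min). The test below is the divisionless form from the
+	   RED paper: qR is uniform in [0..2^Rlog) and C1log folds the
+	   max_P/(th_max-th_min) slope into a shift, so no per-packet
+	   division is needed; qcount spaces the marks roughly evenly. */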
+	if (++q->qcount) {
+ if ((((q->qave - q->qth_min)*q->qcount)>>q->C1log) < q->qR)
+ goto enqueue;
+ q->qcount = 0;
+ q->qR = red_random(q->Rlog);
+ goto drop;
+ }
+ q->qR = red_random(q->Rlog);
+ goto enqueue;
+}
+
+static struct sk_buff *
+red_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct red_sched_data *q = (struct red_sched_data *)sch->data;
+
+ skb = skb_dequeue(&sch->q);
+ if (skb) {
+ q->qbytes -= skb->len;
+ return skb;
+ }
+ PSCHED_GET_TIME(q->qidlestart);
+ return NULL;
+}
+
+static void
+red_reset(struct Qdisc* sch)
+{
+ struct red_sched_data *q = (struct red_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ while((skb=skb_dequeue(&sch->q))!=NULL) {
+ q->qbytes -= skb->len;
+ kfree_skb(skb,FREE_WRITE);
+ }
+ if (q->qbytes) {
+ printk("red_reset: qbytes=%lu\n", q->qbytes);
+ q->qbytes = 0;
+ }
+ PSCHED_SET_PASTPERFECT(q->qidlestart);
+ q->qave = 0;
+ q->qcount = -1;
+}
+
+static int red_init(struct Qdisc *sch, struct pschedctl *pctl)
+{
+ struct red_sched_data *q;
+ struct redctl *ctl = (struct redctl*)pctl->args;
+
+ q = (struct red_sched_data *)sch->data;
+
+ if (pctl->arglen < sizeof(struct redctl))
+ return -EINVAL;
+
+ q->Wlog = ctl->Wlog;
+ q->Alog = ctl->Alog;
+ q->Rlog = ctl->Rlog;
+ q->C1log = ctl->C1log;
+ q->Slog = ctl->Slog;
+ q->qth_min = ctl->qth_min;
+ q->qth_max = ctl->qth_max;
+ q->qmaxbytes = ctl->qmaxbytes;
+ memcpy(q->Stab, ctl->Stab, 256);
+
+ q->qcount = -1;
+ PSCHED_SET_PASTPERFECT(q->qidlestart);
+ return 0;
+}
+
+struct Qdisc_ops red_ops =
+{
+ NULL,
+ "red",
+ 0,
+ sizeof(struct red_sched_data),
+ red_enqueue,
+ red_dequeue,
+ red_reset,
+ NULL,
+ red_init,
+ NULL
+};
+
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&red_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
new file mode 100644
index 000000000..65c3906b4
--- /dev/null
+++ b/net/sched/sch_sfq.c
@@ -0,0 +1,333 @@
+/*
+ * net/sched/sch_sfq.c Stochastic Fairness Queueing scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Stochastic Fairness Queuing algorithm.
+ =======================================
+
+ Source:
+ Paul E. McKenney "Stochastic Fairness Queuing",
+ IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
+
+ Paul E. McKenney "Stochastic Fairness Queuing",
+ "Interworking: Research and Experience", v.2, 1991, p.113-131.
+
+
+ See also:
+ M. Shreedhar and George Varghese "Efficient Fair
+ Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
+
+
+	It is not the thing that is usually called (W)FQ nowadays. It does not
+ use any timestamp mechanism, but instead processes queues
+ in round-robin order.
+
+ ADVANTAGE:
+
+ - It is very cheap. Both CPU and memory requirements are minimal.
+
+ DRAWBACKS:
+
+	- "Stochastic" -> It is not 100% fair.
+	When hash collisions occur, several flows are considered as one.
+
+	- "Round-robin" -> It introduces larger delays than virtual clock
+	based schemes, and should not be used for isolating interactive
+	traffic from non-interactive traffic. This means that this scheduler
+	should be used as a leaf of CBQ or P3, which put interactive traffic
+	into the higher priority band.
+
+ We still need true WFQ for top level CSZ, but using WFQ
+ for the best effort traffic is absolutely pointless:
+ SFQ is superior for this purpose.
+
+	IMPLEMENTATION:
+	This implementation limits the maximal queue length to 128;
+	the maximal mtu to 2^15-1; and the number of hash buckets to 1024.
+	The only goal of these restrictions was that all the data
+	fit into one 4K page :-). Struct sfq_sched_data is
+	organized in an anti-cache manner: all the data for a bucket
+	are scattered over different locations. It is not good,
+	but it allowed everything to be put into 4K.
+
+	It is easy to increase these values.
+*/
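+/* Dequeue side in brief (see sfq_dequeue below): active slots form a ring
+   through next[]; service starts at the slot after tail. A newly activated
+   slot gets one quantum of allotment; a slot keeps its turn until the
+   allotment is exhausted by packet lengths, then it receives another
+   quantum and the turn passes on -- essentially the deficit round robin
+   scheme referenced above. */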
+
+#define SFQ_DEPTH 128
+#define SFQ_HASH_DIVISOR 1024
+
+#define SFQ_HASH(a) 0
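+/* NB: the hash above is a stub -- every packet falls into bucket 0, so all
+   traffic currently shares one slot. A real classifier would presumably
+   hash flow identifiers (addresses/ports) modulo SFQ_HASH_DIVISOR; none is
+   wired up in this snapshot. */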
+
+/* This type should contain at least SFQ_DEPTH*2 values */
+typedef unsigned char sfq_index;
+
+struct sfq_head
+{
+ sfq_index next;
+ sfq_index prev;
+};
+
+struct sfq_sched_data
+{
+/* Parameters */
+ unsigned quantum; /* Allotment per round: MUST BE >= MTU */
+
+/* Variables */
+ sfq_index tail; /* Index of current slot in round */
+ sfq_index max_depth; /* Maximal depth */
+
+ sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
+ sfq_index next[SFQ_DEPTH]; /* Active slots link */
+ short allot[SFQ_DEPTH]; /* Current allotment per slot */
+ unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
+ struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
+ struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */
+};
+
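+/* Layout of dep[] (used by sfq_link/sfq_inc/sfq_dec below): entries
+   0..SFQ_DEPTH-1 belong to the slots themselves, while entry SFQ_DEPTH+d
+   heads the doubly linked list of all slots whose queue currently holds
+   d packets; sfq_drop() walks the deepest list to pick a victim. */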
+extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+	int d = q->qs[x].qlen + SFQ_DEPTH;
+
+ p = d;
+ n = q->dep[d].next;
+ q->dep[x].next = n;
+ q->dep[x].prev = p;
+ q->dep[p].next = q->dep[n].prev = x;
+}
+
+extern __inline__ void sfq_dec(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+
+ n = q->dep[x].next;
+ p = q->dep[x].prev;
+ q->dep[p].next = n;
+ q->dep[n].prev = p;
+
+ if (n == p && q->max_depth == q->qs[x].qlen + 1)
+ q->max_depth--;
+
+ sfq_link(q, x);
+}
+
+extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+ int d;
+
+ n = q->dep[x].next;
+ p = q->dep[x].prev;
+ q->dep[p].next = n;
+ q->dep[n].prev = p;
+ d = q->qs[x].qlen;
+ if (q->max_depth < d)
+ q->max_depth = d;
+
+ sfq_link(q, x);
+}
+
+static __inline__ void sfq_drop(struct sfq_sched_data *q)
+{
+ struct sk_buff *skb;
+ sfq_index d = q->max_depth;
+
+ /* Queue is full! Find the longest slot and
+ drop a packet from it */
+
+ if (d != 1) {
+		sfq_index x = q->dep[d + SFQ_DEPTH].next;
+ skb = q->qs[x].prev;
+ __skb_unlink(skb, &q->qs[x]);
+ kfree_skb(skb, FREE_WRITE);
+ sfq_dec(q, x);
+/*
+ sch->q.qlen--;
+ */
+ return;
+ }
+
+ /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
+
+ d = q->next[q->tail];
+ q->next[q->tail] = q->next[d];
+ q->allot[q->next[d]] += q->quantum;
+ skb = q->qs[d].prev;
+ __skb_unlink(skb, &q->qs[d]);
+ kfree_skb(skb, FREE_WRITE);
+ sfq_dec(q, d);
+/*
+ sch->q.qlen--;
+ */
+ q->ht[q->hash[d]] = SFQ_DEPTH;
+ return;
+}
+
+static int
+sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
+ unsigned hash = SFQ_HASH(skb);
+ sfq_index x;
+
+ x = q->ht[hash];
+ if (x == SFQ_DEPTH) {
+ q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
+ q->hash[x] = hash;
+ }
+ __skb_queue_tail(&q->qs[x], skb);
+ sfq_inc(q, x);
+ if (q->qs[x].qlen == 1) { /* The flow is new */
+ if (q->tail == SFQ_DEPTH) { /* It is the first flow */
+ q->tail = x;
+ q->next[x] = x;
+ q->allot[x] = q->quantum;
+ } else {
+ q->next[x] = q->next[q->tail];
+ q->next[q->tail] = x;
+ q->tail = x;
+ }
+ }
+ if (++sch->q.qlen < SFQ_DEPTH-1)
+ return 1;
+
+ sfq_drop(q);
+ return 0;
+}
+
+static struct sk_buff *
+sfq_dequeue(struct Qdisc* sch)
+{
+ struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
+ struct sk_buff *skb;
+ sfq_index a, old_a;
+
+ /* No active slots */
+ if (q->tail == SFQ_DEPTH)
+ return NULL;
+
+ a = old_a = q->next[q->tail];
+
+ /* Grab packet */
+ skb = __skb_dequeue(&q->qs[a]);
+ sfq_dec(q, a);
+ sch->q.qlen--;
+
+ /* Is the slot empty? */
+ if (q->qs[a].qlen == 0) {
+ a = q->next[a];
+ if (a == old_a) {
+ q->tail = SFQ_DEPTH;
+ return skb;
+ }
+ q->next[q->tail] = a;
+ q->allot[a] += q->quantum;
+ } else if ((q->allot[a] -= skb->len) <= 0) {
+ q->tail = a;
+ a = q->next[a];
+ q->allot[a] += q->quantum;
+ }
+ return skb;
+}
+
+static void
+sfq_reset(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = sfq_dequeue(sch)) != NULL)
+ kfree_skb(skb, FREE_WRITE);
+}
+
+
+static int sfq_open(struct Qdisc *sch, void *arg)
+{
+ struct sfq_sched_data *q;
+ int i;
+
+ q = (struct sfq_sched_data *)sch->data;
+
+ for (i=0; i<SFQ_HASH_DIVISOR; i++)
+ q->ht[i] = SFQ_DEPTH;
+ for (i=0; i<SFQ_DEPTH; i++) {
+ skb_queue_head_init(&q->qs[i]);
+ q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
+ q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
+ }
+ q->max_depth = 0;
+ q->tail = SFQ_DEPTH;
+ q->quantum = sch->dev->mtu;
+ if (sch->dev->hard_header)
+ q->quantum += sch->dev->hard_header_len;
+ for (i=0; i<SFQ_DEPTH; i++)
+ sfq_link(q, i);
+ return 0;
+}
+
+
+struct Qdisc_ops sfq_ops =
+{
+ NULL,
+ "sfq",
+ 0,
+ sizeof(struct sfq_sched_data),
+ sfq_enqueue,
+ sfq_dequeue,
+ sfq_reset,
+ NULL,
+ sfq_open,
+};
+
+#ifdef MODULE
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&sfq_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
new file mode 100644
index 000000000..9869af1d3
--- /dev/null
+++ b/net/sched/sch_tbf.c
@@ -0,0 +1,252 @@
+/*
+ * net/sched/sch_tbf.c Token Bucket Filter.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Simple Token Bucket Filter.
+ =======================================
+
+ SOURCE.
+
+ None.
+
+ ALGORITHM.
+
+	A sequence of packets satisfies a token bucket filter with
+	rate $r$ and depth $b$, if all the numbers defined by:
+	\begin{eqnarray*}
+	n_0 &=& b, \\
+	n_i &=& {\rm min} ( b, n_{i-1} + r*(t_i-t_{i-1}) - L_i ),
+	\end{eqnarray*}
+	where $t_i$ is the departure time of the $i$-th packet and
+	$L_i$ is its length, are never less than zero.
+
+	It is convenient to rescale $n_i$ by the factor $1/r$, so
+	that the sequence has the "canonical" form:
+	\[
+	n_0 = b/r,
+	n_i = min { b/r, n_{i-1} + t_i - t_{i-1} - L_i/r },
+	\]
+
+	If a packet yields n_i < 0, we throttle the filter
+	by $-n_i$ usecs.
+
+ NOTES.
+
+	If TBF throttles, it starts a watchdog timer, which will wake it up
+	after 0...10 msec.
+	If no new packets arrive during this period,
+	or the device is not woken by the EOI for the previous packet,
+	tbf could stop its activity for 10 msec.
+
+	It means that tbf will sometimes introduce pathological
+	10 msec delays to a flow, corresponding to rate*10msec bytes.
+	For a 10Mbit/sec flow that is about 12Kb, for 100Mbit/sec -- ~100Kb.
+	This number puts a reasonable lower bound on the token bucket depth,
+	but even if the depth is larger, traffic is erratic at large rates.
+
+	This problem is not specific to THIS implementation. Really,
+	there is a statement that any attempt to shape traffic
+	in transit will increase delays and jitter much more than
+	one would naively expect.
+
+	In particular, it means that delay/jitter sensitive traffic
+	MUST NOT be shaped. Cf. the CBQ (wrong) and CSZ (correct) approaches.
+*/
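+/* A worked example of the token arithmetic in tbf_dequeue() below
+   (illustrative numbers, not defaults). With everything rescaled to usecs
+   of transmission time, let depth = 10000 and tokens = 0. If the next
+   packet needs L2T = 3000 and only 2000 usecs have elapsed since t_c, then
+   toks = 2000 + 0 - 3000 = -1000 < 0: the packet is put back and the
+   watchdog sleeps ~1000 usecs. Had 4000 usecs elapsed, toks = 1000 >= 0
+   and the packet is sent with 1000 tokens left (capped at the depth). */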
+
+struct tbf_sched_data
+{
+/* Parameters */
+ int cell_log; /* 1<<cell_log is quantum of packet size */
+ unsigned long L_tab[256]; /* Lookup table for L/B values */
+ unsigned long depth; /* Token bucket depth/B: MUST BE >= MTU/B */
+ unsigned long max_bytes; /* Maximal length of backlog: bytes */
+
+/* Variables */
+ unsigned long bytes; /* Current length of backlog */
+ unsigned long tokens; /* Current number of tokens */
+ psched_time_t t_c; /* Time check-point */
+ struct timer_list wd_timer; /* Watchdog timer */
+};
+
+#define L2T(q,L) ((q)->L_tab[(L)>>(q)->cell_log])
+
+static int
+tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+ __skb_queue_tail(&sch->q, skb);
+ if ((q->bytes += skb->len) <= q->max_bytes)
+ return 1;
+
+ /* Drop action: undo the things that we just made,
+ * i.e. make tail drop
+ */
+
+ __skb_unlink(skb, &sch->q);
+ q->bytes -= skb->len;
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+}
+
+static void tbf_watchdog(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc*)arg;
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+ q->wd_timer.function = NULL;
+
+ qdisc_wakeup(sch->dev);
+}
+
+
+static struct sk_buff *
+tbf_dequeue(struct Qdisc* sch)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ skb = __skb_dequeue(&sch->q);
+
+ if (skb) {
+ psched_time_t now;
+ long toks;
+
+ PSCHED_GET_TIME(now);
+
+ toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->depth, 0)
+ + q->tokens - L2T(q,skb->len);
+
+ if (toks >= 0) {
+ q->t_c = now;
+ q->tokens = toks <= q->depth ? toks : q->depth;
+ q->bytes -= skb->len;
+ return skb;
+ }
+
+		/* Maybe we have a shorter packet in the queue,
+		   which could be sent now. It sounds cool,
+		   but it is nevertheless wrong in principle.
+		   We MUST NOT reorder packets in these circumstances.
+
+		   Really, if we split the flow into independent
+		   subflows, it would be a very good solution.
+		   Look at sch_csz.c.
+ */
+ __skb_queue_head(&sch->q, skb);
+
+ if (!sch->dev->tbusy) {
+ if (q->wd_timer.function)
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = tbf_watchdog;
+ q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(-toks);
+ add_timer(&q->wd_timer);
+ }
+ }
+ return NULL;
+}
+
+
+static void
+tbf_reset(struct Qdisc* sch)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sch->q)) != NULL)
+ kfree_skb(skb, FREE_WRITE);
+ q->bytes = 0;
+ PSCHED_GET_TIME(q->t_c);
+ q->tokens = q->depth;
+ if (q->wd_timer.function) {
+ del_timer(&q->wd_timer);
+ q->wd_timer.function = NULL;
+ }
+}
+
+static int tbf_init(struct Qdisc* sch, void *arg)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+ struct tbfctl *ctl = (struct tbfctl*)arg;
+
+ PSCHED_GET_TIME(q->t_c);
+ init_timer(&q->wd_timer);
+ q->wd_timer.function = NULL;
+ q->wd_timer.data = (unsigned long)sch;
+ if (ctl) {
+ q->max_bytes = ctl->bytes;
+ q->depth = ctl->depth;
+		q->tokens = q->depth;
+ q->cell_log = ctl->cell_log;
+ memcpy(q->L_tab, ctl->L_tab, 256*sizeof(unsigned long));
+ }
+ return 0;
+}
+
+struct Qdisc_ops tbf_ops =
+{
+ NULL,
+ "tbf",
+ 0,
+ sizeof(struct tbf_sched_data),
+ tbf_enqueue,
+ tbf_dequeue,
+ tbf_reset,
+ NULL,
+ tbf_init,
+ NULL,
+};
+
+
+#ifdef MODULE
+#include <linux/module.h>
+int init_module(void)
+{
+ int err;
+
+ /* Load once and never free it. */
+ MOD_INC_USE_COUNT;
+
+ err = register_qdisc(&tbf_ops);
+ if (err)
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+void cleanup_module(void)
+{
+}
+#endif