author     Ralf Baechle <ralf@linux-mips.org>   1997-12-16 06:06:25 +0000
committer  Ralf Baechle <ralf@linux-mips.org>   1997-12-16 06:06:25 +0000
commit     aa944aa3453e47706685bc562711a9e87375941e
tree       8fb37a65f205a90412917ca2b91c429263ef1790 /net/sched
parent     967c65a99059fd459b956c1588ce0ba227912c4e
Merge with Linux 2.1.72, part 2.
The new signal code, with the exception of the code for the rt signals.
The definitions in <asm/siginfo.h> and <asm/ucontext.h> are currently
just stolen from the Alpha and will need to be overhauled.
Diffstat (limited to 'net/sched')
-rw-r--r--   net/sched/.cvsignore      |    1
-rw-r--r--   net/sched/Makefile        |   71
-rw-r--r--   net/sched/sch_cbq.c       |  839
-rw-r--r--   net/sched/sch_csz.c       |  832
-rw-r--r--   net/sched/sch_fifo.c      |  179
-rw-r--r--   net/sched/sch_generic.c   |  541
-rw-r--r--   net/sched/sch_prio.c      |  146
-rw-r--r--   net/sched/sch_red.c       |  303
-rw-r--r--   net/sched/sch_sfq.c       |  333
-rw-r--r--   net/sched/sch_tbf.c       |  252
10 files changed, 3497 insertions, 0 deletions
diff --git a/net/sched/.cvsignore b/net/sched/.cvsignore new file mode 100644 index 000000000..4671378ae --- /dev/null +++ b/net/sched/.cvsignore @@ -0,0 +1 @@ +.depend diff --git a/net/sched/Makefile b/net/sched/Makefile new file mode 100644 index 000000000..cbb6704c1 --- /dev/null +++ b/net/sched/Makefile @@ -0,0 +1,71 @@ +# +# Makefile for the Linux Traffic Control Unit. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := sched.o + +O_OBJS := sch_generic.o + +ifeq ($(CONFIG_NET_SCH_CBQ), y) +O_OBJS += sch_cbq.o +else + ifeq ($(CONFIG_NET_SCH_CBQ), m) + M_OBJS += sch_cbq.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_CSZ), y) +O_OBJS += sch_csz.o +else + ifeq ($(CONFIG_NET_SCH_CSZ), m) + M_OBJS += sch_csz.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_SFQ), y) +O_OBJS += sch_sfq.o +else + ifeq ($(CONFIG_NET_SCH_SFQ), m) + M_OBJS += sch_sfq.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_RED), y) +O_OBJS += sch_red.o +else + ifeq ($(CONFIG_NET_SCH_RED), m) + M_OBJS += sch_red.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_TBF), y) +O_OBJS += sch_tbf.o +else + ifeq ($(CONFIG_NET_SCH_TBF), m) + M_OBJS += sch_tbf.o + endif +endif + + +ifeq ($(CONFIG_NET_SCH_PFIFO), y) +O_OBJS += sch_fifo.o +else + ifeq ($(CONFIG_NET_SCH_PFIFO), m) + M_OBJS += sch_fifo.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_PRIO), y) +O_OBJS += sch_prio.o +else + ifeq ($(CONFIG_NET_SCH_PRIO), m) + M_OBJS += sch_prio.o + endif +endif + +include $(TOPDIR)/Rules.make diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c new file mode 100644 index 000000000..626afe555 --- /dev/null +++ b/net/sched/sch_cbq.c @@ -0,0 +1,839 @@ +/* + * net/sched/sch_cbq.c Class-Based Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* Class-Based Queueing (CBQ) algorithm. + ======================================= + + Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource + Management Models for Packet Networks", + IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 + + [2] Sally Floyd, "Notes on CBQ and Guaranted Service", 1995 + + [3] Sally Floyd, "Notes on Class-Based Queueing: Setting + Parameters", 1996 + + Algorithm skeleton is taken from from NS simulator cbq.cc. + + ----------------------------------------------------------------------- + + Differences from NS version. + + --- WRR algorith is different. 
Our version looks more reasonable :-) + and fair when quanta are allowed to be less than MTU. + + --- cl->aveidle is REALLY limited from below by cl->minidle. + Seems, it was bug in NS. + + --- Purely lexical change: "depth" -> "level", "maxdepth" -> "toplevel". + When depth increases we expect, that the thing becomes lower, does not it? :-) + Besides that, "depth" word is semantically overloaded --- + "token bucket depth", "sfq depth"... Besides that, the algorithm + was called "top-LEVEL sharing". + + PROBLEM. + + --- Linux has no EOI event at the moment, so that we cannot + estimate true class idle time. Three workarounds are possible, + all of them have drawbacks: + + 1. (as now) Consider the next dequeue event as sign that + previous packet is finished. It is wrong because of ping-pong + buffers, but on permanently loaded link it is true. + 2. (NS approach) Use as link busy time estimate skb->leb/"physical + bandwidth". Even more wrong f.e. on ethernet real busy time much + higher because of collisions. + 3. (seems, the most clever) Split net bh to two parts: + NETRX_BH (for received packets) and preserve NET_BH for transmitter. + It will not require driver changes (NETRX_BH flag will be set + in netif_rx), but will allow to trace EOIs more precisely + and will save useless checks in net_bh. Besides that we will + have to eliminate random calling hard_start_xmit with dev->tbusy flag + (done) and to drop failure_q --- i.e. if !dev->tbusy hard_start_xmit + MUST succeed; failed packets will be dropped on the floor. +*/ + +#define CBQ_TOPLEVEL_SHARING +/* #define CBQ_NO_TRICKERY */ + +#define CBQ_CLASSIFIER(skb, q) ((q)->fallback_class) + +struct cbq_class +{ +/* Parameters */ + int priority; /* priority */ +#ifdef CBQ_TOPLEVEL_SHARING + int level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of childrens + 1 for nodes. + */ +#endif + + long maxidle; /* Class paramters: see below. */ + long minidle; + int filter_log; +#ifndef CBQ_NO_TRICKERY + long extradelay; +#endif + + long quantum; /* Allotment per WRR round */ + long rquantum; /* Relative allotment: see below */ + + int cell_log; + unsigned long L_tab[256]; + + struct Qdisc *qdisc; /* ptr to CBQ discipline */ + struct cbq_class *root; /* Ptr to root class; + root can be not unique. 
+ */ + struct cbq_class *parent; /* Ptr to parent in the class tree */ + struct cbq_class *borrow; /* NULL if class is bandwidth limited; + parent otherwise */ + + struct Qdisc *q; /* Elementary queueing discipline */ + struct cbq_class *next; /* next class in this priority band */ + + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + +/* Variables */ + psched_time_t last; + psched_time_t undertime; + long avgidle; + long deficit; /* Saved deficit for WRR */ + char awake; /* Class is in alive list */ + +#if 0 + void (*overlimit)(struct cbq_class *cl); +#endif +}; + +#define L2T(cl,len) ((cl)->L_tab[(len)>>(cl)->cell_log]) + +struct cbq_sched_data +{ + struct cbq_class *classes[CBQ_MAXPRIO]; /* List of all classes */ + int nclasses[CBQ_MAXPRIO]; + unsigned quanta[CBQ_MAXPRIO]; + unsigned mtu; + int cell_log; + unsigned long L_tab[256]; + struct cbq_class *fallback_class; + + unsigned activemask; + struct cbq_class *active[CBQ_MAXPRIO]; /* List of all classes + with backlog */ + struct cbq_class *last_sent; + int last_sent_len; + + psched_time_t now; /* Cached timestamp */ + + struct timer_list wd_timer; /* Wathchdog timer, that + started when CBQ has + backlog, but cannot + transmit just now */ + unsigned long wd_expires; +#ifdef CBQ_TOPLEVEL_SHARING + struct cbq_class *borrowed; + int toplevel; +#endif +}; + +/* + WRR quanta + ---------- + + cl->quantum is number added to class allotment on every round. + cl->rquantum is "relative" quantum. + + For real-time classes: + + cl->quantum = (cl->rquantum*q->nclasses[prio]*q->mtu)/q->quanta[prio] + + where q->quanta[prio] is sum of all rquanta for given priority. + cl->rquantum can be identified with absolute rate of the class + in arbitrary units (f.e. bytes/sec) + + In this case, delay introduced by round-robin was estimated by + Sally Floyd [2] as: + + D = q->nclasses*q->mtu/(bandwidth/2) + + Note, that D does not depend on class rate (it is very bad), + but not much worse than Gallager-Parekh estimate for CSZ + C/R = q->mtu/rate, when real-time classes have close rates. + + For not real-time classes this folmula is not necessary, + so that cl->quantum can be set to any reasonable not zero value. + Apparently, it should be proportional to class rate, if the + rate is not zero. +*/ + +/* + maxidle, minidle, extradelay + ---------------------------- + + CBQ estimator calculates smoothed class idle time cl->aveidle, + considering class as virtual interface with corresponding bandwidth. + When cl->aveidle wants to be less than zero, class is overlimit. + When it is positive, class is underlimit. + + * maxidle bounds aveidle from above. + It controls maximal length of burst in this class after + long period of idle time. Burstness of active class + is controlled by filter constant cl->filter_log, + but this number is related to burst length only indirectly. + + * minidle is a negative number, normally set to zero. + Setting it to not zero value allows avgidle to drop + below zero, effectively penalizing class, when it is overlimit. + When the class load will decrease, it will take a time to + raise negative avgidle to put the class at limit. + It should be set to zero for leaf classes. + + * extradelay is penalty in delay, when a class goes overlimit. + I believe this parameter is useless and confusing. + Setting it to not zero forces class to accumulate + its "idleness" for extradelay and then send BURST of packets + until going to overlimit again. Non-sense. + + For details see [1] and [3]. 
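(A minimal, self-contained sketch of the scaled average-idle update
 described above; it only illustrates the filter_log scaling and the
 maxidle/minidle clamps, and is not the scheduler code itself.)

	/* cl->avgidle is stored as true_avgidle/W with W = 2^(-filter_log),
	 * so the EWMA  true_avgidle := (1-W)*true_avgidle + W*idle
	 * reduces to a single shift; maxidle/minidle clamp the result.
	 */
	static long cbq_avgidle_update_sketch(long avgidle, long idle,
					      int filter_log,
					      long maxidle, long minidle)
	{
		avgidle += idle - (avgidle >> filter_log);
		if (avgidle > maxidle)		/* cap the accumulated credit */
			avgidle = maxidle;
		else if (avgidle < minidle)	/* bound the penalty from below */
			avgidle = minidle;
		return avgidle;
	}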
+ + Really, minidle and extradelay are irrelevant to real scheduling + task. As I understand, SF&VJ introduced them to experiment + with CBQ simulator in attempts to fix erratic behaviour + of ancestor-only (and, partially, top-level) algorithm. + + WARNING. + + User passes them measured in usecs, but cl->minidle, + cl->maxidle and cl->aveidle are scaled with cl->filter_log + in the text of the scheduler. +*/ + +/* + A packet has just been enqueued on the empty class. + cbq_wakeup_class adds it to the tail of active class list + of its priority band. + */ + +static __inline__ void cbq_wakeup_class(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + int prio = cl->priority; + struct cbq_class *cl_tail; + + cl->awake = 1; + + cl_tail = q->active[prio]; + q->active[prio] = cl; + + if (cl_tail != NULL) { + cl->next_alive = cl_tail->next_alive; + cl->deficit = 0; + } else { + cl->next_alive = cl; + q->activemask |= (1<<prio); + cl->deficit = cl->quantum; + } +} + +static int +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = CBQ_CLASSIFIER(skb, q); + + if (cl->q->enqueue(skb, cl->q) == 1) { + sch->q.qlen++; + +#ifdef CBQ_TOPLEVEL_SHARING + if (q->toplevel > 0) { + psched_time_t now; + PSCHED_GET_TIME(now); + if (PSCHED_TLESS(cl->undertime, now)) + q->toplevel = 0; + else if (q->toplevel > 1 && cl->borrow && + PSCHED_TLESS(cl->borrow->undertime, now)) + q->toplevel = 1; + } +#endif + if (!cl->awake) + cbq_wakeup_class(cl); + return 1; + } + return 0; +} + +static __inline__ void cbq_delay(struct cbq_sched_data *q, struct cbq_class *cl) +{ + long delay; + + delay = PSCHED_TDIFF(cl->undertime, q->now); + if (q->wd_expires == 0 || q->wd_expires - delay > 0) + q->wd_expires = delay; +} + +static void cbq_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + + q->wd_timer.expires = 0; + q->wd_timer.function = NULL; + qdisc_wakeup(sch->dev); +} + +static __inline__ void +cbq_update(struct cbq_sched_data *q) +{ + struct cbq_class *cl; + + for (cl = q->last_sent; cl; cl = cl->parent) { + long avgidle = cl->avgidle; + long idle; + + /* + (now - last) is total time between packet right edges. + (last_pktlen/rate) is "virtual" busy time, so that + + idle = (now - last) - last_pktlen/rate + */ + + idle = PSCHED_TDIFF(q->now, cl->last) + - L2T(cl, q->last_sent_len); + + /* true_avgidle := (1-W)*true_avgidle + W*idle, + where W=2^{-filter_log}. But cl->avgidle is scaled: + cl->avgidle == true_avgidle/W, + hence: + */ + avgidle += idle - (avgidle>>cl->filter_log); + + if (avgidle <= 0) { + /* Overlimit or at-limit */ +#ifdef CBQ_NO_TRICKERY + avgidle = 0; +#else + if (avgidle < cl->minidle) + avgidle = cl->minidle; +#endif + + /* This line was missing in NS. */ + cl->avgidle = avgidle; + + /* Calculate expected time, when this class + will be allowed to send. + It will occur, when: + (1-W)*true_avgidle + W*delay = 0, i.e. + idle = (1/W - 1)*(-true_avgidle) + or + idle = (1 - W)*(-cl->avgidle); + + That is not all. + We want to set undertime to the moment, when + the class is allowed to start next transmission i.e. + (undertime + next_pktlen/phys_bandwidth) + - now - next_pktlen/rate = idle + or + undertime = now + idle + next_pktlen/rate + - next_pktlen/phys_bandwidth + + We do not know next packet length, but can + estimate it with average packet length + or current packet_length. 
+ */ + + idle = (-avgidle) - ((-avgidle) >> cl->filter_log); + idle += L2T(q, q->last_sent_len); + idle -= L2T(cl, q->last_sent_len); + PSCHED_TADD2(q->now, idle, cl->undertime); +#ifndef CBQ_NO_TRICKERY + /* Do not forget extra delay :-) */ + PSCHED_TADD(cl->undertime, cl->extradelay); +#endif + } else { + /* Underlimit */ + + PSCHED_SET_PASTPERFECT(cl->undertime); + if (avgidle > cl->maxidle) + cl->avgidle = cl->maxidle; + else + cl->avgidle = avgidle; + } + cl->last = q->now; + } + +#ifdef CBQ_TOPLEVEL_SHARING + cl = q->last_sent; + + if (q->borrowed && q->toplevel >= q->borrowed->level) { + if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, q->borrowed->undertime)) + q->toplevel = CBQ_MAXLEVEL; + else if (q->borrowed != cl) + q->toplevel = q->borrowed->level; + } +#endif + + q->last_sent = NULL; +} + +static __inline__ int +cbq_under_limit(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + struct cbq_class *this_cl = cl; + + if (PSCHED_IS_PASTPERFECT(cl->undertime) || cl->parent == NULL) + return 1; + + if (PSCHED_TLESS(cl->undertime, q->now)) { + q->borrowed = cl; + return 1; + } + + while (!PSCHED_IS_PASTPERFECT(cl->undertime) && + PSCHED_TLESS(q->now, cl->undertime)) { + cl = cl->borrow; + if (cl == NULL +#ifdef CBQ_TOPLEVEL_SHARING + || cl->level > q->toplevel +#endif + ) { +#if 0 + this_cl->overlimit(this_cl); +#else + cbq_delay(q, this_cl); +#endif + return 0; + } + } + q->borrowed = cl; + return 1; +} + +static __inline__ struct sk_buff * +cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl_tail, *cl_prev, *cl; + struct sk_buff *skb; + int deficit; + + cl_tail = cl_prev = q->active[prio]; + cl = cl_prev->next_alive; + + do { + deficit = 0; + + /* Start round */ + do { + /* Class is empty */ + if (cl->q->q.qlen == 0) + goto skip_class; + + if (fallback) { + /* Fallback pass: all classes are overlimit; + we send from the first class that is allowed + to borrow. + */ + + if (cl->borrow == NULL) + goto skip_class; + } else { + /* Normal pass: check that class is under limit */ + if (!cbq_under_limit(cl)) + goto skip_class; + } + + if (cl->deficit <= 0) { + /* Class exhausted its allotment per this + round. + */ + deficit = 1; + goto next_class; + } + + skb = cl->q->dequeue(cl->q); + + /* Class did not give us any skb :-( + It could occur if cl->q == "tbf" + */ + if (skb == NULL) + goto skip_class; + + cl->deficit -= skb->len; + q->last_sent = cl; + q->last_sent_len = skb->len; + + if (cl->deficit <= 0) { + q->active[prio] = cl; + cl = cl->next_alive; + cl->deficit += cl->quantum; + } + return skb; + +skip_class: + cl->deficit = 0; + + if (cl->q->q.qlen == 0) { + /* Class is empty, declare it dead */ + cl_prev->next_alive = cl->next_alive; + cl->awake = 0; + + /* Did cl_tail point to it? */ + if (cl == cl_tail) { + /* Repair it! */ + cl_tail = cl_prev; + + /* Was it the last class in this band? */ + if (cl == cl_tail) { + /* Kill the band! 
*/ + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + return NULL; + } + } + } + +next_class: + cl_prev = cl; + cl = cl->next_alive; + cl->deficit += cl->quantum; + } while (cl_prev != cl_tail); + } while (deficit); + + q->active[prio] = cl_prev; + + return NULL; +} + +static __inline__ struct sk_buff * +cbq_dequeue_1(struct Qdisc *sch, int fallback) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct sk_buff *skb; + unsigned activemask; + + activemask = q->activemask; + while (activemask) { + int prio = ffz(~activemask); + activemask &= ~(1<<prio); + skb = cbq_dequeue_prio(sch, prio, fallback); + if (skb) + return skb; + } + return NULL; +} + +static struct sk_buff * +cbq_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + + PSCHED_GET_TIME(q->now); + + if (q->last_sent) + cbq_update(q); + + q->wd_expires = 0; + + skb = cbq_dequeue_1(sch, 0); + if (skb) + return skb; + + /* All the classes are overlimit. + Search for overlimit class, which is allowed to borrow + and use it as fallback case. + */ + +#ifdef CBQ_TOPLEVEL_SHARING + q->toplevel = CBQ_MAXLEVEL; +#endif + + skb = cbq_dequeue_1(sch, 1); + if (skb) + return skb; + + /* No packets in scheduler or nobody wants to give them to us :-( + Sigh... start watchdog timer in the last case. */ + + if (sch->q.qlen && q->wd_expires) { + if (q->wd_timer.function) + del_timer(&q->wd_timer); + q->wd_timer.function = cbq_watchdog; + q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires); + add_timer(&q->wd_timer); + } + return NULL; +} + +/* CBQ class maintanance routines */ + +static void cbq_adjust_levels(struct cbq_class *this) +{ + struct cbq_class *cl; + + for (cl = this->parent; cl; cl = cl->parent) { + if (cl->level > this->level) + return; + cl->level = this->level + 1; + this = cl; + } +} + +static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + + if (q->quanta[prio] == 0) + return; + + for (cl = q->classes[prio]; cl; cl = cl->next) { + if (cl->rquantum) + cl->quantum = (cl->rquantum*q->mtu*q->nclasses[prio])/ + q->quanta[prio]; + } +} + +static __inline__ int cbq_unlink_class(struct cbq_class *this) +{ + struct cbq_class *cl, **clp; + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + + for (clp = &q->classes[this->priority]; (cl = *clp) != NULL; + clp = &cl->next) { + if (cl == this) { + *clp = cl->next; + return 0; + } + } + return -ENOENT; +} + +static int cbq_prune(struct cbq_class *this) +{ + struct cbq_class *cl; + int prio = this->priority; + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + + qdisc_reset(this->q); + + if (cbq_unlink_class(this)) + return -ENOENT; + + if (this->awake) { + struct cbq_class *cl_prev = q->active[prio]; + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + + if (cl == q->active[prio]) { + q->active[prio] = cl; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + break; + } + } + + cl = cl->next_alive; + cl->deficit += cl->quantum; + break; + } + } while ((cl_prev = cl) != q->active[prio]); + } + + --q->nclasses[prio]; + if (this->rquantum) { + q->quanta[prio] -= this->rquantum; + cbq_normalize_quanta(q, prio); + } + + if (q->fallback_class == this) + q->fallback_class = NULL; + + this->parent = NULL; + this->borrow = NULL; + this->root = this; + this->qdisc = NULL; + return 0; +} + +static int cbq_graft(struct cbq_class *this, 
struct cbq_class *parent) +{ + struct cbq_class *cl, **clp; + int prio = this->priority; + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + + qdisc_reset(this->q); + + + for (clp = &q->classes[prio]; (cl = *clp) != NULL; clp = &cl->next) { + if (cl == this) + return -EBUSY; + } + + cl->next = NULL; + *clp = cl; + + cl->parent = parent; + cl->borrow = parent; + cl->root = parent ? parent->root : cl; + + ++q->nclasses[prio]; + if (this->rquantum) { + q->quanta[prio] += this->rquantum; + cbq_normalize_quanta(q, prio); + } + + cbq_adjust_levels(this); + + return 0; +} + + +static void +cbq_reset(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + int prio; + + q->activemask = 0; + q->last_sent = NULL; + if (q->wd_timer.function) { + del_timer(&q->wd_timer); + q->wd_timer.expires = 0; + q->wd_timer.function = NULL; + } +#ifdef CBQ_TOPLEVEL_SHARING + q->toplevel = CBQ_MAXLEVEL; +#endif + + for (prio = 0; prio < CBQ_MAXPRIO; prio++) { + q->active[prio] = NULL; + + for (cl = q->classes[prio]; cl; cl = cl->next) { + qdisc_reset(cl->q); + + cl->next_alive = NULL; + PSCHED_SET_PASTPERFECT(cl->undertime); + cl->avgidle = 0; + cl->deficit = 0; + cl->awake = 0; + } + } +} + +static void +cbq_destroy(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl, **clp; + int prio; + + for (prio = 0; prio < CBQ_MAXPRIO; prio++) { + struct cbq_class *cl_head = q->classes[prio]; + + for (clp = &cl_head; (cl=*clp) != NULL; clp = &cl->next) { + qdisc_destroy(cl->q); + kfree(cl); + } + } +} + +static int cbq_control(struct Qdisc *sch, void *arg) +{ + struct cbq_sched_data *q; + + q = (struct cbq_sched_data *)sch->data; + + /* Do attachment here. It is the last thing to do. */ + + return -EINVAL; +} + +static int cbq_init(struct Qdisc *sch, void *arg) +{ + struct cbq_sched_data *q; + struct cbqctl *ctl = (struct cbqctl*)arg; + + q = (struct cbq_sched_data *)sch->data; + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; +#ifdef CBQ_TOPLEVEL_SHARING + q->toplevel = CBQ_MAXLEVEL; +#endif + + return 0; +} + + +struct Qdisc_ops cbq_ops = +{ + NULL, + "cbq", + 0, + sizeof(struct cbq_sched_data), + cbq_enqueue, + cbq_dequeue, + cbq_reset, + cbq_destroy, + cbq_init, + cbq_control, +}; + +#ifdef MODULE +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&cbq_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_csz.c b/net/sched/sch_csz.c new file mode 100644 index 000000000..dbc05d31b --- /dev/null +++ b/net/sched/sch_csz.c @@ -0,0 +1,832 @@ +/* + * net/sched/sch_csz.c Clark-Shenker-Zhang scheduler. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Clark-Shenker-Zhang algorithm. + ======================================= + + SOURCE. + + David D. Clark, Scott Shenker and Lixia Zhang + "Supporting Real-Time Applications in an Integrated Services Packet + Network: Architecture and Mechanism". + + CBQ presents a flexible universal algorithm for packet scheduling, + but it has pretty poor delay characteristics. + Round-robin scheduling and link-sharing goals + apparently contradict to minimization of network delay and jitter. + Moreover, correct handling of predicted flows seems to be + impossible in CBQ. + + CSZ presents more precise but less flexible and less efficient + approach. As I understand, the main idea is to create + WFQ flows for each guaranteed service and to allocate + the rest of bandwith to dummy flow-0. Flow-0 comprises + the predicted services and the best effort traffic; + it is handled by a priority scheduler with the highest + priority band allocated for predicted services, and the rest --- + to the best effort packets. + + Note, that in CSZ flows are NOT limited to their bandwidth. + It is supposed, that flow passed admission control at the edge + of QoS network and it more need no shaping. Any attempt to improve + the flow or to shape it to a token bucket at intermediate hops + will introduce undesired delays and raise jitter. + + At the moment CSZ is the only scheduler that provides + real guaranteed service. Another schemes (including CBQ) + do not provide guaranteed delay and randomize jitter. + There exists the statement (Sally Floyd), that delay + can be estimated by a IntServ compliant formulae. + This result is true formally, but it is wrong in principle. + At first, it ignores delays introduced by link sharing. + And the second (and main) it limits bandwidth, + it is fatal flaw. + + ALGORITHM. + + --- Notations. + + $B$ is link bandwidth (bits/sec). + + $I$ is set of all flows, including flow $0$. + Every flow $a \in I$ has associated bandwidth slice $r_a < 1$ and + $\sum_{a \in I} r_a = 1$. + + --- Flow model. + + Let $m_a$ is number of backlogged bits in flow $a$. + The flow is {\em active }, if $m_a > 0$. + This number is discontinuous function of time; + when a packet $i$ arrives: + \[ + m_a(t_i+0) - m_a(t_i-0) = L^i, + \] + where $L^i$ is the length of arrived packet. + The flow queue is drained continuously until $m_a == 0$: + \[ + {d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}. + \] + I.e. flow rates are their allocated rates proportionally + scaled to take all available link bandwidth. Apparently, + it is not the only possible policy. F.e. CBQ classes + without borrowing would be modelled by: + \[ + {d m_a \over dt} = - B r_a . + \] + More complicated hierarchical bandwidth allocation + policies are possible, but, unfortunately, basic + flows equation have simple solution only for proportional + scaling. 
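   (An illustrative step, using the definitions above and the round
    number $R(t)$ introduced below.) Since ${dR \over dt} =
    1/\sum_{b \in A} r_b$, proportional scaling gives
   \[
     {d m_a \over d R} = {d m_a \over dt}\,{dt \over dR}
       = - {B r_a \over \sum_{b \in A} r_b} \cdot \sum_{b \in A} r_b
       = - B r_a ,
   \]
   i.e. every active backlog drains linearly in $R$ at its own constant
   rate $B r_a$, which is why a single global function $R(t)$ can
   time-stamp all flows.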
+ + --- Departure times. + + We calculate time until the last bit of packet will be sent: + \[ + E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a }, + \] + where $\delta_a(t)$ is number of bits drained since $t_i$. + We have to evaluate $E_a^i$ for all queued packets, + then find packet with minimal $E_a^i$ and send it. + + It sounds good, but direct implementation of the algorithm + is absolutely infeasible. Luckily, if flow rates + are scaled proportionally, the equations have simple solution. + + The differential equation for $E_a^i$ is + \[ + {d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} = + { B \over \sum_{b \in A} r_b} + \] + with initial condition + \[ + E_a^i (t_i) = { m_a(t_i) \over r_a } . + \] + + Let's introduce an auxiliary function $R(t)$: + + --- Round number. + + Consider the following model: we rotate over active flows, + sending $r_a B$ bits from every flow, so that we send + $B \sum_{a \in A} r_a$ bits per round, that takes + $\sum_{a \in A} r_a$ seconds. + + Hence, $R(t)$ (round number) is monotonically increasing + linear function of time when $A$ is not changed + \[ + { d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a } + \] + and it is continuous when $A$ changes. + + The central observation is that the quantity + $F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all! + $R(t)$ does not depend on flow, so that $F_a^i$ can be + calculated only once on packet arrival, and we need not + recalculation of $E$ numbers and resorting queues. + Number $F_a^i$ is called finish number of the packet. + It is just value of $R(t)$, when the last bit of packet + will be sent out. + + Maximal finish number on flow is called finish number of flow + and minimal one is "start number of flow". + Apparently, flow is active if and only if $F_a \leq R$. + + When packet of length $L_i$ bit arrives to flow $a$ at time $t_i$, + we calculate number $F_a^i$ as: + + If flow was inactive ($F_a < R$): + $F_a^i = R(t) + {L_i \over B r_a}$ + otherwise + $F_a^i = F_a + {L_i \over B r_a}$ + + These equations complete the algorithm specification. + + It looks pretty hairy, but there exists a simple + procedure for solving these equations. + See procedure csz_update(), that is a generalization of + algorithm from S. Keshav's thesis Chapter 3 + "Efficient Implementation of Fair Queeing". + + NOTES. + + * We implement only the simplest variant of CSZ, + when flow-0 is explicit 4band priority fifo. + It is bad, but we need "peek" operation in addition + to "dequeue" to implement complete CSZ. + I do not want to make it, until it is not absolutely + necessary. + + * A primitive support for token bucket filtering + presents too. It directly contradicts to CSZ, but + though the Internet is on the globe ... :-) + yet "the edges of the network" really exist. + + BUGS. + + * Fixed point arithmetic is overcomplicated, suboptimal and even + wrong. Check it later. +*/ + + +/* This number is arbitrary */ + +#define CSZ_MAX_GUARANTEED 16 + +#define CSZ_FLOW_ID(skb) (CSZ_MAX_GUARANTEED) + +struct csz_head +{ + struct csz_head *snext; + struct csz_head *sprev; + struct csz_head *fnext; + struct csz_head *fprev; +}; + +struct csz_flow +{ + struct csz_head *snext; + struct csz_head *sprev; + struct csz_head *fnext; + struct csz_head *fprev; + +/* Parameters */ + unsigned long rate; /* Flow rate. 
Fixed point is at rate_log */ + unsigned long *L_tab; /* Lookup table for L/(B*r_a) values */ + unsigned long max_bytes; /* Maximal length of queue */ +#ifdef CSZ_PLUS_TBF + unsigned long depth; /* Depth of token bucket, normalized + as L/(B*r_a) */ +#endif + +/* Variables */ +#ifdef CSZ_PLUS_TBF + unsigned long tokens; /* Tokens number: usecs */ + psched_time_t t_tbf; + unsigned long R_tbf; + int throttled; +#endif + unsigned peeked; + unsigned long start; /* Finish number of the first skb */ + unsigned long finish; /* Finish number of the flow */ + + struct sk_buff_head q; /* FIFO queue */ +}; + +#define L2R(q,f,L) ((f)->L_tab[(L)>>(q)->cell_log]) + +struct csz_sched_data +{ +/* Parameters */ + unsigned char cell_log; /* 1<<cell_log is quantum of packet size */ + unsigned char rate_log; /* fixed point position for rate; + * really we need not it */ + unsigned char R_log; /* fixed point position for round number */ + unsigned char delta_log; /* 1<<delta_log is maximal timeout in usecs; + * 21 <-> 2.1sec is MAXIMAL value */ + +/* Variables */ +#ifdef CSZ_PLUS_TBF + struct timer_list wd_timer; + long wd_expires; +#endif + psched_time_t t_c; /* Time check-point */ + unsigned long R_c; /* R-number check-point */ + unsigned long rate; /* Current sum of rates of active flows */ + struct csz_head s; /* Flows sorted by "start" */ + struct csz_head f; /* Flows sorted by "finish" */ + + struct sk_buff_head other[4];/* Predicted (0) and the best efforts + classes (1,2,3) */ + struct csz_flow flow[CSZ_MAX_GUARANTEED]; /* Array of flows */ +}; + +/* These routines (csz_insert_finish and csz_insert_start) are + the most time consuming part of all the algorithm. + + We insert to sorted list, so that time + is linear with respect to number of active flows in the worst case. + Note that we have not very large number of guaranteed flows, + so that logarithmic algorithms (heap etc.) are useless, + they are slower than linear one when length of list <= 32. + + Heap would take sence if we used WFQ for best efforts + flows, but SFQ is better choice in this case. + */ + + +/* Insert flow "this" to the list "b" before + flow with greater finish number. + */ + +#if 0 +/* Scan forward */ +extern __inline__ void csz_insert_finish(struct csz_head *b, + struct csz_flow *this) +{ + struct csz_head *f = b->fnext; + unsigned long finish = this->finish; + + while (f != b) { + if (((struct csz_flow*)f)->finish - finish > 0) + break; + f = f->fnext; + } + this->fnext = f; + this->fprev = f->fprev; + this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this; +} +#else +/* Scan backward */ +extern __inline__ void csz_insert_finish(struct csz_head *b, + struct csz_flow *this) +{ + struct csz_head *f = b->fprev; + unsigned long finish = this->finish; + + while (f != b) { + if (((struct csz_flow*)f)->finish - finish <= 0) + break; + f = f->fprev; + } + this->fnext = f->fnext; + this->fprev = f; + this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this; +} +#endif + +/* Insert flow "this" to the list "b" before + flow with greater start number. + */ + +extern __inline__ void csz_insert_start(struct csz_head *b, + struct csz_flow *this) +{ + struct csz_head *f = b->snext; + unsigned long start = this->start; + + while (f != b) { + if (((struct csz_flow*)f)->start - start > 0) + break; + f = f->snext; + } + this->snext = f; + this->sprev = f->sprev; + this->snext->sprev = this->sprev->snext = (struct csz_head*)this; +} + + +/* Calculate and return current round number. 
+ It is another time consuming part, but + it is impossible to avoid it. + + Fixed point arithmetic is not ... does not ... Well, it is just CRAP. + */ + +static unsigned long csz_update(struct Qdisc *sch) +{ + struct csz_sched_data *q = (struct csz_sched_data*)sch->data; + struct csz_flow *a; + unsigned long F; + unsigned long tmp; + psched_time_t now; + unsigned long delay; + unsigned long R_c; + + PSCHED_GET_TIME(now); + delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset); + + if (delay>>q->delta_log) { +do_reset: + /* Delta is too large. + It is possible if MTU/BW > 1<<q->delta_log + (i.e. configuration error) or because of hardware + fault. We have no choice... + */ + qdisc_reset(sch); + return 0; + } + + q->t_c = now; + + for (;;) { + a = (struct csz_flow*)q->f.fnext; + + /* No more active flows. Reset R and exit. */ + if (a == (struct csz_flow*)&q->f) { +#ifdef CSZ_DEBUG + if (q->rate) { + printk("csz_update: rate!=0 on inactive csz\n"); + q->rate = 0; + } +#endif + q->R_c = 0; + return 0; + } + + F = a->finish; + +#ifdef CSZ_DEBUG + if (q->rate == 0) { + printk("csz_update: rate=0 on active csz\n"); + goto do_reset; + } +#endif + + /* + * tmp = (t - q->t_c)/q->rate; + */ + + tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log); + + tmp += q->R_c; + + /* OK, this flow (and all flows with greater + finish numbers) is still active */ + if (F - tmp > 0) + break; + + /* It is more not active */ + + a->fprev->fnext = a->fnext; + a->fnext->fprev = a->fprev; + + /* + * q->t_c += (F - q->R_c)*q->rate + */ + + tmp = ((F-q->R_c)*q->rate)<<q->R_log; + R_c = F; + q->rate -= a->rate; + + if (delay - tmp >= 0) { + delay -= tmp; + continue; + } + delay = 0; + } + + q->R_c = tmp; + return tmp; +} + +static int +csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned flow_id = CSZ_FLOW_ID(skb); + unsigned long R; + int prio; + struct csz_flow *this; + + if (flow_id >= CSZ_MAX_GUARANTEED) { + prio = flow_id - CSZ_MAX_GUARANTEED; + flow_id = 0; + } + + this = &q->flow[flow_id]; + if (this->q.qlen >= this->max_bytes || this->L_tab == NULL) { + kfree_skb(skb, FREE_WRITE); + return 0; + } + + R = csz_update(sch); + + if (this->finish - R >= 0) { + /* It was active */ + this->finish += L2R(q,this,skb->len); + } else { + /* It is inactive; activate it */ + this->finish = R + L2R(q,this,skb->len); + q->rate += this->rate; + csz_insert_finish(&q->f, this); + } + + /* If this flow was empty, remember start number + and insert it into start queue */ + if (this->q.qlen == 0) { + this->start = this->finish; + csz_insert_start(&q->s, this); + } + if (flow_id) + skb_queue_tail(&this->q, skb); + else + skb_queue_tail(&q->other[prio], skb); + sch->q.qlen++; + return 1; +} + +static __inline__ struct sk_buff * +skb_dequeue_best(struct csz_sched_data * q) +{ + int i; + struct sk_buff *skb; + + for (i=0; i<4; i++) { + skb = skb_dequeue(&q->other[i]); + if (skb) { + q->flow[0].q.qlen--; + return skb; + } + } + return NULL; +} + +static __inline__ struct sk_buff * +skb_peek_best(struct csz_sched_data * q) +{ + int i; + struct sk_buff *skb; + + for (i=0; i<4; i++) { + skb = skb_peek(&q->other[i]); + if (skb) + return skb; + } + return NULL; +} + +#ifdef CSZ_PLUS_TBF + +static void csz_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct csz_sched_data *q = (struct csz_sched_data*)sch->data; + + q->wd_timer.expires = 0; + q->wd_timer.function = NULL; + + qdisc_wakeup(sch->dev); +} + +static 
__inline__ void +csz_move_queue(struct csz_flow *this, long delta) +{ + this->fprev->fnext = this->fnext; + this->fnext->fprev = this->fprev; + + this->start += delta; + this->finish += delta; + + csz_insert_finish(this); +} + +static __inline__ int csz_enough_tokens(struct csz_sched_data *q, + struct csz_flow *this, + struct sk_buff *skb) +{ + long toks; + long shift; + psched_time_t now; + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len); + + shift = 0; + if (this->throttled) { + /* Remember aposteriory delay */ + + unsigned long R = csz_update(q); + shift = R - this->R_tbf; + this->R_tbf = R; + } + + if (toks >= 0) { + /* Now we have enough tokens to proceed */ + + this->tokens = toks <= this->depth ? toks ? this->depth; + this->t_tbf = now; + + if (!this->throttled) + return 1; + + /* Flow was throttled. Update its start&finish numbers + with delay calculated aposteriori. + */ + + this->throttled = 0; + if (shift > 0) + csz_move_queue(this, shift); + return 1; + } + + if (!this->throttled) { + /* Flow has just been throttled; remember + current round number to calculate aposteriori delay + */ + this->throttled = 1; + this->R_tbf = csz_update(q); + } + + /* Move all the queue to the time when it will be allowed to send. + We should translate time to round number, but it is impossible, + so that we made the most conservative estimate i.e. we suppose + that only this flow is active and, hence, R = t. + Really toks <= R <= toks/r_a. + + This apriory shift in R will be adjusted later to reflect + real delay. We cannot avoid it because of: + - throttled flow continues to be active from the viewpoint + of CSZ, so that it would acquire highest priority, + if you not adjusted start numbers. + - Eventually, finish number would become less than round + number and flow were declared inactive. 
+ */ + + toks = -toks; + + /* Remeber, that we should start watchdog */ + if (toks < q->wd_expires) + q->wd_expires = toks; + + toks >>= q->R_log; + shift += toks; + if (shift > 0) { + this->R_tbf += toks; + csz_move_queue(this, shift); + } + csz_insert_start(this); + return 0; +} +#endif + + +static struct sk_buff * +csz_dequeue(struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct sk_buff *skb; + struct csz_flow *this; + +#ifdef CSZ_PLUS_TBF + q->wd_expires = 0; +#endif + this = (struct csz_flow*)q->s.snext; + + while (this != (struct csz_flow*)&q->s) { + + /* First of all: unlink from start list */ + this->sprev->snext = this->snext; + this->snext->sprev = this->sprev; + + if (this != &q->flow[0]) { /* Guaranteed flow */ + skb = __skb_dequeue(&this->q); + if (skb) { +#ifdef CSZ_PLUS_TBF + if (this->depth) { + if (!csz_enough_tokens(q, this, skb)) + continue; + } +#endif + if (this->q.qlen) { + struct sk_buff *nskb = skb_peek(&this->q); + this->start += L2R(q,this,nskb->len); + csz_insert_start(&q->s, this); + } + sch->q.qlen--; + return skb; + } + } else { /* Predicted or best effort flow */ + skb = skb_dequeue_best(q); + if (skb) { + unsigned peeked = this->peeked; + this->peeked = 0; + + if (--this->q.qlen) { + struct sk_buff *nskb; + unsigned dequeued = L2R(q,this,skb->len); + + /* We got not the same thing that + peeked earlier; adjust start number + */ + if (peeked != dequeued && peeked) + this->start += dequeued - peeked; + + nskb = skb_peek_best(q); + peeked = L2R(q,this,nskb->len); + this->start += peeked; + this->peeked = peeked; + csz_insert_start(&q->s, this); + } + sch->q.qlen--; + return skb; + } + } + } +#ifdef CSZ_PLUS_TBF + /* We are about to return no skb. + Schedule watchdog timer, if it occured because of shaping. 
+ */ + if (q->wd_expires) { + if (q->wd_timer.function) + del_timer(&q->wd_timer); + q->wd_timer.function = csz_watchdog; + q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires); + add_timer(&q->wd_timer); + } +#endif + return NULL; +} + +static void +csz_reset(struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct sk_buff *skb; + int i; + + for (i=0; i<4; i++) + while ((skb=skb_dequeue(&q->other[i])) != NULL) + kfree_skb(skb, 0); + + for (i=0; i<CSZ_MAX_GUARANTEED; i++) { + struct csz_flow *this = q->flow + i; + while ((skb = skb_dequeue(&this->q)) != NULL) + kfree_skb(skb, FREE_WRITE); + this->snext = this->sprev = + this->fnext = this->fprev = (struct csz_head*)this; + this->start = this->finish = 0; + } + q->s.snext = q->s.sprev = &q->s; + q->f.fnext = q->f.fprev = &q->f; + q->R_c = 0; +#ifdef CSZ_PLUS_TBF + PSCHED_GET_TIME(&q->t_tbf); + q->tokens = q->depth; + if (q->wd_timer.function) { + del_timer(&q->wd_timer); + q->wd_timer.function = NULL; + } +#endif + sch->q.qlen = 0; +} + +static void +csz_destroy(struct Qdisc* sch) +{ +/* + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + int i; + + for (i=0; i<4; i++) + qdisc_destroy(q->other[i]); + */ +} + +static int csz_init(struct Qdisc *sch, void *arg) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct cszinitctl *ctl = (struct cszinitctl*)arg; + int i; + + for (i=0; i<4; i++) + skb_queue_head_init(&q->other[i]); + + for (i=0; i<CSZ_MAX_GUARANTEED; i++) { + struct csz_flow *this = q->flow + i; + skb_queue_head_init(&this->q); + this->snext = this->sprev = + this->fnext = this->fprev = (struct csz_head*)this; + this->start = this->finish = 0; + } + q->s.snext = q->s.sprev = &q->s; + q->f.fnext = q->f.fprev = &q->f; + q->R_c = 0; +#ifdef CSZ_PLUS_TBF + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; +#endif + if (ctl) { + if (ctl->flows != CSZ_MAX_GUARANTEED) + return -EINVAL; + q->cell_log = ctl->cell_log; + } + return 0; +} + +static int csz_control(struct Qdisc *sch, struct pschedctl *gctl) +{ +/* + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct cszctl *ctl = (struct cszctl*)gctl->arg; + struct sk_buff *skb; + int i; + + if (op == PSCHED_TC_ATTACH) { + + } +*/ + return 0; +} + + + + +struct Qdisc_ops csz_ops = +{ + NULL, + "csz", + 0, + sizeof(struct csz_sched_data), + csz_enqueue, + csz_dequeue, + csz_reset, + csz_destroy, + csz_init, + csz_control, +}; + + +#ifdef MODULE +#include <linux/module.h> +int init_module(void) +{ + int err; + + /* Load once and never free it. 
*/ + MOD_INC_USE_COUNT; + + err = register_qdisc(&csz_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c new file mode 100644 index 000000000..8134baf16 --- /dev/null +++ b/net/sched/sch_fifo.c @@ -0,0 +1,179 @@ +/* + * net/sched/sch_fifo.c Simple FIFO "scheduler" + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* 1 band FIFO pseudo-"scheduler" */ + +struct fifo_sched_data +{ + int qmaxbytes; + int qmaxlen; + int qbytes; +}; + +static int +bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + + if (q->qbytes <= q->qmaxbytes) { + skb_queue_tail(&sch->q, skb); + q->qbytes += skb->len; + return 0; + } + kfree_skb(skb, FREE_WRITE); + return 1; +} + +static struct sk_buff * +bfifo_dequeue(struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + struct sk_buff *skb; + + skb = skb_dequeue(&sch->q); + if (skb) + q->qbytes -= skb->len; + return skb; +} + +static void +bfifo_reset(struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + struct sk_buff *skb; + + while((skb=skb_dequeue(&sch->q)) != NULL) { + q->qbytes -= skb->len; + kfree_skb(skb,FREE_WRITE); + } + if (q->qbytes) { + printk("fifo_reset: qbytes=%d\n", q->qbytes); + q->qbytes = 0; + } +} + +static int +pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + + if (sch->q.qlen <= q->qmaxlen) { + skb_queue_tail(&sch->q, skb); + return 0; + } + kfree_skb(skb, FREE_WRITE); + return 1; +} + +static struct sk_buff * +pfifo_dequeue(struct Qdisc* sch) +{ + return skb_dequeue(&sch->q); +} + +static void +pfifo_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while((skb=skb_dequeue(&sch->q))!=NULL) + kfree_skb(skb,FREE_WRITE); +} + + +static int fifo_init(struct Qdisc *sch, void *arg /* int bytes, int pkts */) +{ + struct fifo_sched_data *q; +/* + struct device *dev = sch->dev; + */ + + q = (struct fifo_sched_data *)sch->data; +/* + if (pkts<0) + pkts = dev->tx_queue_len; + if (bytes<0) + bytes = pkts*dev->mtu; + q->qmaxbytes = bytes; + q->qmaxlen = pkts; + */ + return 0; +} + +struct Qdisc_ops pfifo_ops = +{ + NULL, + "pfifo", + 0, + sizeof(struct fifo_sched_data), + pfifo_enqueue, + pfifo_dequeue, + pfifo_reset, + NULL, + fifo_init, +}; + +struct Qdisc_ops bfifo_ops = +{ + NULL, + "pfifo", + 0, + sizeof(struct fifo_sched_data), + bfifo_enqueue, + bfifo_dequeue, + bfifo_reset, + NULL, + fifo_init, +}; + +#ifdef MODULE +#include <linux/module.h> +int init_module(void) +{ + int err; + + /* Load once and never free it. 
*/ + MOD_INC_USE_COUNT; + + err = register_qdisc(&pfifo_ops); + if (err == 0) { + err = register_qdisc(&bfifo_ops); + if (err) + unregister_qdisc(&pfifo_ops); + } + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c new file mode 100644 index 000000000..83aa8d10e --- /dev/null +++ b/net/sched/sch_generic.c @@ -0,0 +1,541 @@ +/* + * net/sched/sch_generic.c Generic packet scheduler routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +struct Qdisc_head qdisc_head = { &qdisc_head }; + +static struct Qdisc_ops *qdisc_base = NULL; + +/* NOTES. + + Every discipline has two major routines: enqueue and dequeue. + + ---dequeue + + dequeue usually returns a skb to send. It is allowed to return NULL, + but it does not mean that queue is empty, it just means that + discipline does not want to send anything this time. + Queue is really empty if q->q.qlen == 0. + For complicated disciplines with multiple queues q->q is not + real packet queue, but however q->q.qlen must be valid. + + ---enqueue + + enqueue returns number of enqueued packets i.e. this number is 1, + if packet was enqueued sucessfully and <1 if something (not + necessary THIS packet) was dropped. + + */ + +int register_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (strcmp(qops->id, q->id) == 0) + return -EEXIST; + qops->next = NULL; + qops->refcnt = 0; + *qp = qops; + return 0; +} + +int unregister_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (q == qops) + break; + if (!q) + return -ENOENT; + *qp = q->next; + return 0; +} + +struct Qdisc *qdisc_lookup(int handle) +{ + return NULL; +} + + +/* "NOOP" scheduler: the best scheduler, recommended for all interfaces + in all curcumstances. It is difficult to invent anything more + fast or cheap. 
+ */ + +static int +noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +{ + kfree_skb(skb, FREE_WRITE); + return 0; +} + +static struct sk_buff * +noop_dequeue(struct Qdisc * qdisc) +{ + return NULL; +} + +struct Qdisc noop_qdisc = +{ + { NULL }, + noop_enqueue, + noop_dequeue, +}; + +struct Qdisc noqueue_qdisc = +{ + { NULL }, + NULL, + NULL, +}; + + +/* 3-band FIFO queue: old style, but should be a bit faster (several CPU insns) */ + +static int +pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; + struct sk_buff_head *list; + + list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7]; + + if (list->qlen <= skb->dev->tx_queue_len) { + skb_queue_tail(list, skb); + return 1; + } + qdisc->dropped++; + kfree_skb(skb, FREE_WRITE); + return 0; +} + +static struct sk_buff * +pfifo_fast_dequeue(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data); + struct sk_buff *skb; + + for (prio = 0; prio < 3; prio++, list++) { + skb = skb_dequeue(list); + if (skb) + return skb; + } + return NULL; +} + +static void +pfifo_fast_reset(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data); + + for (prio=0; prio < 3; prio++) + skb_queue_purge(list+prio); +} + +static int pfifo_fast_init(struct Qdisc *qdisc, void *arg) +{ + int i; + struct sk_buff_head *list; + + list = ((struct sk_buff_head*)qdisc->data); + + for(i=0; i<3; i++) + skb_queue_head_init(list+i); + + return 0; +} + +static struct Qdisc_ops pfifo_fast_ops = +{ + NULL, + "pfifo_fast", + 1, + 3 * sizeof(struct sk_buff_head), + pfifo_fast_enqueue, + pfifo_fast_dequeue, + pfifo_fast_reset, + NULL, + pfifo_fast_init +}; + +static struct Qdisc * +qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg) +{ + struct Qdisc *sch; + int size = sizeof(*sch) + ops->priv_size; + + sch = kmalloc(size, GFP_KERNEL); + if (!sch) + return NULL; + memset(sch, 0, size); + + skb_queue_head_init(&sch->q); + skb_queue_head_init(&sch->failure_q); + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + if (ops->init && ops->init(sch, arg)) + return NULL; + ops->refcnt++; + return sch; +} + +void qdisc_reset(struct Qdisc *qdisc) +{ + struct Qdisc_ops *ops = qdisc->ops; + if (ops) { + start_bh_atomic(); + if (ops->reset) + ops->reset(qdisc); + skb_queue_purge(&qdisc->failure_q); + end_bh_atomic(); + } +} + +void qdisc_destroy(struct Qdisc *qdisc) +{ + struct Qdisc_ops *ops = qdisc->ops; + if (ops) { + start_bh_atomic(); + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + skb_queue_purge(&qdisc->failure_q); + ops->refcnt--; + end_bh_atomic(); + kfree(qdisc); + } +} + +static void dev_do_watchdog(unsigned long dummy); + +static struct timer_list dev_watchdog = + { NULL, NULL, 0L, 0L, &dev_do_watchdog }; + +static void dev_do_watchdog(unsigned long dummy) +{ + struct Qdisc_head *h; + + for (h = qdisc_head.forw; h != &qdisc_head; h = h->forw) { + struct Qdisc *q = (struct Qdisc*)h; + struct device *dev = q->dev; + if (dev->tbusy && jiffies - q->tx_last > q->tx_timeo) { + qdisc_restart(dev); + } + } + dev_watchdog.expires = jiffies + 5*HZ; + add_timer(&dev_watchdog); +} + + +void dev_activate(struct device *dev) +{ + /* No queueing discipline is attached to device; + create default one i.e. 
pfifo_fast for devices, + which need queueing and noqueue_qdisc for + virtual intrfaces + */ + + if (dev->qdisc_sleeping == &noop_qdisc) { + if (dev->tx_queue_len) { + struct Qdisc *qdisc; + qdisc = qdisc_alloc(dev, &pfifo_fast_ops, NULL); + if (qdisc == NULL) + return; + dev->qdisc_sleeping = qdisc; + } else + dev->qdisc_sleeping = &noqueue_qdisc; + } + + start_bh_atomic(); + if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) { + dev->qdisc->tx_timeo = 5*HZ; + dev->qdisc->tx_last = jiffies - dev->qdisc->tx_timeo; + if (!dev_watchdog.expires) { + dev_watchdog.expires = jiffies + 5*HZ; + add_timer(&dev_watchdog); + } + } + end_bh_atomic(); +} + +void dev_deactivate(struct device *dev) +{ + struct Qdisc *qdisc; + + start_bh_atomic(); + + qdisc = dev->qdisc; + dev->qdisc = &noop_qdisc; + + qdisc_reset(qdisc); + + if (qdisc->h.forw) { + struct Qdisc_head **hp, *h; + + for (hp = &qdisc_head.forw; (h = *hp) != &qdisc_head; hp = &h->forw) { + if (h == &qdisc->h) { + *hp = h->forw; + break; + } + } + } + + end_bh_atomic(); +} + +void dev_init_scheduler(struct device *dev) +{ + dev->qdisc = &noop_qdisc; + dev->qdisc_sleeping = &noop_qdisc; +} + +void dev_shutdown(struct device *dev) +{ + struct Qdisc *qdisc; + + start_bh_atomic(); + qdisc = dev->qdisc_sleeping; + dev->qdisc_sleeping = &noop_qdisc; + qdisc_destroy(qdisc); + end_bh_atomic(); +} + +void dev_set_scheduler(struct device *dev, struct Qdisc *qdisc) +{ + struct Qdisc *oqdisc; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + start_bh_atomic(); + oqdisc = dev->qdisc_sleeping; + + /* Destroy old scheduler */ + if (oqdisc) + qdisc_destroy(oqdisc); + + /* ... and attach new one */ + dev->qdisc_sleeping = qdisc; + dev->qdisc = &noop_qdisc; + end_bh_atomic(); + + if (dev->flags & IFF_UP) + dev_activate(dev); +} + +/* Kick the queue "q". + Note, that this procedure is called by watchdog timer, so that + we do not check dev->tbusy flag here. + + Returns: 0 - queue is empty. + >0 - queue is not empty, but throttled. + <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + + NOTE: Called only from NET BH +*/ + + +int qdisc_restart(struct device *dev) +{ + struct Qdisc *q = dev->qdisc; + struct sk_buff *skb; + + skb = skb_dequeue(&q->failure_q); + if (!skb) { + skb = q->dequeue(q); + if (netdev_nit && skb) + dev_queue_xmit_nit(skb,dev); + } + if (skb) { + if (dev->hard_start_xmit(skb, dev) == 0) { + q->tx_last = jiffies; + return -1; + } +#if 0 + if (net_ratelimit()) + printk(KERN_DEBUG "netdevice %s defers output.\n", dev->name); +#endif + skb_queue_head(&q->failure_q, skb); + return -1; + } + return q->q.qlen; +} + +void qdisc_run_queues(void) +{ + struct Qdisc_head **hp, *h; + + hp = &qdisc_head.forw; + while ((h = *hp) != &qdisc_head) { + int res = -1; + struct Qdisc *q = (struct Qdisc*)h; + struct device *dev = q->dev; + + while (!dev->tbusy && (res = qdisc_restart(dev)) < 0) + /* NOTHING */; + + /* The explanation is necessary here. + qdisc_restart called dev->hard_start_xmit, + if device is virtual, it could trigger one more + dev_queue_xmit and new device could appear + in active chain. In this case we cannot unlink + empty queue, because we lost back pointer. + No problem, we will unlink it during the next round. 
+ */ + + if (res == 0 && *hp == h) { + *hp = h->forw; + h->forw = NULL; + continue; + } + hp = &h->forw; + } +} + + +int tc_init(struct pschedctl *pctl) +{ + struct Qdisc *q; + struct Qdisc_ops *qops; + + if (pctl->handle) { + q = qdisc_lookup(pctl->handle); + if (q == NULL) + return -ENOENT; + qops = q->ops; + if (pctl->ifindex && q->dev->ifindex != pctl->ifindex) + return -EINVAL; + } + return -EINVAL; +} + +int tc_destroy(struct pschedctl *pctl) +{ + return -EINVAL; +} + +int tc_attach(struct pschedctl *pctl) +{ + return -EINVAL; +} + +int tc_detach(struct pschedctl *pctl) +{ + return -EINVAL; +} + + +int psched_ioctl(void *arg) +{ + struct pschedctl ctl; + struct pschedctl *pctl = &ctl; + int err; + + if (copy_from_user(&ctl, arg, sizeof(ctl))) + return -EFAULT; + + if (ctl.arglen > 0) { + pctl = kmalloc(sizeof(ctl) + ctl.arglen, GFP_KERNEL); + if (pctl == NULL) + return -ENOBUFS; + memcpy(pctl, &ctl, sizeof(ctl)); + if (copy_from_user(pctl->args, ((struct pschedctl*)arg)->args, ctl.arglen)) { + kfree(pctl); + return -EFAULT; + } + } + + rtnl_lock(); + + switch (ctl.command) { + case PSCHED_TC_INIT: + err = tc_init(pctl); + break; + case PSCHED_TC_DESTROY: + err = tc_destroy(pctl); + break; + case PSCHED_TC_ATTACH: + err = tc_attach(pctl); + break; + case PSCHED_TC_DETACH: + err = tc_detach(pctl); + break; + default: + err = -EINVAL; + } + + rtnl_unlock(); + + if (pctl != &ctl) + kfree(pctl); + return err; +} + +__initfunc(int pktsched_init(void)) +{ +#define INIT_QDISC(name) { \ + extern struct Qdisc_ops name##_ops; \ + register_qdisc(&##name##_ops); \ + } + + skb_queue_head_init(&noop_qdisc.failure_q); + skb_queue_head_init(&noqueue_qdisc.failure_q); + + register_qdisc(&pfifo_fast_ops); +#ifdef CONFIG_NET_SCH_CBQ + INIT_QDISC(cbq); +#endif +#ifdef CONFIG_NET_SCH_CSZ + INIT_QDISC(csz); +#endif +#ifdef CONFIG_NET_SCH_RED + INIT_QDISC(red); +#endif +#ifdef CONFIG_NET_SCH_SFQ + INIT_QDISC(sfq); +#endif +#ifdef CONFIG_NET_SCH_TBF + INIT_QDISC(tbf); +#endif +#ifdef CONFIG_NET_SCH_PFIFO + INIT_QDISC(pfifo); + INIT_QDISC(bfifo); +#endif +#ifdef CONFIG_NET_SCH_PRIO + INIT_QDISC(prio); +#endif + return 0; +} diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c new file mode 100644 index 000000000..a3806eda4 --- /dev/null +++ b/net/sched/sch_prio.c @@ -0,0 +1,146 @@ +/* + * net/sched/sch_prio.c Simple 3-band priority "scheduler". 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* New N-band generic scheduler */ + +struct prio_sched_data +{ + int qbytes; + int bands; + u8 prio2band[8]; + struct Qdisc *queues[8]; +}; + +static int +prio_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio = q->prio2band[skb->priority&7]; + struct Qdisc *qdisc; + + qdisc = q->queues[prio]; + if (qdisc->enqueue(skb, qdisc) == 0) { + q->qbytes += skb->len; + sch->q.qlen++; + return 0; + } + return 1; +} + +static struct sk_buff * +prio_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + struct Qdisc *qdisc; + + for (prio = 0; prio < q->bands; prio++) { + qdisc = q->queues[prio]; + skb = qdisc->dequeue(qdisc); + if (skb) { + q->qbytes -= skb->len; + sch->q.qlen--; + return skb; + } + } + return NULL; + +} + +static void +prio_reset(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + for (prio=0; prio<q->bands; prio++) + qdisc_reset(q->queues[prio]); + q->qbytes = 0; +} + +static void +prio_destroy(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + for (prio=0; prio<q->bands; prio++) { + qdisc_destroy(q->queues[prio]); + q->queues[prio] = &noop_qdisc; + } +} + +static int prio_init(struct Qdisc *sch, void *arg) +{ + const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; + struct prio_sched_data *q; + int i; + + q = (struct prio_sched_data *)sch->data; + q->bands = 3; + memcpy(q->prio2band, prio2band, sizeof(prio2band)); + for (i=0; i<q->bands; i++) + q->queues[i] = &noop_qdisc; + return 0; +} + +struct Qdisc_ops prio_ops = +{ + NULL, + "prio", + 0, + sizeof(struct prio_sched_data), + prio_enqueue, + prio_dequeue, + prio_reset, + prio_destroy, + prio_init, +}; + +#ifdef MODULE +#include <linux/module.h> +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&prio_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c new file mode 100644 index 000000000..fd3ee43ac --- /dev/null +++ b/net/sched/sch_red.c @@ -0,0 +1,303 @@ +/* + * net/sched/sch_red.c Random Early Detection scheduler. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Random Early Detection (RED) algorithm. + ======================================= + + Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways + for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. + + This file codes a "divisionless" version of RED algorithm + written down in Fig.17 of the paper. + +Short description. +------------------ + + When new packet arrives we calculate average queue length: + + avg = (1-W)*avg + W*current_queue_len, + + W is filter time constant (choosen as 2^(-Wlog)), controlling + inertia of algorithm. To allow larger bursts, W should be + decreased. + + if (avg > th_max) -> packet marked (dropped). + if (avg < th_min) -> packet passes. + if (th_min < avg < th_max) we calculate probability: + + Pb = max_P * (avg - th_min)/(th_max-th_min) + + and mark (drop) packet with this probability. + Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). + max_P should be small (not 1!). + + NB. SF&VJ assumed that Pb[avg] is linear function. I think it + is wrong. I'd make: + P[th_min] = 0, P[th_max] = 1; + dP/davg[th_min] = 0, dP/davg[th_max] = infinity, or a large number. + + I choose max_P as a number between 0.01 and 0.1, so that + C1 = max_P/(th_max-th_min) is power of two: C1 = 2^(-C1log) + + Parameters, settable by user (with default values): + + qmaxbytes=256K - hard limit on queue length, should be chosen >qth_max + to allow packet bursts. This parameter does not + affect algorithm behaviour and can be chosen + arbitrarily high (well, less than ram size) + Really, this limit will never be achieved + if RED works correctly. + qth_min=32K + qth_max=128K - qth_max should be at least 2*qth_min + Wlog=8 - log(1/W). + Alog=Wlog - fixed point position in th_min and th_max. + Rlog=10 + C1log=24 - C1log = trueC1log+Alog-Rlog + so that trueC1log=22 and max_P~0.02 + + +NOTES: + +Upper bound on W. +----------------- + + If you want to allow bursts of L packets of size S, + you should choose W: + + L + 1 -th_min/S < (1-(1-W)^L)/W + + For th_min/S = 32 + + log(W) L + -1 33 + -2 35 + -3 39 + -4 46 + -5 57 + -6 75 + -7 101 + -8 135 + -9 190 + etc. 
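+
+A worked example of the drop probability.
+-----------------------------------------
+
+   With the default parameters above, suppose the average settles at
+   avg = 80K, half way between qth_min=32K and qth_max=128K.  Then
+
+   Pb = max_P * (80K - 32K)/(128K - 32K) = max_P/2 ~ 0.01,
+
+   i.e. roughly one early drop per hundred packets while the average
+   stays there.  red_enqueue() below gets the same result without a
+   division: a packet is dropped once
+   ((qave - qth_min)*qcount) >> C1log reaches the cached random
+   number qR, drawn from [0, 2^Rlog).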
+ */ + +struct red_sched_data +{ +/* Parameters */ + unsigned long qmaxbytes; /* HARD maximal queue length */ + unsigned long qth_min; /* Min average length threshold: A scaled */ + unsigned long qth_max; /* Max average length threshold: A scaled */ + char Alog; /* Point position in average lengths */ + char Wlog; /* log(W) */ + char Rlog; /* random number bits */ + char C1log; /* log(1/C1) */ + char Slog; + char Stab[256]; + +/* Variables */ + unsigned long qbytes; /* Queue length in bytes */ + unsigned long qave; /* Average queue length: A scaled */ + int qcount; /* Packets since last random number generation */ + unsigned qR; /* Cached random number [0..1<Rlog) */ + psched_time_t qidlestart; /* Start of idle period */ +}; + +/* Stolen from igmp.c. */ + +static __inline__ unsigned red_random(int log) +{ + static unsigned long seed=152L; + seed=seed*69069L+1; + return (seed^jiffies)&((1<<log)-1); +} + +static int +red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + psched_time_t now; + + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long us_idle; + PSCHED_SET_PASTPERFECT(q->qidlestart); + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, (256<<q->Slog)-1, 0); + +/* It is wrong, but I do not think that SF+VJ proposal is reasonable + and did not invented anything more clever 8) + + The problem: ideally, average length queue recalcultion should + be done over constant clock intervals. It is too expensive, so that + calculation is driven by outgoing packets. + When queue is idle we have to model this clock by hands. + + SF+VJ proposed to "generate" m = (idletime/bandwidth)*average_pkt_size + dummy packets as burst after idle time, i.e. + + q->qave *= (1-W)^m + + It is apparently overcomplicated solution (f.e. we have to precompute + a table to make this calculation for reasonable time) + I believe, that a simpler model may be used here, + but it is field for experiments. 
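+
+   What the statement below actually does: qave is shifted right by
+   Stab[us_idle>>Slog] bits, i.e. the average decays by a
+   user-supplied power of two depending on how long the queue stayed
+   idle.  The 256-byte Stab table and Slog are copied from userspace
+   in red_init().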
+*/ + q->qave >>= q->Stab[(us_idle>>q->Slog)&0xFF]; + } + + q->qave += ((q->qbytes<<q->Alog) - q->qave) >> q->Wlog; + + if (q->qave < q->qth_min) { +enqueue: + q->qcount = -1; + if (q->qbytes <= q->qmaxbytes) { + skb_queue_tail(&sch->q, skb); + q->qbytes += skb->len; + return 1; + } +drop: + kfree_skb(skb, FREE_WRITE); + return 0; + } + if (q->qave >= q->qth_max) { + q->qcount = -1; + goto drop; + } + q->qcount++; + if (q->qcount++) { + if ((((q->qave - q->qth_min)*q->qcount)>>q->C1log) < q->qR) + goto enqueue; + q->qcount = 0; + q->qR = red_random(q->Rlog); + goto drop; + } + q->qR = red_random(q->Rlog); + goto enqueue; +} + +static struct sk_buff * +red_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + skb = skb_dequeue(&sch->q); + if (skb) { + q->qbytes -= skb->len; + return skb; + } + PSCHED_GET_TIME(q->qidlestart); + return NULL; +} + +static void +red_reset(struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + struct sk_buff *skb; + + while((skb=skb_dequeue(&sch->q))!=NULL) { + q->qbytes -= skb->len; + kfree_skb(skb,FREE_WRITE); + } + if (q->qbytes) { + printk("red_reset: qbytes=%lu\n", q->qbytes); + q->qbytes = 0; + } + PSCHED_SET_PASTPERFECT(q->qidlestart); + q->qave = 0; + q->qcount = -1; +} + +static int red_init(struct Qdisc *sch, struct pschedctl *pctl) +{ + struct red_sched_data *q; + struct redctl *ctl = (struct redctl*)pctl->args; + + q = (struct red_sched_data *)sch->data; + + if (pctl->arglen < sizeof(struct redctl)) + return -EINVAL; + + q->Wlog = ctl->Wlog; + q->Alog = ctl->Alog; + q->Rlog = ctl->Rlog; + q->C1log = ctl->C1log; + q->Slog = ctl->Slog; + q->qth_min = ctl->qth_min; + q->qth_max = ctl->qth_max; + q->qmaxbytes = ctl->qmaxbytes; + memcpy(q->Stab, ctl->Stab, 256); + + q->qcount = -1; + PSCHED_SET_PASTPERFECT(q->qidlestart); + return 0; +} + +struct Qdisc_ops red_ops = +{ + NULL, + "red", + 0, + sizeof(struct red_sched_data), + red_enqueue, + red_dequeue, + red_reset, + NULL, + red_init, + NULL +}; + + +#ifdef MODULE +#include <linux/module.h> +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&red_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c new file mode 100644 index 000000000..65c3906b4 --- /dev/null +++ b/net/sched/sch_sfq.c @@ -0,0 +1,333 @@ +/* + * net/sched/sch_sfq.c Stochastic Fairness Queueing scheduler. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Stochastic Fairness Queuing algorithm. + ======================================= + + Source: + Paul E. McKenney "Stochastic Fairness Queuing", + IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. + + Paul E. McKenney "Stochastic Fairness Queuing", + "Interworking: Research and Experience", v.2, 1991, p.113-131. + + + See also: + M. Shreedhar and George Varghese "Efficient Fair + Queuing using Deficit Round Robin", Proc. SIGCOMM 95. + + + It is not the thing that usually called (W)FQ nowadays. It does not + use any timestamp mechanism, but instead processes queues + in round-robin order. + + ADVANTAGE: + + - It is very cheap. Both CPU and memory requirements are minimal. + + DRAWBACKS: + + - "Stochastic" -> It is not 100% fair. + When hash collisions occur, several flows are considred as one. + + - "Round-robin" -> It introduces larger delays than virtual clock + based schemes, and should not be used for isolation interactive + traffic from non-interactive. It means, that this scheduler + should be used as leaf of CBQ or P3, which put interactive traffic + to higher priority band. + + We still need true WFQ for top level CSZ, but using WFQ + for the best effort traffic is absolutely pointless: + SFQ is superior for this purpose. + + IMPLEMENTATION: + This implementation limits maximal queue length to 128; + maximal mtu to 2^15-1; number of hash buckets to 1024. + The only goal of this restrictions was that all data + fitted to one 4K page :-). Struct sfq_sched_data is + organized in anti-cache manner: all the data for bucket + scattered over different locations. It is not good, + but it allowed to put it into 4K. + + It is easy to increase these values. 
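+
+	Two things worth noting about the code below:
+
+	- SFQ_HASH() is still a trivial placeholder returning 0, so for
+	  the moment every packet falls into one slot until a real flow
+	  hash is plugged in.
+
+	- The round-robin is deficit based: each active slot is granted
+	  one quantum (device MTU plus hard header) per round and keeps
+	  the turn until its allotment goes non-positive, see
+	  sfq_dequeue().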
+*/ + +#define SFQ_DEPTH 128 +#define SFQ_HASH_DIVISOR 1024 + +#define SFQ_HASH(a) 0 + +/* This type should contain at least SFQ_DEPTH*2 values */ +typedef unsigned char sfq_index; + +struct sfq_head +{ + sfq_index next; + sfq_index prev; +}; + +struct sfq_sched_data +{ +/* Parameters */ + unsigned quantum; /* Allotment per round: MUST BE >= MTU */ + +/* Variables */ + sfq_index tail; /* Index of current slot in round */ + sfq_index max_depth; /* Maximal depth */ + + sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ + sfq_index next[SFQ_DEPTH]; /* Active slots link */ + short allot[SFQ_DEPTH]; /* Current allotment per slot */ + unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ + struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ + struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ +}; + +extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d = q->qs[x].qlen; + + p = d; + n = q->dep[d].next; + q->dep[x].next = n; + q->dep[x].prev = p; + q->dep[p].next = q->dep[n].prev = x; +} + +extern __inline__ void sfq_dec(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + + if (n == p && q->max_depth == q->qs[x].qlen + 1) + q->max_depth--; + + sfq_link(q, x); +} + +extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + d = q->qs[x].qlen; + if (q->max_depth < d) + q->max_depth = d; + + sfq_link(q, x); +} + +static __inline__ void sfq_drop(struct sfq_sched_data *q) +{ + struct sk_buff *skb; + sfq_index d = q->max_depth; + + /* Queue is full! Find the longest slot and + drop a packet from it */ + + if (d != 1) { + sfq_index x = q->dep[d].next; + skb = q->qs[x].prev; + __skb_unlink(skb, &q->qs[x]); + kfree_skb(skb, FREE_WRITE); + sfq_dec(q, x); +/* + sch->q.qlen--; + */ + return; + } + + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ + + d = q->next[q->tail]; + q->next[q->tail] = q->next[d]; + q->allot[q->next[d]] += q->quantum; + skb = q->qs[d].prev; + __skb_unlink(skb, &q->qs[d]); + kfree_skb(skb, FREE_WRITE); + sfq_dec(q, d); +/* + sch->q.qlen--; + */ + q->ht[q->hash[d]] = SFQ_DEPTH; + return; +} + +static int +sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned hash = SFQ_HASH(skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_tail(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < SFQ_DEPTH-1) + return 1; + + sfq_drop(q); + return 0; +} + +static struct sk_buff * +sfq_dequeue(struct Qdisc* sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + struct sk_buff *skb; + sfq_index a, old_a; + + /* No active slots */ + if (q->tail == SFQ_DEPTH) + return NULL; + + a = old_a = q->next[q->tail]; + + /* Grab packet */ + skb = __skb_dequeue(&q->qs[a]); + sfq_dec(q, a); + sch->q.qlen--; + + /* Is the slot empty? 
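+	   If so, unlink it from the round (deactivating the whole
+	   queue when it was the only active slot); otherwise charge
+	   the packet against its allotment and pass the turn on when
+	   the allotment is exhausted.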
*/ + if (q->qs[a].qlen == 0) { + a = q->next[a]; + if (a == old_a) { + q->tail = SFQ_DEPTH; + return skb; + } + q->next[q->tail] = a; + q->allot[a] += q->quantum; + } else if ((q->allot[a] -= skb->len) <= 0) { + q->tail = a; + a = q->next[a]; + q->allot[a] += q->quantum; + } + return skb; +} + +static void +sfq_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while ((skb = sfq_dequeue(sch)) != NULL) + kfree_skb(skb, FREE_WRITE); +} + + +static int sfq_open(struct Qdisc *sch, void *arg) +{ + struct sfq_sched_data *q; + int i; + + q = (struct sfq_sched_data *)sch->data; + + for (i=0; i<SFQ_HASH_DIVISOR; i++) + q->ht[i] = SFQ_DEPTH; + for (i=0; i<SFQ_DEPTH; i++) { + skb_queue_head_init(&q->qs[i]); + q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; + q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; + } + q->max_depth = 0; + q->tail = SFQ_DEPTH; + q->quantum = sch->dev->mtu; + if (sch->dev->hard_header) + q->quantum += sch->dev->hard_header_len; + for (i=0; i<SFQ_DEPTH; i++) + sfq_link(q, i); + return 0; +} + + +struct Qdisc_ops sfq_ops = +{ + NULL, + "sfq", + 0, + sizeof(struct sfq_sched_data), + sfq_enqueue, + sfq_dequeue, + sfq_reset, + NULL, + sfq_open, +}; + +#ifdef MODULE +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&sfq_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c new file mode 100644 index 000000000..9869af1d3 --- /dev/null +++ b/net/sched/sch_tbf.c @@ -0,0 +1,252 @@ +/* + * net/sched/sch_tbf.c Token Bucket Filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Simple Token Bucket Filter. + ======================================= + + SOURCE. + + None. + + ALGORITHM. + + Sequence of packets satisfy token bucket filter with + rate $r$ and depth $b$, if all the numbers defined by: + \begin{eqnarray*} + n_0 &=& b, \\ + n_i &=& {\rm max} ( b, n_{i-1} + r*(t_i-t_{i-1}) - L_i ), + \end{eqnarray*} + where $t_i$ --- departure time of $i$-th packet and + $L_i$ -- its length, never less than zero. + + It is convenient to rescale $n_i$ by factor $r$, so + that the sequence has "canonical" form: + \[ + n_0 = b/r, + n_i = max { b/r, n_{i-1} + t_i - t_{i-1} - L_i/r }, + \] + + If a packet has n_i < 0, we throttle filter + by $-n_i$ usecs. + + NOTES. + + If TBF throttles, it starts watchdog timer, which will wake up it + after 0...10 msec. + If no new packets will arrive during this period, + or device will not be awaken by EOI for previous packet, + tbf could stop its activity for 10 msec. 
+ + It means that tbf will sometimes introduce pathological + 10msec delays to flow corresponding to rate*10msec bytes. + For 10Mbit/sec flow it is about 12Kb, on 100Mbit/sec -- ~100Kb. + This number puts lower reasonbale bound on token bucket depth, + but even if depth is larger traffic is erratic at large rates. + + This problem is not specific for THIS implementation. Really, + there exists statement that any attempt to shape traffic + in transit will increase delays and jitter much more than + we expected naively. + + Particularily, it means that delay/jitter sensitive traffic + MUST NOT be shaped. Cf. CBQ (wrong) and CSZ (correct) approaches. +*/ + +struct tbf_sched_data +{ +/* Parameters */ + int cell_log; /* 1<<cell_log is quantum of packet size */ + unsigned long L_tab[256]; /* Lookup table for L/B values */ + unsigned long depth; /* Token bucket depth/B: MUST BE >= MTU/B */ + unsigned long max_bytes; /* Maximal length of backlog: bytes */ + +/* Variables */ + unsigned long bytes; /* Current length of backlog */ + unsigned long tokens; /* Current number of tokens */ + psched_time_t t_c; /* Time check-point */ + struct timer_list wd_timer; /* Watchdog timer */ +}; + +#define L2T(q,L) ((q)->L_tab[(L)>>(q)->cell_log]) + +static int +tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + + __skb_queue_tail(&sch->q, skb); + if ((q->bytes += skb->len) <= q->max_bytes) + return 1; + + /* Drop action: undo the things that we just made, + * i.e. make tail drop + */ + + __skb_unlink(skb, &sch->q); + q->bytes -= skb->len; + kfree_skb(skb, FREE_WRITE); + return 0; +} + +static void tbf_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + + q->wd_timer.function = NULL; + + qdisc_wakeup(sch->dev); +} + + +static struct sk_buff * +tbf_dequeue(struct Qdisc* sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->q); + + if (skb) { + psched_time_t now; + long toks; + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->depth, 0) + + q->tokens - L2T(q,skb->len); + + if (toks >= 0) { + q->t_c = now; + q->tokens = toks <= q->depth ? toks : q->depth; + q->bytes -= skb->len; + return skb; + } + + /* Maybe, we have in queue a shorter packet, + which can be sent now. It sounds cool, + but, however, wrong in principle. + We MUST NOT reorder packets in these curcumstances. + + Really, if we splitted flow to independent + subflows, it would be very good solution. + Look at sch_csz.c. 
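+
+	   So the packet goes back to the head of the queue and, unless
+	   the device is busy anyway, the watchdog is armed to fire in
+	   -toks usec, when enough tokens will have accumulated to send
+	   it.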
+ */ + __skb_queue_head(&sch->q, skb); + + if (!sch->dev->tbusy) { + if (q->wd_timer.function) + del_timer(&q->wd_timer); + q->wd_timer.function = tbf_watchdog; + q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(-toks); + add_timer(&q->wd_timer); + } + } + return NULL; +} + + +static void +tbf_reset(struct Qdisc* sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sch->q)) != NULL) + kfree_skb(skb, FREE_WRITE); + q->bytes = 0; + PSCHED_GET_TIME(q->t_c); + q->tokens = q->depth; + if (q->wd_timer.function) { + del_timer(&q->wd_timer); + q->wd_timer.function = NULL; + } +} + +static int tbf_init(struct Qdisc* sch, void *arg) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + struct tbfctl *ctl = (struct tbfctl*)arg; + + PSCHED_GET_TIME(q->t_c); + init_timer(&q->wd_timer); + q->wd_timer.function = NULL; + q->wd_timer.data = (unsigned long)sch; + if (ctl) { + q->max_bytes = ctl->bytes; + q->depth = ctl->depth; + q->tokens = q->tokens; + q->cell_log = ctl->cell_log; + memcpy(q->L_tab, ctl->L_tab, 256*sizeof(unsigned long)); + } + return 0; +} + +struct Qdisc_ops tbf_ops = +{ + NULL, + "tbf", + 0, + sizeof(struct tbf_sched_data), + tbf_enqueue, + tbf_dequeue, + tbf_reset, + NULL, + tbf_init, + NULL, +}; + + +#ifdef MODULE +#include <linux/module.h> +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&tbf_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif |
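As a rough illustration of the token-bucket arithmetic applied in tbf_dequeue() above, here is a stand-alone user-space sketch (an editorial illustration, not part of this commit). It follows the same idea: accumulate tokens for the elapsed time, cap them at the depth, spend L/r per packet, and throttle when the balance would go negative. The rate (1 Mbit/s), depth (20000 usec, ~2500 bytes) and the packet trace are made-up example values, and the kernel's fixed-point details (PSCHED_TDIFF_SAFE, L_tab[]) are simplified away.

/*
 * Editorial sketch, not part of the commit: a user-space replay of the
 * token-bucket decision made in tbf_dequeue().  Everything is in
 * microseconds: "depth" is b/r, and L/r is length * usec-per-byte,
 * roughly what the kernel precomputes in L_tab[] / L2T().
 */
#include <stdio.h>

#define DEPTH_US    20000L	/* b/r: bucket depth expressed in usec */
#define US_PER_BYTE     8L	/* 1/r: 1 Mbit/s -> 8 usec per byte    */

int main(void)
{
	/* arrival time (usec), length (bytes) -- a made-up bursty trace */
	static const struct { long t, len; } pkt[] = {
		{     0, 1500 }, {   100, 1500 }, {   200, 1500 },
		{ 20000, 1500 }, { 20100,   64 }, { 20200, 1500 },
	};
	long tokens = DEPTH_US;		/* bucket starts full, as after tbf_reset() */
	long t_c = 0;			/* time check-point, as q->t_c */
	unsigned i;

	for (i = 0; i < sizeof(pkt)/sizeof(pkt[0]); i++) {
		/* toks = min(depth, tokens + (now - t_c)) - L/r */
		long toks = tokens + (pkt[i].t - t_c);

		if (toks > DEPTH_US)
			toks = DEPTH_US;
		toks -= pkt[i].len * US_PER_BYTE;

		if (toks >= 0) {
			printf("t=%6ld us: send %4ld bytes, %5ld us of tokens left\n",
			       pkt[i].t, pkt[i].len, toks);
			tokens = toks;
			t_c = pkt[i].t;
		} else {
			/* sch_tbf.c would requeue the skb here and arm the
			   watchdog for -toks usec; the sketch just reports
			   the required delay and skips the packet. */
			printf("t=%6ld us: throttle %4ld bytes for %5ld us\n",
			       pkt[i].t, pkt[i].len, -toks);
		}
	}
	return 0;
}

With the values above, the first full-size packet is sent, the two that follow immediately are throttled for a few milliseconds, and after the 20 ms gap the refilled bucket again admits a burst until the tokens run out.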