diff options
Diffstat (limited to 'net/sched/sch_cbq.c')
-rw-r--r-- | net/sched/sch_cbq.c | 2124 |
1 files changed, 2124 insertions, 0 deletions
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c new file mode 100644 index 000000000000..d43e3b8cbf6a --- /dev/null +++ b/net/sched/sch_cbq.c @@ -0,0 +1,2124 @@ +/* + * net/sched/sch_cbq.c Class-Based Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Class-Based Queueing (CBQ) algorithm. + ======================================= + + Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource + Management Models for Packet Networks", + IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 + + [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 + + [3] Sally Floyd, "Notes on Class-Based Queueing: Setting + Parameters", 1996 + + [4] Sally Floyd and Michael Speer, "Experimental Results + for Class-Based Queueing", 1998, not published. + + ----------------------------------------------------------------------- + + Algorithm skeleton was taken from NS simulator cbq.cc. + If someone wants to check this code against the LBL version, + he should take into account that ONLY the skeleton was borrowed, + the implementation is different. Particularly: + + --- The WRR algorithm is different. Our version looks more + reasonable (I hope) and works when quanta are allowed to be + less than MTU, which is always the case when real time classes + have small rates. Note, that the statement of [3] is + incomplete, delay may actually be estimated even if class + per-round allotment is less than MTU. Namely, if per-round + allotment is W*r_i, and r_1+...+r_k = r < 1 + + delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B + + In the worst case we have IntServ estimate with D = W*r+k*MTU + and C = MTU*r. The proof (if correct at all) is trivial. + + + --- It seems that cbq-2.0 is not very accurate. At least, I cannot + interpret some places, which look like wrong translations + from NS. Anyone is advised to find these differences + and explain to me, why I am wrong 8). + + --- Linux has no EOI event, so that we cannot estimate true class + idle time. Workaround is to consider the next dequeue event + as sign that previous packet is finished. This is wrong because of + internal device queueing, but on a permanently loaded link it is true. + Moreover, combined with clock integrator, this scheme looks + very close to an ideal solution. */ + +struct cbq_sched_data; + + +struct cbq_class +{ + struct cbq_class *next; /* hash table link */ + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + +/* Parameters */ + u32 classid; + unsigned char priority; /* class priority */ + unsigned char priority2; /* priority to be used after overlimit */ + unsigned char ewma_log; /* time constant for idle time calculation */ + unsigned char ovl_strategy; +#ifdef CONFIG_NET_CLS_POLICE + unsigned char police; +#endif + + u32 defmap; + + /* Link-sharing scheduler parameters */ + long maxidle; /* Class parameters: see below. */ + long offtime; + long minidle; + u32 avpkt; + struct qdisc_rate_table *R_tab; + + /* Overlimit strategy parameters */ + void (*overlimit)(struct cbq_class *cl); + long penalty; + + /* General scheduler (WRR) parameters */ + long allot; + long quantum; /* Allotment per WRR round */ + long weight; /* Relative allotment: see below */ + + struct Qdisc *qdisc; /* Ptr to CBQ discipline */ + struct cbq_class *split; /* Ptr to split node */ + struct cbq_class *share; /* Ptr to LS parent in the class tree */ + struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ + struct cbq_class *borrow; /* NULL if class is bandwidth limited; + parent otherwise */ + struct cbq_class *sibling; /* Sibling chain */ + struct cbq_class *children; /* Pointer to children chain */ + + struct Qdisc *q; /* Elementary queueing discipline */ + + +/* Variables */ + unsigned char cpriority; /* Effective priority */ + unsigned char delayed; + unsigned char level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of children + 1 for nodes. + */ + + psched_time_t last; /* Last end of service */ + psched_time_t undertime; + long avgidle; + long deficit; /* Saved deficit for WRR */ + unsigned long penalized; + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + spinlock_t *stats_lock; + struct tc_cbq_xstats xstats; + + struct tcf_proto *filter_list; + + int refcnt; + int filters; + + struct cbq_class *defaults[TC_PRIO_MAX+1]; +}; + +struct cbq_sched_data +{ + struct cbq_class *classes[16]; /* Hash table of all classes */ + int nclasses[TC_CBQ_MAXPRIO+1]; + unsigned quanta[TC_CBQ_MAXPRIO+1]; + + struct cbq_class link; + + unsigned activemask; + struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes + with backlog */ + +#ifdef CONFIG_NET_CLS_POLICE + struct cbq_class *rx_class; +#endif + struct cbq_class *tx_class; + struct cbq_class *tx_borrowed; + int tx_len; + psched_time_t now; /* Cached timestamp */ + psched_time_t now_rt; /* Cached real time */ + unsigned pmask; + + struct timer_list delay_timer; + struct timer_list wd_timer; /* Watchdog timer, + started when CBQ has + backlog, but cannot + transmit just now */ + long wd_expires; + int toplevel; + u32 hgenerator; +}; + + +#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) + + +static __inline__ unsigned cbq_hash(u32 h) +{ + h ^= h>>8; + h ^= h>>4; + return h&0xF; +} + +static __inline__ struct cbq_class * +cbq_class_lookup(struct cbq_sched_data *q, u32 classid) +{ + struct cbq_class *cl; + + for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) + if (cl->classid == classid) + return cl; + return NULL; +} + +#ifdef CONFIG_NET_CLS_POLICE + +static struct cbq_class * +cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) +{ + struct cbq_class *cl, *new; + + for (cl = this->tparent; cl; cl = cl->tparent) + if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) + return new; + + return NULL; +} + +#endif + +/* Classify packet. The procedure is pretty complicated, but + it allows us to combine link sharing and priority scheduling + transparently. + + Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + so that it resolves to split nodes. Then packets are classified + by logical priority, or a more specific classifier may be attached + to the split node. + */ + +static struct cbq_class * +cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *head = &q->link; + struct cbq_class **defmap; + struct cbq_class *cl = NULL; + u32 prio = skb->priority; + struct tcf_result res; + + /* + * Step 1. If skb->priority points to one of our classes, use it. + */ + if (TC_H_MAJ(prio^sch->handle) == 0 && + (cl = cbq_class_lookup(q, prio)) != NULL) + return cl; + + *qerr = NET_XMIT_DROP; + for (;;) { + int result = 0; + defmap = head->defaults; + + /* + * Step 2+n. Apply classifier. + */ + if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) + goto fallback; + + if ((cl = (void*)res.class) == NULL) { + if (TC_H_MAJ(res.classid)) + cl = cbq_class_lookup(q, res.classid); + else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) + cl = defmap[TC_PRIO_BESTEFFORT]; + + if (cl == NULL || cl->level >= head->level) + goto fallback; + } + +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + } +#elif defined(CONFIG_NET_CLS_POLICE) + switch (result) { + case TC_POLICE_RECLASSIFY: + return cbq_reclassify(skb, cl); + case TC_POLICE_SHOT: + return NULL; + default: + break; + } +#endif + if (cl->level == 0) + return cl; + + /* + * Step 3+n. If classifier selected a link sharing class, + * apply agency specific classifier. + * Repeat this procdure until we hit a leaf node. + */ + head = cl; + } + +fallback: + cl = head; + + /* + * Step 4. No success... + */ + if (TC_H_MAJ(prio) == 0 && + !(cl = head->defaults[prio&TC_PRIO_MAX]) && + !(cl = head->defaults[TC_PRIO_BESTEFFORT])) + return head; + + return cl; +} + +/* + A packet has just been enqueued on the empty class. + cbq_activate_class adds it to the tail of active class list + of its priority band. + */ + +static __inline__ void cbq_activate_class(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + int prio = cl->cpriority; + struct cbq_class *cl_tail; + + cl_tail = q->active[prio]; + q->active[prio] = cl; + + if (cl_tail != NULL) { + cl->next_alive = cl_tail->next_alive; + cl_tail->next_alive = cl; + } else { + cl->next_alive = cl; + q->activemask |= (1<<prio); + } +} + +/* + Unlink class from active chain. + Note that this same procedure is done directly in cbq_dequeue* + during round-robin procedure. + */ + +static void cbq_deactivate_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + int prio = this->cpriority; + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + return; + } + } + + cl = cl_prev->next_alive; + return; + } + } while ((cl_prev = cl) != q->active[prio]); +} + +static void +cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + int toplevel = q->toplevel; + + if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { + psched_time_t now; + psched_tdiff_t incr; + + PSCHED_GET_TIME(now); + incr = PSCHED_TDIFF(now, q->now_rt); + PSCHED_TADD2(q->now, incr, now); + + do { + if (PSCHED_TLESS(cl->undertime, now)) { + q->toplevel = cl->level; + return; + } + } while ((cl=cl->borrow) != NULL && toplevel > cl->level); + } +} + +static int +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + int len = skb->len; + int ret; + struct cbq_class *cl = cbq_classify(skb, sch, &ret); + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = cl; +#endif + if (cl == NULL) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + +#ifdef CONFIG_NET_CLS_POLICE + cl->q->__parent = sch; +#endif + if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) { + sch->q.qlen++; + sch->bstats.packets++; + sch->bstats.bytes+=len; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return ret; + } + + sch->qstats.drops++; + cbq_mark_toplevel(q, cl); + cl->qstats.drops++; + return ret; +} + +static int +cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + int ret; + + if ((cl = q->tx_class) == NULL) { + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_CN; + } + q->tx_class = NULL; + + cbq_mark_toplevel(q, cl); + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = cl; + cl->q->__parent = sch; +#endif + if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->qstats.drops++; + cl->qstats.drops++; + return ret; +} + +/* Overlimit actions */ + +/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ + +static void cbq_ovl_classic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + + if (!cl->delayed) { + delay += cl->offtime; + + /* + Class goes to sleep, so that it will have no + chance to work avgidle. Let's forgive it 8) + + BTW cbq-2.0 has a crap in this + place, apparently they forgot to shift it by cl->ewma_log. + */ + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + if (delay <= 0) + delay = 1; + PSCHED_TADD2(q->now, delay, cl->undertime); + + cl->xstats.overactions++; + cl->delayed = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; + + /* Dirty work! We must schedule wakeups based on + real available rate, rather than leaf rate, + which may be tiny (even zero). + */ + if (q->toplevel == TC_CBQ_MAXLEVEL) { + struct cbq_class *b; + psched_tdiff_t base_delay = q->wd_expires; + + for (b = cl->borrow; b; b = b->borrow) { + delay = PSCHED_TDIFF(b->undertime, q->now); + if (delay < base_delay) { + if (delay <= 0) + delay = 1; + base_delay = delay; + } + } + + q->wd_expires = base_delay; + } +} + +/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when + they go overlimit + */ + +static void cbq_ovl_rclassic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *this = cl; + + do { + if (cl->level > q->toplevel) { + cl = NULL; + break; + } + } while ((cl = cl->borrow) != NULL); + + if (cl == NULL) + cl = this; + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ + +static void cbq_ovl_delay(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + + if (!cl->delayed) { + unsigned long sched = jiffies; + + delay += cl->offtime; + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + PSCHED_TADD2(q->now, delay, cl->undertime); + + if (delay > 0) { + sched += PSCHED_US2JIFFIE(delay) + cl->penalty; + cl->penalized = sched; + cl->cpriority = TC_CBQ_MAXPRIO; + q->pmask |= (1<<TC_CBQ_MAXPRIO); + if (del_timer(&q->delay_timer) && + (long)(q->delay_timer.expires - sched) > 0) + q->delay_timer.expires = sched; + add_timer(&q->delay_timer); + cl->delayed = 1; + cl->xstats.overactions++; + return; + } + delay = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; +} + +/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ + +static void cbq_ovl_lowprio(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + + cl->penalized = jiffies + cl->penalty; + + if (cl->cpriority != cl->priority2) { + cl->cpriority = cl->priority2; + q->pmask |= (1<<cl->cpriority); + cl->xstats.overactions++; + } + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DROP: penalize class by dropping */ + +static void cbq_ovl_drop(struct cbq_class *cl) +{ + if (cl->q->ops->drop) + if (cl->q->ops->drop(cl->q)) + cl->qdisc->q.qlen--; + cl->xstats.overactions++; + cbq_ovl_classic(cl); +} + +static void cbq_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + unsigned long now = jiffies; + unsigned long sched = now; + + if (cl_prev == NULL) + return now; + + do { + cl = cl_prev->next_alive; + if ((long)(now - cl->penalized) > 0) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + cl->cpriority = cl->priority; + cl->delayed = 0; + cbq_activate_class(cl); + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + return 0; + } + } + + cl = cl_prev->next_alive; + } else if ((long)(sched - cl->penalized) > 0) + sched = cl->penalized; + } while ((cl_prev = cl) != q->active[prio]); + + return (long)(sched - now); +} + +static void cbq_undelay(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct cbq_sched_data *q = qdisc_priv(sch); + long delay = 0; + unsigned pmask; + + pmask = q->pmask; + q->pmask = 0; + + while (pmask) { + int prio = ffz(~pmask); + long tmp; + + pmask &= ~(1<<prio); + + tmp = cbq_undelay_prio(q, prio); + if (tmp > 0) { + q->pmask |= 1<<prio; + if (tmp < delay || delay == 0) + delay = tmp; + } + } + + if (delay) { + q->delay_timer.expires = jiffies + delay; + add_timer(&q->delay_timer); + } + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + + +#ifdef CONFIG_NET_CLS_POLICE + +static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) +{ + int len = skb->len; + struct Qdisc *sch = child->__parent; + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = q->rx_class; + + q->rx_class = NULL; + + if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + + cbq_mark_toplevel(q, cl); + + q->rx_class = cl; + cl->q->__parent = sch; + + if (cl->q->enqueue(skb, cl->q) == 0) { + sch->q.qlen++; + sch->bstats.packets++; + sch->bstats.bytes+=len; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->qstats.drops++; + return 0; + } + + sch->qstats.drops++; + return -1; +} +#endif + +/* + It is mission critical procedure. + + We "regenerate" toplevel cutoff, if transmitting class + has backlog and it is not regulated. It is not part of + original CBQ description, but looks more reasonable. + Probably, it is wrong. This question needs further investigation. +*/ + +static __inline__ void +cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, + struct cbq_class *borrowed) +{ + if (cl && q->toplevel >= borrowed->level) { + if (cl->q->q.qlen > 1) { + do { + if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) { + q->toplevel = borrowed->level; + return; + } + } while ((borrowed=borrowed->borrow) != NULL); + } +#if 0 + /* It is not necessary now. Uncommenting it + will save CPU cycles, but decrease fairness. + */ + q->toplevel = TC_CBQ_MAXLEVEL; +#endif + } +} + +static void +cbq_update(struct cbq_sched_data *q) +{ + struct cbq_class *this = q->tx_class; + struct cbq_class *cl = this; + int len = q->tx_len; + + q->tx_class = NULL; + + for ( ; cl; cl = cl->share) { + long avgidle = cl->avgidle; + long idle; + + cl->bstats.packets++; + cl->bstats.bytes += len; + + /* + (now - last) is total time between packet right edges. + (last_pktlen/rate) is "virtual" busy time, so that + + idle = (now - last) - last_pktlen/rate + */ + + idle = PSCHED_TDIFF(q->now, cl->last); + if ((unsigned long)idle > 128*1024*1024) { + avgidle = cl->maxidle; + } else { + idle -= L2T(cl, len); + + /* true_avgidle := (1-W)*true_avgidle + W*idle, + where W=2^{-ewma_log}. But cl->avgidle is scaled: + cl->avgidle == true_avgidle/W, + hence: + */ + avgidle += idle - (avgidle>>cl->ewma_log); + } + + if (avgidle <= 0) { + /* Overlimit or at-limit */ + + if (avgidle < cl->minidle) + avgidle = cl->minidle; + + cl->avgidle = avgidle; + + /* Calculate expected time, when this class + will be allowed to send. + It will occur, when: + (1-W)*true_avgidle + W*delay = 0, i.e. + idle = (1/W - 1)*(-true_avgidle) + or + idle = (1 - W)*(-cl->avgidle); + */ + idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); + + /* + That is not all. + To maintain the rate allocated to the class, + we add to undertime virtual clock, + necessary to complete transmitted packet. + (len/phys_bandwidth has been already passed + to the moment of cbq_update) + */ + + idle -= L2T(&q->link, len); + idle += L2T(cl, len); + + PSCHED_AUDIT_TDIFF(idle); + + PSCHED_TADD2(q->now, idle, cl->undertime); + } else { + /* Underlimit */ + + PSCHED_SET_PASTPERFECT(cl->undertime); + if (avgidle > cl->maxidle) + cl->avgidle = cl->maxidle; + else + cl->avgidle = avgidle; + } + cl->last = q->now; + } + + cbq_update_toplevel(q, this, q->tx_borrowed); +} + +static __inline__ struct cbq_class * +cbq_under_limit(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *this_cl = cl; + + if (cl->tparent == NULL) + return cl; + + if (PSCHED_IS_PASTPERFECT(cl->undertime) || + !PSCHED_TLESS(q->now, cl->undertime)) { + cl->delayed = 0; + return cl; + } + + do { + /* It is very suspicious place. Now overlimit + action is generated for not bounded classes + only if link is completely congested. + Though it is in agree with ancestor-only paradigm, + it looks very stupid. Particularly, + it means that this chunk of code will either + never be called or result in strong amplification + of burstiness. Dangerous, silly, and, however, + no another solution exists. + */ + if ((cl = cl->borrow) == NULL) { + this_cl->qstats.overlimits++; + this_cl->overlimit(this_cl); + return NULL; + } + if (cl->level > q->toplevel) + return NULL; + } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && + PSCHED_TLESS(q->now, cl->undertime)); + + cl->delayed = 0; + return cl; +} + +static __inline__ struct sk_buff * +cbq_dequeue_prio(struct Qdisc *sch, int prio) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl_tail, *cl_prev, *cl; + struct sk_buff *skb; + int deficit; + + cl_tail = cl_prev = q->active[prio]; + cl = cl_prev->next_alive; + + do { + deficit = 0; + + /* Start round */ + do { + struct cbq_class *borrow = cl; + + if (cl->q->q.qlen && + (borrow = cbq_under_limit(cl)) == NULL) + goto skip_class; + + if (cl->deficit <= 0) { + /* Class exhausted its allotment per + this round. Switch to the next one. + */ + deficit = 1; + cl->deficit += cl->quantum; + goto next_class; + } + + skb = cl->q->dequeue(cl->q); + + /* Class did not give us any skb :-( + It could occur even if cl->q->q.qlen != 0 + f.e. if cl->q == "tbf" + */ + if (skb == NULL) + goto skip_class; + + cl->deficit -= skb->len; + q->tx_class = cl; + q->tx_borrowed = borrow; + if (borrow != cl) { +#ifndef CBQ_XSTATS_BORROWS_BYTES + borrow->xstats.borrows++; + cl->xstats.borrows++; +#else + borrow->xstats.borrows += skb->len; + cl->xstats.borrows += skb->len; +#endif + } + q->tx_len = skb->len; + + if (cl->deficit <= 0) { + q->active[prio] = cl; + cl = cl->next_alive; + cl->deficit += cl->quantum; + } + return skb; + +skip_class: + if (cl->q->q.qlen == 0 || prio != cl->cpriority) { + /* Class is empty or penalized. + Unlink it from active chain. + */ + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + /* Did cl_tail point to it? */ + if (cl == cl_tail) { + /* Repair it! */ + cl_tail = cl_prev; + + /* Was it the last class in this band? */ + if (cl == cl_tail) { + /* Kill the band! */ + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + if (cl->q->q.qlen) + cbq_activate_class(cl); + return NULL; + } + + q->active[prio] = cl_tail; + } + if (cl->q->q.qlen) + cbq_activate_class(cl); + + cl = cl_prev; + } + +next_class: + cl_prev = cl; + cl = cl->next_alive; + } while (cl_prev != cl_tail); + } while (deficit); + + q->active[prio] = cl_prev; + + return NULL; +} + +static __inline__ struct sk_buff * +cbq_dequeue_1(struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned activemask; + + activemask = q->activemask&0xFF; + while (activemask) { + int prio = ffz(~activemask); + activemask &= ~(1<<prio); + skb = cbq_dequeue_prio(sch, prio); + if (skb) + return skb; + } + return NULL; +} + +static struct sk_buff * +cbq_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + struct cbq_sched_data *q = qdisc_priv(sch); + psched_time_t now; + psched_tdiff_t incr; + + PSCHED_GET_TIME(now); + incr = PSCHED_TDIFF(now, q->now_rt); + + if (q->tx_class) { + psched_tdiff_t incr2; + /* Time integrator. We calculate EOS time + by adding expected packet transmission time. + If real time is greater, we warp artificial clock, + so that: + + cbq_time = max(real_time, work); + */ + incr2 = L2T(&q->link, q->tx_len); + PSCHED_TADD(q->now, incr2); + cbq_update(q); + if ((incr -= incr2) < 0) + incr = 0; + } + PSCHED_TADD(q->now, incr); + q->now_rt = now; + + for (;;) { + q->wd_expires = 0; + + skb = cbq_dequeue_1(sch); + if (skb) { + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + return skb; + } + + /* All the classes are overlimit. + + It is possible, if: + + 1. Scheduler is empty. + 2. Toplevel cutoff inhibited borrowing. + 3. Root class is overlimit. + + Reset 2d and 3d conditions and retry. + + Note, that NS and cbq-2.0 are buggy, peeking + an arbitrary class is appropriate for ancestor-only + sharing, but not for toplevel algorithm. + + Our version is better, but slower, because it requires + two passes, but it is unavoidable with top-level sharing. + */ + + if (q->toplevel == TC_CBQ_MAXLEVEL && + PSCHED_IS_PASTPERFECT(q->link.undertime)) + break; + + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_SET_PASTPERFECT(q->link.undertime); + } + + /* No packets in scheduler or nobody wants to give them to us :-( + Sigh... start watchdog timer in the last case. */ + + if (sch->q.qlen) { + sch->qstats.overlimits++; + if (q->wd_expires) { + long delay = PSCHED_US2JIFFIE(q->wd_expires); + if (delay <= 0) + delay = 1; + mod_timer(&q->wd_timer, jiffies + delay); + sch->flags |= TCQ_F_THROTTLED; + } + } + return NULL; +} + +/* CBQ class maintanance routines */ + +static void cbq_adjust_levels(struct cbq_class *this) +{ + if (this == NULL) + return; + + do { + int level = 0; + struct cbq_class *cl; + + if ((cl = this->children) != NULL) { + do { + if (cl->level > level) + level = cl->level; + } while ((cl = cl->sibling) != this->children); + } + this->level = level+1; + } while ((this = this->tparent) != NULL); +} + +static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + unsigned h; + + if (q->quanta[prio] == 0) + return; + + for (h=0; h<16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + /* BUGGGG... Beware! This expression suffer of + arithmetic overflows! + */ + if (cl->priority == prio) { + cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ + q->quanta[prio]; + } + if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { + printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); + cl->quantum = cl->qdisc->dev->mtu/2 + 1; + } + } + } +} + +static void cbq_sync_defmap(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *split = cl->split; + unsigned h; + int i; + + if (split == NULL) + return; + + for (i=0; i<=TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) + split->defaults[i] = NULL; + } + + for (i=0; i<=TC_PRIO_MAX; i++) { + int level = split->level; + + if (split->defaults[i]) + continue; + + for (h=0; h<16; h++) { + struct cbq_class *c; + + for (c = q->classes[h]; c; c = c->next) { + if (c->split == split && c->level < level && + c->defmap&(1<<i)) { + split->defaults[i] = c; + level = c->level; + } + } + } + } +} + +static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) +{ + struct cbq_class *split = NULL; + + if (splitid == 0) { + if ((split = cl->split) == NULL) + return; + splitid = split->classid; + } + + if (split == NULL || split->classid != splitid) { + for (split = cl->tparent; split; split = split->tparent) + if (split->classid == splitid) + break; + } + + if (split == NULL) + return; + + if (cl->split != split) { + cl->defmap = 0; + cbq_sync_defmap(cl); + cl->split = split; + cl->defmap = def&mask; + } else + cl->defmap = (cl->defmap&~mask)|(def&mask); + + cbq_sync_defmap(cl); +} + +static void cbq_unlink_class(struct cbq_class *this) +{ + struct cbq_class *cl, **clp; + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + + for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { + if (cl == this) { + *clp = cl->next; + cl->next = NULL; + break; + } + } + + if (this->tparent) { + clp=&this->sibling; + cl = *clp; + do { + if (cl == this) { + *clp = cl->sibling; + break; + } + clp = &cl->sibling; + } while ((cl = *clp) != this->sibling); + + if (this->tparent->children == this) { + this->tparent->children = this->sibling; + if (this->sibling == this) + this->tparent->children = NULL; + } + } else { + BUG_TRAP(this->sibling == this); + } +} + +static void cbq_link_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + unsigned h = cbq_hash(this->classid); + struct cbq_class *parent = this->tparent; + + this->sibling = this; + this->next = q->classes[h]; + q->classes[h] = this; + + if (parent == NULL) + return; + + if (parent->children == NULL) { + parent->children = this; + } else { + this->sibling = parent->children->sibling; + parent->children->sibling = this; + } +} + +static unsigned int cbq_drop(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl, *cl_head; + int prio; + unsigned int len; + + for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { + if ((cl_head = q->active[prio]) == NULL) + continue; + + cl = cl_head; + do { + if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { + sch->q.qlen--; + return len; + } + } while ((cl = cl->next_alive) != cl_head); + } + return 0; +} + +static void +cbq_reset(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + int prio; + unsigned h; + + q->activemask = 0; + q->pmask = 0; + q->tx_class = NULL; + q->tx_borrowed = NULL; + del_timer(&q->wd_timer); + del_timer(&q->delay_timer); + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_GET_TIME(q->now); + q->now_rt = q->now; + + for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) + q->active[prio] = NULL; + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + qdisc_reset(cl->q); + + cl->next_alive = NULL; + PSCHED_SET_PASTPERFECT(cl->undertime); + cl->avgidle = cl->maxidle; + cl->deficit = cl->quantum; + cl->cpriority = cl->priority; + } + } + sch->q.qlen = 0; +} + + +static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) +{ + if (lss->change&TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; + } + if (lss->change&TCF_CBQ_LSS_EWMA) + cl->ewma_log = lss->ewma_log; + if (lss->change&TCF_CBQ_LSS_AVPKT) + cl->avpkt = lss->avpkt; + if (lss->change&TCF_CBQ_LSS_MINIDLE) + cl->minidle = -(long)lss->minidle; + if (lss->change&TCF_CBQ_LSS_MAXIDLE) { + cl->maxidle = lss->maxidle; + cl->avgidle = lss->maxidle; + } + if (lss->change&TCF_CBQ_LSS_OFFTIME) + cl->offtime = lss->offtime; + return 0; +} + +static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]--; + q->quanta[cl->priority] -= cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]++; + q->quanta[cl->priority] += cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + + if (wrr->allot) + cl->allot = wrr->allot; + if (wrr->weight) + cl->weight = wrr->weight; + if (wrr->priority) { + cl->priority = wrr->priority-1; + cl->cpriority = cl->priority; + if (cl->priority >= cl->priority2) + cl->priority2 = TC_CBQ_MAXPRIO-1; + } + + cbq_addprio(q, cl); + return 0; +} + +static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) +{ + switch (ovl->strategy) { + case TC_CBQ_OVL_CLASSIC: + cl->overlimit = cbq_ovl_classic; + break; + case TC_CBQ_OVL_DELAY: + cl->overlimit = cbq_ovl_delay; + break; + case TC_CBQ_OVL_LOWPRIO: + if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || + ovl->priority2-1 <= cl->priority) + return -EINVAL; + cl->priority2 = ovl->priority2-1; + cl->overlimit = cbq_ovl_lowprio; + break; + case TC_CBQ_OVL_DROP: + cl->overlimit = cbq_ovl_drop; + break; + case TC_CBQ_OVL_RCLASSIC: + cl->overlimit = cbq_ovl_rclassic; + break; + default: + return -EINVAL; + } + cl->penalty = (ovl->penalty*HZ)/1000; + return 0; +} + +#ifdef CONFIG_NET_CLS_POLICE +static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) +{ + cl->police = p->police; + + if (cl->q->handle) { + if (p->police == TC_POLICE_RECLASSIFY) + cl->q->reshape_fail = cbq_reshape_fail; + else + cl->q->reshape_fail = NULL; + } + return 0; +} +#endif + +static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) +{ + cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); + return 0; +} + +static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct rtattr *tb[TCA_CBQ_MAX]; + struct tc_ratespec *r; + + if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 || + tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + r = RTA_DATA(tb[TCA_CBQ_RATE-1]); + + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) + return -EINVAL; + + q->link.refcnt = 1; + q->link.sibling = &q->link; + q->link.classid = sch->handle; + q->link.qdisc = sch; + if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + q->link.q = &noop_qdisc; + + q->link.priority = TC_CBQ_MAXPRIO-1; + q->link.priority2 = TC_CBQ_MAXPRIO-1; + q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; + q->link.overlimit = cbq_ovl_classic; + q->link.allot = psched_mtu(sch->dev); + q->link.quantum = q->link.allot; + q->link.weight = q->link.R_tab->rate.rate; + + q->link.ewma_log = TC_CBQ_DEF_EWMA; + q->link.avpkt = q->link.allot/2; + q->link.minidle = -0x7FFFFFFF; + q->link.stats_lock = &sch->dev->queue_lock; + + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = cbq_watchdog; + init_timer(&q->delay_timer); + q->delay_timer.data = (unsigned long)sch; + q->delay_timer.function = cbq_undelay; + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_GET_TIME(q->now); + q->now_rt = q->now; + + cbq_link_class(&q->link); + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + cbq_addprio(q, &q->link); + return 0; +} + +static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + + RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_lssopt opt; + + opt.flags = 0; + if (cl->borrow == NULL) + opt.flags |= TCF_CBQ_LSS_BOUNDED; + if (cl->share == NULL) + opt.flags |= TCF_CBQ_LSS_ISOLATED; + opt.ewma_log = cl->ewma_log; + opt.level = cl->level; + opt.avpkt = cl->avpkt; + opt.maxidle = cl->maxidle; + opt.minidle = (u32)(-cl->minidle); + opt.offtime = cl->offtime; + opt.change = ~0; + RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_wrropt opt; + + opt.flags = 0; + opt.allot = cl->allot; + opt.priority = cl->priority+1; + opt.cpriority = cl->cpriority+1; + opt.weight = cl->weight; + RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_ovl opt; + + opt.strategy = cl->ovl_strategy; + opt.priority2 = cl->priority2+1; + opt.penalty = (cl->penalty*1000)/HZ; + RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_fopt opt; + + if (cl->split || cl->defmap) { + opt.split = cl->split ? cl->split->classid : 0; + opt.defmap = cl->defmap; + opt.defchange = ~0; + RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NET_CLS_POLICE +static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_police opt; + + if (cl->police) { + opt.police = cl->police; + RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) +{ + if (cbq_dump_lss(skb, cl) < 0 || + cbq_dump_rate(skb, cl) < 0 || + cbq_dump_wrr(skb, cl) < 0 || + cbq_dump_ovl(skb, cl) < 0 || +#ifdef CONFIG_NET_CLS_POLICE + cbq_dump_police(skb, cl) < 0 || +#endif + cbq_dump_fopt(skb, cl) < 0) + return -1; + return 0; +} + +static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, &q->link) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + + q->link.xstats.avgidle = q->link.avgidle; + return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); +} + +static int +cbq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (cl->tparent) + tcm->tcm_parent = cl->tparent->classid; + else + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->classid; + tcm->tcm_info = cl->q->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)arg; + + cl->qstats.qlen = cl->q->q.qlen; + cl->xstats.avgidle = cl->avgidle; + cl->xstats.undertime = 0; + + if (!PSCHED_IS_PASTPERFECT(cl->undertime)) + cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || +#endif + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); +} + +static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl) { + if (new == NULL) { + if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_POLICE + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; +#endif + } + sch_tree_lock(sch); + *old = cl->q; + cl->q = new; + sch->q.qlen -= (*old)->q.qlen; + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; + } + return -ENOENT; +} + +static struct Qdisc * +cbq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + return cl ? cl->q : NULL; +} + +static unsigned long cbq_get(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + cl->refcnt++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_destroy_filters(struct cbq_class *cl) +{ + struct tcf_proto *tp; + + while ((tp = cl->filter_list) != NULL) { + cl->filter_list = tp->next; + tcf_destroy(tp); + } +} + +static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + + BUG_TRAP(!cl->filters); + + cbq_destroy_filters(cl); + qdisc_destroy(cl->q); + qdisc_put_rtab(cl->R_tab); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&cl->bstats, &cl->rate_est); +#endif + if (cl != &q->link) + kfree(cl); +} + +static void +cbq_destroy(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + unsigned h; + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = NULL; +#endif + /* + * Filters must be destroyed first because we don't destroy the + * classes from root to leafs which means that filters can still + * be bound to classes which have been destroyed already. --TGR '04 + */ + for (h = 0; h < 16; h++) + for (cl = q->classes[h]; cl; cl = cl->next) + cbq_destroy_filters(cl); + + for (h = 0; h < 16; h++) { + struct cbq_class *next; + + for (cl = q->classes[h]; cl; cl = next) { + next = cl->next; + cbq_destroy_class(sch, cl); + } + } +} + +static void cbq_put(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (--cl->refcnt == 0) { +#ifdef CONFIG_NET_CLS_POLICE + struct cbq_sched_data *q = qdisc_priv(sch); + + spin_lock_bh(&sch->dev->queue_lock); + if (q->rx_class == cl) + q->rx_class = NULL; + spin_unlock_bh(&sch->dev->queue_lock); +#endif + + cbq_destroy_class(sch, cl); + } +} + +static int +cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, + unsigned long *arg) +{ + int err; + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)*arg; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CBQ_MAX]; + struct cbq_class *parent; + struct qdisc_rate_table *rtab = NULL; + + if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt)) + return -EINVAL; + + if (tb[TCA_CBQ_OVL_STRATEGY-1] && + RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) + return -EINVAL; + + if (tb[TCA_CBQ_FOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) + return -EINVAL; + + if (tb[TCA_CBQ_RATE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) + return -EINVAL; + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) + return -EINVAL; +#endif + + if (cl) { + /* Check parent */ + if (parentid) { + if (cl->tparent && cl->tparent->classid != parentid) + return -EINVAL; + if (!cl->tparent && parentid != TC_H_ROOT) + return -EINVAL; + } + + if (tb[TCA_CBQ_RATE-1]) { + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + } + + /* Change class parameters */ + sch_tree_lock(sch); + + if (cl->next_alive != NULL) + cbq_deactivate_class(cl); + + if (rtab) { + rtab = xchg(&cl->R_tab, rtab); + qdisc_put_rtab(rtab); + } + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + if (tb[TCA_CBQ_WRROPT-1]) { + cbq_rmprio(q, cl); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + } + + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + + if (cl->q->q.qlen) + cbq_activate_class(cl); + + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_replace_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + tb[TCA_CBQ_LSSOPT-1] == NULL) + return -EINVAL; + + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + + if (classid) { + err = -EINVAL; + if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + goto failure; + } else { + int i; + classid = TC_H_MAKE(sch->handle,0x8000); + + for (i=0; i<0x8000; i++) { + if (++q->hgenerator >= 0x8000) + q->hgenerator = 1; + if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) + break; + } + err = -ENOSR; + if (i >= 0x8000) + goto failure; + classid = classid|q->hgenerator; + } + + parent = &q->link; + if (parentid) { + parent = cbq_class_lookup(q, parentid); + err = -EINVAL; + if (parent == NULL) + goto failure; + } + + err = -ENOBUFS; + cl = kmalloc(sizeof(*cl), GFP_KERNEL); + if (cl == NULL) + goto failure; + memset(cl, 0, sizeof(*cl)); + cl->R_tab = rtab; + rtab = NULL; + cl->refcnt = 1; + if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->q = &noop_qdisc; + cl->classid = classid; + cl->tparent = parent; + cl->qdisc = sch; + cl->allot = parent->allot; + cl->quantum = cl->allot; + cl->weight = cl->R_tab->rate.rate; + cl->stats_lock = &sch->dev->queue_lock; + + sch_tree_lock(sch); + cbq_link_class(cl); + cl->borrow = cl->tparent; + if (cl->tparent != &q->link) + cl->share = cl->tparent; + cbq_adjust_levels(parent); + cl->minidle = -0x7FFFFFFF; + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + if (cl->ewma_log==0) + cl->ewma_log = q->link.ewma_log; + if (cl->maxidle==0) + cl->maxidle = q->link.maxidle; + if (cl->avpkt==0) + cl->avpkt = q->link.avpkt; + cl->overlimit = cbq_ovl_classic; + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_new_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + + *arg = (unsigned long)cl; + return 0; + +failure: + qdisc_put_rtab(rtab); + return err; +} + +static int cbq_delete(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl->filters || cl->children || cl == &q->link) + return -EBUSY; + + sch_tree_lock(sch); + + if (cl->next_alive) + cbq_deactivate_class(cl); + + if (q->tx_borrowed == cl) + q->tx_borrowed = q->tx_class; + if (q->tx_class == cl) { + q->tx_class = NULL; + q->tx_borrowed = NULL; + } +#ifdef CONFIG_NET_CLS_POLICE + if (q->rx_class == cl) + q->rx_class = NULL; +#endif + + cbq_unlink_class(cl); + cbq_adjust_levels(cl->tparent); + cl->defmap = 0; + cbq_sync_defmap(cl); + + cbq_rmprio(q, cl); + sch_tree_unlock(sch); + + if (--cl->refcnt == 0) + cbq_destroy_class(sch, cl); + + return 0; +} + +static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class *)arg; + + if (cl == NULL) + cl = &q->link; + + return &cl->filter_list; +} + +static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *p = (struct cbq_class*)parent; + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + if (p && p->level <= cl->level) + return 0; + cl->filters++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + cl->filters--; +} + +static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + unsigned h; + + if (arg->stop) + return; + + for (h = 0; h < 16; h++) { + struct cbq_class *cl; + + for (cl = q->classes[h]; cl; cl = cl->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct Qdisc_class_ops cbq_class_ops = { + .graft = cbq_graft, + .leaf = cbq_leaf, + .get = cbq_get, + .put = cbq_put, + .change = cbq_change_class, + .delete = cbq_delete, + .walk = cbq_walk, + .tcf_chain = cbq_find_tcf, + .bind_tcf = cbq_bind_filter, + .unbind_tcf = cbq_unbind_filter, + .dump = cbq_dump_class, + .dump_stats = cbq_dump_class_stats, +}; + +static struct Qdisc_ops cbq_qdisc_ops = { + .next = NULL, + .cl_ops = &cbq_class_ops, + .id = "cbq", + .priv_size = sizeof(struct cbq_sched_data), + .enqueue = cbq_enqueue, + .dequeue = cbq_dequeue, + .requeue = cbq_requeue, + .drop = cbq_drop, + .init = cbq_init, + .reset = cbq_reset, + .destroy = cbq_destroy, + .change = NULL, + .dump = cbq_dump, + .dump_stats = cbq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init cbq_module_init(void) +{ + return register_qdisc(&cbq_qdisc_ops); +} +static void __exit cbq_module_exit(void) +{ + unregister_qdisc(&cbq_qdisc_ops); +} +module_init(cbq_module_init) +module_exit(cbq_module_exit) +MODULE_LICENSE("GPL"); |