diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-06 17:22:09 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-06 17:22:09 -0800 |
commit | 9753dfe19a85e7e45a34a56f4cb2048bb4f50e27 (patch) | |
tree | c017a1b4a70b8447c71b01d8b320e071546b5c9d /net/sched | |
parent | edf7c8148ec40c0fd27c0ef3f688defcc65e3913 (diff) | |
parent | 9f42f126154786e6e76df513004800c8c633f020 (diff) | |
download | linux-9753dfe19a85e7e45a34a56f4cb2048bb4f50e27.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1958 commits)
net: pack skb_shared_info more efficiently
net_sched: red: split red_parms into parms and vars
net_sched: sfq: extend limits
cnic: Improve error recovery on bnx2x devices
cnic: Re-init dev->stats_addr after chip reset
net_sched: Bug in netem reordering
bna: fix sparse warnings/errors
bna: make ethtool_ops and strings const
xgmac: cleanups
net: make ethtool_ops const
vmxnet3" make ethtool ops const
xen-netback: make ops structs const
virtio_net: Pass gfp flags when allocating rx buffers.
ixgbe: FCoE: Add support for ndo_get_fcoe_hbainfo() call
netdev: FCoE: Add new ndo_get_fcoe_hbainfo() call
igb: reset PHY after recovering from PHY power down
igb: add basic runtime PM support
igb: Add support for byte queue limits.
e1000: cleanup CE4100 MDIO registers access
e1000: unmap ce4100_gbe_mdio_base_virt in e1000_remove
...
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/cls_flow.c | 182 | ||||
-rw-r--r-- | net/sched/sch_api.c | 14 | ||||
-rw-r--r-- | net/sched/sch_choke.c | 161 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 9 | ||||
-rw-r--r-- | net/sched/sch_gred.c | 81 | ||||
-rw-r--r-- | net/sched/sch_hfsc.c | 10 | ||||
-rw-r--r-- | net/sched/sch_multiq.c | 6 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 278 | ||||
-rw-r--r-- | net/sched/sch_qfq.c | 17 | ||||
-rw-r--r-- | net/sched/sch_red.c | 57 | ||||
-rw-r--r-- | net/sched/sch_sfb.c | 17 | ||||
-rw-r--r-- | net/sched/sch_sfq.c | 369 | ||||
-rw-r--r-- | net/sched/sch_tbf.c | 1 | ||||
-rw-r--r-- | net/sched/sch_teql.c | 8 |
14 files changed, 648 insertions, 562 deletions
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 7b582300d051..1d8bd0dbcd1f 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -26,6 +26,8 @@ #include <net/pkt_cls.h> #include <net/ip.h> #include <net/route.h> +#include <net/flow_keys.h> + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netfilter/nf_conntrack.h> #endif @@ -66,134 +68,37 @@ static inline u32 addr_fold(void *addr) return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0); } -static u32 flow_get_src(const struct sk_buff *skb, int nhoff) +static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - __be32 *data = NULL, hdata; - - switch (skb->protocol) { - case htons(ETH_P_IP): - data = skb_header_pointer(skb, - nhoff + offsetof(struct iphdr, - saddr), - 4, &hdata); - break; - case htons(ETH_P_IPV6): - data = skb_header_pointer(skb, - nhoff + offsetof(struct ipv6hdr, - saddr.s6_addr32[3]), - 4, &hdata); - break; - } - - if (data) - return ntohl(*data); + if (flow->src) + return ntohl(flow->src); return addr_fold(skb->sk); } -static u32 flow_get_dst(const struct sk_buff *skb, int nhoff) +static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - __be32 *data = NULL, hdata; - - switch (skb->protocol) { - case htons(ETH_P_IP): - data = skb_header_pointer(skb, - nhoff + offsetof(struct iphdr, - daddr), - 4, &hdata); - break; - case htons(ETH_P_IPV6): - data = skb_header_pointer(skb, - nhoff + offsetof(struct ipv6hdr, - daddr.s6_addr32[3]), - 4, &hdata); - break; - } - - if (data) - return ntohl(*data); + if (flow->dst) + return ntohl(flow->dst); return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; } -static u32 flow_get_proto(const struct sk_buff *skb, int nhoff) +static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { - __u8 *data = NULL, hdata; - - switch (skb->protocol) { - case htons(ETH_P_IP): - data = skb_header_pointer(skb, - nhoff + offsetof(struct iphdr, - protocol), - 1, &hdata); - break; - case htons(ETH_P_IPV6): - data = skb_header_pointer(skb, - nhoff + offsetof(struct ipv6hdr, - nexthdr), - 1, &hdata); - break; - } - if (data) - return *data; - return 0; + return flow->ip_proto; } -/* helper function to get either src or dst port */ -static __be16 *flow_get_proto_common(const struct sk_buff *skb, int nhoff, - __be16 *_port, int dst) +static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { - __be16 *port = NULL; - int poff; - - switch (skb->protocol) { - case htons(ETH_P_IP): { - struct iphdr *iph, _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (!iph) - break; - if (ip_is_fragment(iph)) - break; - poff = proto_ports_offset(iph->protocol); - if (poff >= 0) - port = skb_header_pointer(skb, - nhoff + iph->ihl * 4 + poff + dst, - sizeof(*_port), _port); - break; - } - case htons(ETH_P_IPV6): { - struct ipv6hdr *iph, _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (!iph) - break; - poff = proto_ports_offset(iph->nexthdr); - if (poff >= 0) - port = skb_header_pointer(skb, - nhoff + sizeof(*iph) + poff + dst, - sizeof(*_port), _port); - break; - } - } - - return port; -} - -static u32 flow_get_proto_src(const struct sk_buff *skb, int nhoff) -{ - __be16 _port, *port = flow_get_proto_common(skb, nhoff, &_port, 0); - - if (port) - return ntohs(*port); + if (flow->ports) + return ntohs(flow->port16[0]); return addr_fold(skb->sk); } -static u32 flow_get_proto_dst(const struct sk_buff *skb, int nhoff) +static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - __be16 _port, *port = flow_get_proto_common(skb, nhoff, &_port, 2); - - if (port) - return ntohs(*port); + if (flow->ports) + return ntohs(flow->port16[1]); return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; } @@ -239,7 +144,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb) }) #endif -static u32 flow_get_nfct_src(const struct sk_buff *skb, int nhoff) +static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) { switch (skb->protocol) { case htons(ETH_P_IP): @@ -248,10 +153,10 @@ static u32 flow_get_nfct_src(const struct sk_buff *skb, int nhoff) return ntohl(CTTUPLE(skb, src.u3.ip6[3])); } fallback: - return flow_get_src(skb, nhoff); + return flow_get_src(skb, flow); } -static u32 flow_get_nfct_dst(const struct sk_buff *skb, int nhoff) +static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) { switch (skb->protocol) { case htons(ETH_P_IP): @@ -260,21 +165,21 @@ static u32 flow_get_nfct_dst(const struct sk_buff *skb, int nhoff) return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); } fallback: - return flow_get_dst(skb, nhoff); + return flow_get_dst(skb, flow); } -static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, int nhoff) +static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, src.u.all)); fallback: - return flow_get_proto_src(skb, nhoff); + return flow_get_proto_src(skb, flow); } -static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, int nhoff) +static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, dst.u.all)); fallback: - return flow_get_proto_dst(skb, nhoff); + return flow_get_proto_dst(skb, flow); } static u32 flow_get_rtclassid(const struct sk_buff *skb) @@ -314,21 +219,19 @@ static u32 flow_get_rxhash(struct sk_buff *skb) return skb_get_rxhash(skb); } -static u32 flow_key_get(struct sk_buff *skb, int key) +static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) { - int nhoff = skb_network_offset(skb); - switch (key) { case FLOW_KEY_SRC: - return flow_get_src(skb, nhoff); + return flow_get_src(skb, flow); case FLOW_KEY_DST: - return flow_get_dst(skb, nhoff); + return flow_get_dst(skb, flow); case FLOW_KEY_PROTO: - return flow_get_proto(skb, nhoff); + return flow_get_proto(skb, flow); case FLOW_KEY_PROTO_SRC: - return flow_get_proto_src(skb, nhoff); + return flow_get_proto_src(skb, flow); case FLOW_KEY_PROTO_DST: - return flow_get_proto_dst(skb, nhoff); + return flow_get_proto_dst(skb, flow); case FLOW_KEY_IIF: return flow_get_iif(skb); case FLOW_KEY_PRIORITY: @@ -338,13 +241,13 @@ static u32 flow_key_get(struct sk_buff *skb, int key) case FLOW_KEY_NFCT: return flow_get_nfct(skb); case FLOW_KEY_NFCT_SRC: - return flow_get_nfct_src(skb, nhoff); + return flow_get_nfct_src(skb, flow); case FLOW_KEY_NFCT_DST: - return flow_get_nfct_dst(skb, nhoff); + return flow_get_nfct_dst(skb, flow); case FLOW_KEY_NFCT_PROTO_SRC: - return flow_get_nfct_proto_src(skb, nhoff); + return flow_get_nfct_proto_src(skb, flow); case FLOW_KEY_NFCT_PROTO_DST: - return flow_get_nfct_proto_dst(skb, nhoff); + return flow_get_nfct_proto_dst(skb, flow); case FLOW_KEY_RTCLASSID: return flow_get_rtclassid(skb); case FLOW_KEY_SKUID: @@ -361,6 +264,16 @@ static u32 flow_key_get(struct sk_buff *skb, int key) } } +#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \ + (1 << FLOW_KEY_DST) | \ + (1 << FLOW_KEY_PROTO) | \ + (1 << FLOW_KEY_PROTO_SRC) | \ + (1 << FLOW_KEY_PROTO_DST) | \ + (1 << FLOW_KEY_NFCT_SRC) | \ + (1 << FLOW_KEY_NFCT_DST) | \ + (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ + (1 << FLOW_KEY_NFCT_PROTO_DST)) + static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -372,17 +285,20 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, int r; list_for_each_entry(f, &head->filters, list) { - u32 keys[f->nkeys]; + u32 keys[FLOW_KEY_MAX + 1]; + struct flow_keys flow_keys; if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; keymask = f->keymask; + if (keymask & FLOW_KEYS_NEEDED) + skb_flow_dissect(skb, &flow_keys); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; keymask &= ~(1 << key); - keys[n] = flow_key_get(skb, key); + keys[n] = flow_key_get(skb, key, &flow_keys); } if (f->mode == FLOW_MODE_HASH) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index dca6c1a576f7..3d8981fde301 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -618,20 +618,24 @@ void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, } EXPORT_SYMBOL(qdisc_class_hash_remove); -/* Allocate an unique handle from space managed by kernel */ - +/* Allocate an unique handle from space managed by kernel + * Possible range is [8000-FFFF]:0000 (0x8000 values) + */ static u32 qdisc_alloc_handle(struct net_device *dev) { - int i = 0x10000; + int i = 0x8000; static u32 autohandle = TC_H_MAKE(0x80000000U, 0); do { autohandle += TC_H_MAKE(0x10000U, 0); if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) autohandle = TC_H_MAKE(0x80000000U, 0); - } while (qdisc_lookup(dev, autohandle) && --i > 0); + if (!qdisc_lookup(dev, autohandle)) + return autohandle; + cond_resched(); + } while (--i > 0); - return i > 0 ? autohandle : 0; + return 0; } void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 3422b25df9e4..e465064d39a3 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -19,10 +19,7 @@ #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/red.h> -#include <linux/ip.h> -#include <net/ip.h> -#include <linux/ipv6.h> -#include <net/ipv6.h> +#include <net/flow_keys.h> /* CHOKe stateless AQM for fair bandwidth allocation @@ -60,6 +57,7 @@ struct choke_sched_data { struct red_parms parms; /* Variables */ + struct red_vars vars; struct tcf_proto *filter_list; struct { u32 prob_drop; /* Early probability drops */ @@ -142,85 +140,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) --sch->q.qlen; } -/* - * Compare flow of two packets - * Returns true only if source and destination address and port match. - * false for special cases - */ -static bool choke_match_flow(struct sk_buff *skb1, - struct sk_buff *skb2) -{ - int off1, off2, poff; - const u32 *ports1, *ports2; - u8 ip_proto; - __u32 hash1; - - if (skb1->protocol != skb2->protocol) - return false; - - /* Use hash value as quick check - * Assumes that __skb_get_rxhash makes IP header and ports linear - */ - hash1 = skb_get_rxhash(skb1); - if (!hash1 || hash1 != skb_get_rxhash(skb2)) - return false; - - /* Probably match, but be sure to avoid hash collisions */ - off1 = skb_network_offset(skb1); - off2 = skb_network_offset(skb2); - - switch (skb1->protocol) { - case __constant_htons(ETH_P_IP): { - const struct iphdr *ip1, *ip2; - - ip1 = (const struct iphdr *) (skb1->data + off1); - ip2 = (const struct iphdr *) (skb2->data + off2); - - ip_proto = ip1->protocol; - if (ip_proto != ip2->protocol || - ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr) - return false; - - if (ip_is_fragment(ip1) | ip_is_fragment(ip2)) - ip_proto = 0; - off1 += ip1->ihl * 4; - off2 += ip2->ihl * 4; - break; - } - - case __constant_htons(ETH_P_IPV6): { - const struct ipv6hdr *ip1, *ip2; - - ip1 = (const struct ipv6hdr *) (skb1->data + off1); - ip2 = (const struct ipv6hdr *) (skb2->data + off2); - - ip_proto = ip1->nexthdr; - if (ip_proto != ip2->nexthdr || - ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) || - ipv6_addr_cmp(&ip1->daddr, &ip2->daddr)) - return false; - off1 += 40; - off2 += 40; - } - - default: /* Maybe compare MAC header here? */ - return false; - } - - poff = proto_ports_offset(ip_proto); - if (poff < 0) - return true; - - off1 += poff; - off2 += poff; - - ports1 = (__force u32 *)(skb1->data + off1); - ports2 = (__force u32 *)(skb2->data + off2); - return *ports1 == *ports2; -} - struct choke_skb_cb { - u16 classid; + u16 classid; + u8 keys_valid; + struct flow_keys keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -241,6 +164,32 @@ static u16 choke_get_classid(const struct sk_buff *skb) } /* + * Compare flow of two packets + * Returns true only if source and destination address and port match. + * false for special cases + */ +static bool choke_match_flow(struct sk_buff *skb1, + struct sk_buff *skb2) +{ + if (skb1->protocol != skb2->protocol) + return false; + + if (!choke_skb_cb(skb1)->keys_valid) { + choke_skb_cb(skb1)->keys_valid = 1; + skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys); + } + + if (!choke_skb_cb(skb2)->keys_valid) { + choke_skb_cb(skb2)->keys_valid = 1; + skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys); + } + + return !memcmp(&choke_skb_cb(skb1)->keys, + &choke_skb_cb(skb2)->keys, + sizeof(struct flow_keys)); +} + +/* * Classify flow using either: * 1. pre-existing classification result in skb * 2. fast internal classification @@ -317,7 +266,7 @@ static bool choke_match_random(const struct choke_sched_data *q, static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); - struct red_parms *p = &q->parms; + const struct red_parms *p = &q->parms; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (q->filter_list) { @@ -326,14 +275,15 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) goto other_drop; /* Packet was eaten by filter */ } + choke_skb_cb(skb)->keys_valid = 0; /* Compute average queue usage (see RED) */ - p->qavg = red_calc_qavg(p, sch->q.qlen); - if (red_is_idling(p)) - red_end_of_idle_period(p); + q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen); + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); /* Is queue small? */ - if (p->qavg <= p->qth_min) - p->qcount = -1; + if (q->vars.qavg <= p->qth_min) + q->vars.qcount = -1; else { unsigned int idx; @@ -345,8 +295,8 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) } /* Queue is large, always mark/drop */ - if (p->qavg > p->qth_max) { - p->qcount = -1; + if (q->vars.qavg > p->qth_max) { + q->vars.qcount = -1; sch->qstats.overlimits++; if (use_harddrop(q) || !use_ecn(q) || @@ -356,10 +306,10 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q->stats.forced_mark++; - } else if (++p->qcount) { - if (red_mark_probability(p, p->qavg)) { - p->qcount = 0; - p->qR = red_random(p); + } else if (++q->vars.qcount) { + if (red_mark_probability(p, &q->vars, q->vars.qavg)) { + q->vars.qcount = 0; + q->vars.qR = red_random(p); sch->qstats.overlimits++; if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { @@ -370,7 +320,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->stats.prob_mark++; } } else - p->qR = red_random(p); + q->vars.qR = red_random(p); } /* Admit new packet */ @@ -404,8 +354,8 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch) struct sk_buff *skb; if (q->head == q->tail) { - if (!red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); return NULL; } @@ -428,8 +378,8 @@ static unsigned int choke_drop(struct Qdisc *sch) if (len > 0) q->stats.other++; else { - if (!red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); } return len; @@ -439,12 +389,13 @@ static void choke_reset(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); - red_restart(&q->parms); + red_restart(&q->vars); } static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE }, + [TCA_CHOKE_MAX_P] = { .type = NLA_U32 }, }; @@ -466,6 +417,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) int err; struct sk_buff **old = NULL; unsigned int mask; + u32 max_P; if (opt == NULL) return -EINVAL; @@ -478,6 +430,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) tb[TCA_CHOKE_STAB] == NULL) return -EINVAL; + max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; + ctl = nla_data(tb[TCA_CHOKE_PARMS]); if (ctl->limit > CHOKE_MAX_QUEUE) @@ -527,10 +481,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, - nla_data(tb[TCA_CHOKE_STAB])); + nla_data(tb[TCA_CHOKE_STAB]), + max_P); + red_set_vars(&q->vars); if (q->head == q->tail) - red_end_of_idle_period(&q->parms); + red_end_of_idle_period(&q->vars); sch_tree_unlock(sch); choke_free(old); @@ -561,6 +517,7 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt); + NLA_PUT_U32(skb, TCA_CHOKE_MAX_P, q->parms.max_P); return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 69fca2798804..67fc573e013a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -60,7 +60,7 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) /* check the reason of requeuing without tx lock first */ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - if (!netif_tx_queue_frozen_or_stopped(txq)) { + if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; } else @@ -121,7 +121,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, spin_unlock(root_lock); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_tx_queue_frozen_or_stopped(txq)) + if (!netif_xmit_frozen_or_stopped(txq)) ret = dev_hard_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); @@ -143,7 +143,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, ret = dev_requeue_skb(skb, q); } - if (ret && netif_tx_queue_frozen_or_stopped(txq)) + if (ret && netif_xmit_frozen_or_stopped(txq)) ret = 0; return ret; @@ -242,10 +242,11 @@ static void dev_watchdog(unsigned long arg) * old device drivers set dev->trans_start */ trans_start = txq->trans_start ? : dev->trans_start; - if (netif_tx_queue_stopped(txq) && + if (netif_xmit_stopped(txq) && time_after(jiffies, (trans_start + dev->watchdog_timeo))) { some_queue_timedout = 1; + txq->trans_timeout++; break; } } diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 6cd8ddfb512d..0b15236be7b6 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -34,13 +34,14 @@ struct gred_sched; struct gred_sched_data { u32 limit; /* HARD maximal queue length */ - u32 DP; /* the drop pramaters */ + u32 DP; /* the drop parameters */ u32 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ u8 prio; /* the prio of this vq */ struct red_parms parms; + struct red_vars vars; struct red_stats stats; }; @@ -55,7 +56,7 @@ struct gred_sched { u32 red_flags; u32 DPs; u32 def; - struct red_parms wred_set; + struct red_vars wred_set; }; static inline int gred_wred_mode(struct gred_sched *table) @@ -125,17 +126,17 @@ static inline u16 tc_index_to_dp(struct sk_buff *skb) return skb->tc_index & GRED_VQ_MASK; } -static inline void gred_load_wred_set(struct gred_sched *table, +static inline void gred_load_wred_set(const struct gred_sched *table, struct gred_sched_data *q) { - q->parms.qavg = table->wred_set.qavg; - q->parms.qidlestart = table->wred_set.qidlestart; + q->vars.qavg = table->wred_set.qavg; + q->vars.qidlestart = table->wred_set.qidlestart; } static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { - table->wred_set.qavg = q->parms.qavg; + table->wred_set.qavg = q->vars.qavg; } static inline int gred_use_ecn(struct gred_sched *t) @@ -170,7 +171,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) goto drop; } - /* fix tc_index? --could be controvesial but needed for + /* fix tc_index? --could be controversial but needed for requeueing */ skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } @@ -181,8 +182,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) for (i = 0; i < t->DPs; i++) { if (t->tab[i] && t->tab[i]->prio < q->prio && - !red_is_idling(&t->tab[i]->parms)) - qavg += t->tab[i]->parms.qavg; + !red_is_idling(&t->tab[i]->vars)) + qavg += t->tab[i]->vars.qavg; } } @@ -193,15 +194,17 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (gred_wred_mode(t)) gred_load_wred_set(t, q); - q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch)); + q->vars.qavg = red_calc_qavg(&q->parms, + &q->vars, + gred_backlog(t, q, sch)); - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); if (gred_wred_mode(t)) gred_store_wred_set(t, q); - switch (red_action(&q->parms, q->parms.qavg + qavg)) { + switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) { case RED_DONT_MARK: break; @@ -260,7 +263,7 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) q->backlog -= qdisc_pkt_len(skb); if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->parms); + red_start_of_idle_period(&q->vars); } return skb; @@ -293,7 +296,7 @@ static unsigned int gred_drop(struct Qdisc *sch) q->stats.other++; if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->parms); + red_start_of_idle_period(&q->vars); } qdisc_drop(skb, sch); @@ -320,7 +323,7 @@ static void gred_reset(struct Qdisc *sch) if (!q) continue; - red_restart(&q->parms); + red_restart(&q->vars); q->backlog = 0; } } @@ -379,29 +382,31 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) } static inline int gred_change_vq(struct Qdisc *sch, int dp, - struct tc_gred_qopt *ctl, int prio, u8 *stab) + struct tc_gred_qopt *ctl, int prio, + u8 *stab, u32 max_P, + struct gred_sched_data **prealloc) { struct gred_sched *table = qdisc_priv(sch); - struct gred_sched_data *q; + struct gred_sched_data *q = table->tab[dp]; - if (table->tab[dp] == NULL) { - table->tab[dp] = kzalloc(sizeof(*q), GFP_ATOMIC); - if (table->tab[dp] == NULL) + if (!q) { + table->tab[dp] = q = *prealloc; + *prealloc = NULL; + if (!q) return -ENOMEM; } - q = table->tab[dp]; q->DP = dp; q->prio = prio; q->limit = ctl->limit; if (q->backlog == 0) - red_end_of_idle_period(&q->parms); + red_end_of_idle_period(&q->vars); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, - ctl->Scell_log, stab); - + ctl->Scell_log, stab, max_P); + red_set_vars(&q->vars); return 0; } @@ -409,6 +414,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, + [TCA_GRED_MAX_P] = { .type = NLA_U32 }, }; static int gred_change(struct Qdisc *sch, struct nlattr *opt) @@ -418,6 +424,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) struct nlattr *tb[TCA_GRED_MAX + 1]; int err, prio = GRED_DEF_PRIO; u8 *stab; + u32 max_P; + struct gred_sched_data *prealloc; if (opt == NULL) return -EINVAL; @@ -433,6 +441,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) tb[TCA_GRED_STAB] == NULL) return -EINVAL; + max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; + err = -EINVAL; ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); @@ -455,9 +465,10 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) prio = ctl->prio; } + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); sch_tree_lock(sch); - err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc); if (err < 0) goto errout_locked; @@ -471,6 +482,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) errout_locked: sch_tree_unlock(sch); + kfree(prealloc); errout: return err; } @@ -498,6 +510,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) struct gred_sched *table = qdisc_priv(sch); struct nlattr *parms, *opts = NULL; int i; + u32 max_p[MAX_DPs]; struct tc_gred_sopt sopt = { .DPs = table->DPs, .def_DP = table->def, @@ -509,6 +522,14 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + + max_p[i] = q ? q->parms.max_P : 0; + } + NLA_PUT(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p); + parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; @@ -545,12 +566,12 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.bytesin = q->bytesin; if (gred_wred_mode(table)) { - q->parms.qidlestart = - table->tab[table->def]->parms.qidlestart; - q->parms.qavg = table->tab[table->def]->parms.qavg; + q->vars.qidlestart = + table->tab[table->def]->vars.qidlestart; + q->vars.qavg = table->tab[table->def]->vars.qavg; } - opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); + opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg); append_opt: if (nla_append(skb, sizeof(opt), &opt) < 0) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 6488e6425652..9bdca2e011e9 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1368,6 +1368,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct tc_hfsc_stats xstats; cl->qstats.qlen = cl->qdisc->q.qlen; + cl->qstats.backlog = cl->qdisc->qstats.backlog; xstats.level = cl->level; xstats.period = cl->cl_vtperiod; xstats.work = cl->cl_total; @@ -1561,6 +1562,15 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) struct hfsc_sched *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; + struct hfsc_class *cl; + struct hlist_node *n; + unsigned int i; + + sch->qstats.backlog = 0; + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + sch->qstats.backlog += cl->qdisc->qstats.backlog; + } qopt.defcls = q->defcls; NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index edc1950e0e77..49131d7a7446 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -107,7 +107,8 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch) /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ - if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) { + if (!netif_xmit_stopped( + netdev_get_tx_queue(qdisc_dev(sch), q->curband))) { qdisc = q->queues[q->curband]; skb = qdisc->dequeue(qdisc); if (skb) { @@ -138,7 +139,8 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch) /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ - if (!__netif_subqueue_stopped(qdisc_dev(sch), curband)) { + if (!netif_xmit_stopped( + netdev_get_tx_queue(qdisc_dev(sch), curband))) { qdisc = q->queues[curband]; skb = qdisc->ops->peek(qdisc); if (skb) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index a4ab207cdc59..e7e1d0b57b3d 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -22,6 +22,7 @@ #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/rtnetlink.h> +#include <linux/reciprocal_div.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -66,7 +67,11 @@ */ struct netem_sched_data { + /* internal t(ime)fifo qdisc uses sch->q and sch->limit */ + + /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; + struct qdisc_watchdog watchdog; psched_tdiff_t latency; @@ -79,6 +84,11 @@ struct netem_sched_data { u32 duplicate; u32 reorder; u32 corrupt; + u32 rate; + s32 packet_overhead; + u32 cell_size; + u32 cell_size_reciprocal; + s32 cell_overhead; struct crndstate { u32 last; @@ -111,7 +121,9 @@ struct netem_sched_data { }; -/* Time stamp put into socket buffer control block */ +/* Time stamp put into socket buffer control block + * Only valid when skbs are in our internal t(ime)fifo queue. + */ struct netem_skb_cb { psched_time_t time_to_send; }; @@ -298,6 +310,51 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } +static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) +{ + u64 ticks; + + len += q->packet_overhead; + + if (q->cell_size) { + u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); + + if (len > cells * q->cell_size) /* extra cell needed for remainder */ + cells++; + len = cells * (q->cell_size + q->cell_overhead); + } + + ticks = (u64)len * NSEC_PER_SEC; + + do_div(ticks, q->rate); + return PSCHED_NS2TICKS(ticks); +} + +static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +{ + struct sk_buff_head *list = &sch->q; + psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; + struct sk_buff *skb; + + if (likely(skb_queue_len(list) < sch->limit)) { + skb = skb_peek_tail(list); + /* Optimize for add at tail */ + if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send)) + return qdisc_enqueue_tail(nskb, sch); + + skb_queue_reverse_walk(list, skb) { + if (tnext >= netem_skb_cb(skb)->time_to_send) + break; + } + + __skb_queue_after(list, skb, nskb); + sch->qstats.backlog += qdisc_pkt_len(nskb); + return NET_XMIT_SUCCESS; + } + + return qdisc_reshape_fail(nskb, sch); +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -371,9 +428,27 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) &q->delay_cor, q->delay_dist); now = psched_get_time(); + + if (q->rate) { + struct sk_buff_head *list = &sch->q; + + delay += packet_len_2_sched_time(skb->len, q); + + if (!skb_queue_empty(list)) { + /* + * Last packet in queue is reference point (now). + * First packet in queue is already in flight, + * calculate this time bonus and substract + * from delay. + */ + delay -= now - netem_skb_cb(skb_peek(list))->time_to_send; + now = netem_skb_cb(skb_peek_tail(list))->time_to_send; + } + } + cb->time_to_send = now + delay; ++q->counter; - ret = qdisc_enqueue(skb, q->qdisc); + ret = tfifo_enqueue(skb, sch); } else { /* * Do re-ordering by putting one out of N packets at the front @@ -382,9 +457,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) cb->time_to_send = psched_get_time(); q->counter = 0; - __skb_queue_head(&q->qdisc->q, skb); - q->qdisc->qstats.backlog += qdisc_pkt_len(skb); - q->qdisc->qstats.requeues++; + __skb_queue_head(&sch->q, skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + sch->qstats.requeues++; ret = NET_XMIT_SUCCESS; } @@ -395,19 +470,20 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } - sch->q.qlen++; return NET_XMIT_SUCCESS; } static unsigned int netem_drop(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - unsigned int len = 0; + unsigned int len; - if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { - sch->q.qlen--; + len = qdisc_queue_drop(sch); + if (!len && q->qdisc && q->qdisc->ops->drop) + len = q->qdisc->ops->drop(q->qdisc); + if (len) sch->qstats.drops++; - } + return len; } @@ -419,16 +495,16 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) if (qdisc_is_throttled(sch)) return NULL; - skb = q->qdisc->ops->peek(q->qdisc); +tfifo_dequeue: + skb = qdisc_peek_head(sch); if (skb) { const struct netem_skb_cb *cb = netem_skb_cb(skb); - psched_time_t now = psched_get_time(); /* if more time remaining? */ - if (cb->time_to_send <= now) { - skb = qdisc_dequeue_peeked(q->qdisc); + if (cb->time_to_send <= psched_get_time()) { + skb = qdisc_dequeue_tail(sch); if (unlikely(!skb)) - return NULL; + goto qdisc_dequeue; #ifdef CONFIG_NET_CLS_ACT /* @@ -439,15 +515,37 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) skb->tstamp.tv64 = 0; #endif - sch->q.qlen--; + if (q->qdisc) { + int err = qdisc_enqueue(skb, q->qdisc); + + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + sch->qstats.drops++; + qdisc_tree_decrease_qlen(sch, 1); + } + } + goto tfifo_dequeue; + } +deliver: qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; } + if (q->qdisc) { + skb = q->qdisc->ops->dequeue(q->qdisc); + if (skb) + goto deliver; + } qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); } +qdisc_dequeue: + if (q->qdisc) { + skb = q->qdisc->ops->dequeue(q->qdisc); + if (skb) + goto deliver; + } return NULL; } @@ -455,8 +553,9 @@ static void netem_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - qdisc_reset(q->qdisc); - sch->q.qlen = 0; + qdisc_reset_queue(sch); + if (q->qdisc) + qdisc_reset(q->qdisc); qdisc_watchdog_cancel(&q->watchdog); } @@ -536,6 +635,19 @@ static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) init_crandom(&q->corrupt_cor, r->correlation); } +static void get_rate(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_rate *r = nla_data(attr); + + q->rate = r->rate; + q->packet_overhead = r->packet_overhead; + q->cell_size = r->cell_size; + if (q->cell_size) + q->cell_size_reciprocal = reciprocal_value(q->cell_size); + q->cell_overhead = r->cell_overhead; +} + static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); @@ -549,7 +661,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) case NETEM_LOSS_GI: { const struct tc_netem_gimodel *gi = nla_data(la); - if (nla_len(la) != sizeof(struct tc_netem_gimodel)) { + if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { pr_info("netem: incorrect gi model size\n"); return -EINVAL; } @@ -568,8 +680,8 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) case NETEM_LOSS_GE: { const struct tc_netem_gemodel *ge = nla_data(la); - if (nla_len(la) != sizeof(struct tc_netem_gemodel)) { - pr_info("netem: incorrect gi model size\n"); + if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { + pr_info("netem: incorrect ge model size\n"); return -EINVAL; } @@ -595,6 +707,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, + [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, }; @@ -632,11 +745,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (ret < 0) return ret; - ret = fifo_set_limit(q->qdisc, qopt->limit); - if (ret) { - pr_info("netem: can't set fifo limit\n"); - return ret; - } + sch->limit = qopt->limit; q->latency = qopt->latency; q->jitter = qopt->jitter; @@ -667,6 +776,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_NETEM_CORRUPT]) get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); + if (tb[TCA_NETEM_RATE]) + get_rate(sch, tb[TCA_NETEM_RATE]); + q->loss_model = CLG_RANDOM; if (tb[TCA_NETEM_LOSS]) ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); @@ -674,88 +786,6 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) return ret; } -/* - * Special case version of FIFO queue for use by netem. - * It queues in order based on timestamps in skb's - */ -struct fifo_sched_data { - u32 limit; - psched_time_t oldest; -}; - -static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) -{ - struct fifo_sched_data *q = qdisc_priv(sch); - struct sk_buff_head *list = &sch->q; - psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; - struct sk_buff *skb; - - if (likely(skb_queue_len(list) < q->limit)) { - /* Optimize for add at tail */ - if (likely(skb_queue_empty(list) || tnext >= q->oldest)) { - q->oldest = tnext; - return qdisc_enqueue_tail(nskb, sch); - } - - skb_queue_reverse_walk(list, skb) { - const struct netem_skb_cb *cb = netem_skb_cb(skb); - - if (tnext >= cb->time_to_send) - break; - } - - __skb_queue_after(list, skb, nskb); - - sch->qstats.backlog += qdisc_pkt_len(nskb); - - return NET_XMIT_SUCCESS; - } - - return qdisc_reshape_fail(nskb, sch); -} - -static int tfifo_init(struct Qdisc *sch, struct nlattr *opt) -{ - struct fifo_sched_data *q = qdisc_priv(sch); - - if (opt) { - struct tc_fifo_qopt *ctl = nla_data(opt); - if (nla_len(opt) < sizeof(*ctl)) - return -EINVAL; - - q->limit = ctl->limit; - } else - q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); - - q->oldest = PSCHED_PASTPERFECT; - return 0; -} - -static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct fifo_sched_data *q = qdisc_priv(sch); - struct tc_fifo_qopt opt = { .limit = q->limit }; - - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); - return skb->len; - -nla_put_failure: - return -1; -} - -static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = { - .id = "tfifo", - .priv_size = sizeof(struct fifo_sched_data), - .enqueue = tfifo_enqueue, - .dequeue = qdisc_dequeue_head, - .peek = qdisc_peek_head, - .drop = qdisc_queue_drop, - .init = tfifo_init, - .reset = qdisc_reset_queue, - .change = tfifo_init, - .dump = tfifo_dump, -}; - static int netem_init(struct Qdisc *sch, struct nlattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); @@ -767,18 +797,9 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); q->loss_model = CLG_RANDOM; - q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops, - TC_H_MAKE(sch->handle, 1)); - if (!q->qdisc) { - pr_notice("netem: qdisc create tfifo qdisc failed\n"); - return -ENOMEM; - } - ret = netem_change(sch, opt); - if (ret) { + if (ret) pr_info("netem: change failed\n"); - qdisc_destroy(q->qdisc); - } return ret; } @@ -787,7 +808,8 @@ static void netem_destroy(struct Qdisc *sch) struct netem_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); - qdisc_destroy(q->qdisc); + if (q->qdisc) + qdisc_destroy(q->qdisc); dist_free(q->delay_dist); } @@ -847,6 +869,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_corr cor; struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; + struct tc_netem_rate rate; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -869,6 +892,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) corrupt.correlation = q->corrupt_cor.rho; NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + rate.rate = q->rate; + rate.packet_overhead = q->packet_overhead; + rate.cell_size = q->cell_size; + rate.cell_overhead = q->cell_overhead; + NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate); + if (dump_loss_model(q, skb) != 0) goto nla_put_failure; @@ -884,7 +913,7 @@ static int netem_dump_class(struct Qdisc *sch, unsigned long cl, { struct netem_sched_data *q = qdisc_priv(sch); - if (cl != 1) /* only one class */ + if (cl != 1 || !q->qdisc) /* only one class */ return -ENOENT; tcm->tcm_handle |= TC_H_MIN(1); @@ -898,14 +927,13 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, { struct netem_sched_data *q = qdisc_priv(sch); - if (new == NULL) - new = &noop_qdisc; - sch_tree_lock(sch); *old = q->qdisc; q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); + if (*old) { + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + } sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 7b0325459e71..e68cb440756a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -211,6 +211,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr *tb[TCA_QFQ_MAX + 1]; u32 weight, lmax, inv_w; int i, err; + int delta_w; if (tca[TCA_OPTIONS] == NULL) { pr_notice("qfq: no options\n"); @@ -232,9 +233,10 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; - if (q->wsum + weight > QFQ_MAX_WSUM) { + delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0); + if (q->wsum + delta_w > QFQ_MAX_WSUM) { pr_notice("qfq: total weight out of range (%u + %u)\n", - weight, q->wsum); + delta_w, q->wsum); return -EINVAL; } @@ -256,13 +258,12 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return err; } - sch_tree_lock(sch); - if (tb[TCA_QFQ_WEIGHT]) { - q->wsum = weight - ONE_FP / cl->inv_w; + if (inv_w != cl->inv_w) { + sch_tree_lock(sch); + q->wsum += delta_w; cl->inv_w = inv_w; + sch_tree_unlock(sch); } - sch_tree_unlock(sch); - return 0; } @@ -277,7 +278,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, i = qfq_calc_index(cl->inv_w, cl->lmax); cl->grp = &q->groups[i]; - q->wsum += weight; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); @@ -294,6 +294,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return err; } } + q->wsum += weight; sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index d617161f8dd3..a5cc3012cf42 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -39,7 +39,9 @@ struct red_sched_data { u32 limit; /* HARD maximal queue length */ unsigned char flags; + struct timer_list adapt_timer; struct red_parms parms; + struct red_vars vars; struct red_stats stats; struct Qdisc *qdisc; }; @@ -60,12 +62,14 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct Qdisc *child = q->qdisc; int ret; - q->parms.qavg = red_calc_qavg(&q->parms, child->qstats.backlog); + q->vars.qavg = red_calc_qavg(&q->parms, + &q->vars, + child->qstats.backlog); - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); - switch (red_action(&q->parms, q->parms.qavg)) { + switch (red_action(&q->parms, &q->vars, q->vars.qavg)) { case RED_DONT_MARK: break; @@ -116,8 +120,8 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch) qdisc_bstats_update(sch, skb); sch->q.qlen--; } else { - if (!red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); } return skb; } @@ -143,8 +147,8 @@ static unsigned int red_drop(struct Qdisc *sch) return len; } - if (!red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); return 0; } @@ -155,18 +159,21 @@ static void red_reset(struct Qdisc *sch) qdisc_reset(q->qdisc); sch->q.qlen = 0; - red_restart(&q->parms); + red_restart(&q->vars); } static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); + + del_timer_sync(&q->adapt_timer); qdisc_destroy(q->qdisc); } static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, + [TCA_RED_MAX_P] = { .type = NLA_U32 }, }; static int red_change(struct Qdisc *sch, struct nlattr *opt) @@ -176,6 +183,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) struct tc_red_qopt *ctl; struct Qdisc *child = NULL; int err; + u32 max_P; if (opt == NULL) return -EINVAL; @@ -188,6 +196,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) tb[TCA_RED_STAB] == NULL) return -EINVAL; + max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; + ctl = nla_data(tb[TCA_RED_PARMS]); if (ctl->limit > 0) { @@ -205,22 +215,42 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) q->qdisc = child; } - red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, - ctl->Plog, ctl->Scell_log, - nla_data(tb[TCA_RED_STAB])); + red_set_parms(&q->parms, + ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_RED_STAB]), + max_P); + red_set_vars(&q->vars); + + del_timer(&q->adapt_timer); + if (ctl->flags & TC_RED_ADAPTATIVE) + mod_timer(&q->adapt_timer, jiffies + HZ/2); if (!q->qdisc->q.qlen) - red_start_of_idle_period(&q->parms); + red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); return 0; } +static inline void red_adaptative_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct red_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + + spin_lock(root_lock); + red_adaptative_algo(&q->parms, &q->vars); + mod_timer(&q->adapt_timer, jiffies + HZ/2); + spin_unlock(root_lock); +} + static int red_init(struct Qdisc *sch, struct nlattr *opt) { struct red_sched_data *q = qdisc_priv(sch); q->qdisc = &noop_qdisc; + setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch); return red_change(sch, opt); } @@ -243,6 +273,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + NLA_PUT_U32(skb, TCA_RED_MAX_P, q->parms.max_P); return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index e83c272c0325..96e42cae4c7a 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -26,6 +26,7 @@ #include <net/ip.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> +#include <net/flow_keys.h> /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) @@ -286,6 +287,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) u32 minqlen = ~0; u32 r, slot, salt, sfbhash; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { sch->qstats.overlimits++; @@ -309,13 +311,19 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, q, &ret, &salt)) goto other_drop; + keys.src = salt; + keys.dst = 0; + keys.ports = 0; } else { - salt = skb_get_rxhash(skb); + skb_flow_dissect(skb, &keys); } slot = q->slot; - sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + sfbhash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -347,7 +355,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { - sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + sfbhash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 4f5510e2bd6f..0a7964009e8c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -17,14 +17,13 @@ #include <linux/in.h> #include <linux/errno.h> #include <linux/init.h> -#include <linux/ipv6.h> #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <net/ip.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/flow_keys.h> /* Stochastic Fairness Queuing algorithm. @@ -67,16 +66,18 @@ SFQ is superior for this purpose. IMPLEMENTATION: - This implementation limits maximal queue length to 128; - max mtu to 2^18-1; max 128 flows, number of hash buckets to 1024. - The only goal of this restrictions was that all data - fit into one 4K page on 32bit arches. + This implementation limits : + - maximal queue length per flow to 127 packets. + - max mtu to 2^18-1; + - max 65408 flows, + - number of hash buckets to 65536. It is easy to increase these values, but not in flight. */ -#define SFQ_DEPTH 128 /* max number of packets per flow */ -#define SFQ_SLOTS 128 /* max number of flows */ -#define SFQ_EMPTY_SLOT 255 +#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ +#define SFQ_DEFAULT_FLOWS 128 +#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ +#define SFQ_EMPTY_SLOT 0xffff #define SFQ_DEFAULT_HASH_DIVISOR 1024 /* We use 16 bits to store allot, and want to handle packets up to 64K @@ -85,13 +86,13 @@ #define SFQ_ALLOT_SHIFT 3 #define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) -/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */ -typedef unsigned char sfq_index; +/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ +typedef u16 sfq_index; /* * We dont use pointers to save space. - * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array - * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1] + * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array + * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] * are 'pointers' to dep[] array */ struct sfq_head { @@ -103,28 +104,38 @@ struct sfq_slot { struct sk_buff *skblist_next; struct sk_buff *skblist_prev; sfq_index qlen; /* number of skbs in skblist */ - sfq_index next; /* next slot in sfq chain */ + sfq_index next; /* next slot in sfq RR chain */ struct sfq_head dep; /* anchor in dep[] chains */ unsigned short hash; /* hash value (index in ht[]) */ short allot; /* credit for this slot */ }; struct sfq_sched_data { -/* Parameters */ - int perturb_period; - unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ - int limit; +/* frequently used fields */ + int limit; /* limit of total number of packets in this qdisc */ unsigned int divisor; /* number of slots in hash table */ -/* Variables */ - struct tcf_proto *filter_list; - struct timer_list perturb_timer; + unsigned int maxflows; /* number of flows in flows array */ + int headdrop; + int maxdepth; /* limit of packets per flow */ + u32 perturbation; + struct tcf_proto *filter_list; sfq_index cur_depth; /* depth of longest slot */ unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct sfq_slot *tail; /* current slot in round */ - sfq_index *ht; /* Hash table (divisor slots) */ - struct sfq_slot slots[SFQ_SLOTS]; - struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */ + sfq_index *ht; /* Hash table ('divisor' slots) */ + struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ + + struct sfq_head dep[SFQ_MAX_DEPTH + 1]; + /* Linked lists of slots, indexed by depth + * dep[0] : list of unused flows + * dep[1] : list of flows with 1 packet + * dep[X] : list of flows with X packets + */ + + int perturb_period; + unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ + struct timer_list perturb_timer; }; /* @@ -132,66 +143,36 @@ struct sfq_sched_data { */ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) { - if (val < SFQ_SLOTS) + if (val < SFQ_MAX_FLOWS) return &q->slots[val].dep; - return &q->dep[val - SFQ_SLOTS]; + return &q->dep[val - SFQ_MAX_FLOWS]; } -static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +/* + * In order to be able to quickly rehash our queue when timer changes + * q->perturbation, we store flow_keys in skb->cb[] + */ +struct sfq_skb_cb { + struct flow_keys keys; +}; + +static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) { - return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1); + BUILD_BUG_ON(sizeof(skb->cb) < + sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb)); + return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; } -static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +static unsigned int sfq_hash(const struct sfq_sched_data *q, + const struct sk_buff *skb) { - u32 h, h2; - - switch (skb->protocol) { - case htons(ETH_P_IP): - { - const struct iphdr *iph; - int poff; - - if (!pskb_network_may_pull(skb, sizeof(*iph))) - goto err; - iph = ip_hdr(skb); - h = (__force u32)iph->daddr; - h2 = (__force u32)iph->saddr ^ iph->protocol; - if (ip_is_fragment(iph)) - break; - poff = proto_ports_offset(iph->protocol); - if (poff >= 0 && - pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { - iph = ip_hdr(skb); - h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff); - } - break; - } - case htons(ETH_P_IPV6): - { - const struct ipv6hdr *iph; - int poff; - - if (!pskb_network_may_pull(skb, sizeof(*iph))) - goto err; - iph = ipv6_hdr(skb); - h = (__force u32)iph->daddr.s6_addr32[3]; - h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; - poff = proto_ports_offset(iph->nexthdr); - if (poff >= 0 && - pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) { - iph = ipv6_hdr(skb); - h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff); - } - break; - } - default: -err: - h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol; - h2 = (unsigned long)skb->sk; - } + const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; + unsigned int hash; - return sfq_fold_hash(q, h, h2); + hash = jhash_3words((__force u32)keys->dst, + (__force u32)keys->src ^ keys->ip_proto, + (__force u32)keys->ports, q->perturbation); + return hash & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -206,8 +187,10 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); - if (!q->filter_list) + if (!q->filter_list) { + skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); return sfq_hash(q, skb) + 1; + } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tc_classify(skb, q->filter_list, &res); @@ -228,18 +211,19 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, } /* - * x : slot number [0 .. SFQ_SLOTS - 1] + * x : slot number [0 .. SFQ_MAX_FLOWS - 1] */ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; - int qlen = q->slots[x].qlen; + struct sfq_slot *slot = &q->slots[x]; + int qlen = slot->qlen; - p = qlen + SFQ_SLOTS; + p = qlen + SFQ_MAX_FLOWS; n = q->dep[qlen].next; - q->slots[x].dep.next = n; - q->slots[x].dep.prev = p; + slot->dep.next = n; + slot->dep.prev = p; q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ sfq_dep_head(q, n)->prev = x; @@ -304,6 +288,7 @@ static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) static inline void slot_queue_init(struct sfq_slot *slot) { + memset(slot, 0, sizeof(*slot)); slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; } @@ -334,7 +319,7 @@ static unsigned int sfq_drop(struct Qdisc *sch) x = q->dep[d].next; slot = &q->slots[x]; drop: - skb = slot_dequeue_tail(slot); + skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); sfq_dec(q, x); kfree_skb(skb); @@ -378,16 +363,27 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ + if (x >= SFQ_MAX_FLOWS) + return qdisc_drop(skb, sch); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; } - /* If selected queue has length q->limit, do simple tail drop, - * i.e. drop _this_ packet. - */ - if (slot->qlen >= q->limit) - return qdisc_drop(skb, sch); + if (slot->qlen >= q->maxdepth) { + struct sk_buff *head; + + if (!q->headdrop) + return qdisc_drop(skb, sch); + + head = slot_dequeue_head(slot); + sch->qstats.backlog -= qdisc_pkt_len(head); + qdisc_drop(head, sch); + + sch->qstats.backlog += qdisc_pkt_len(skb); + slot_queue_add(slot, skb); + return NET_XMIT_CN; + } sch->qstats.backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); @@ -395,11 +391,11 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; + q->tail = slot; } else { slot->next = q->tail->next; q->tail->next = x; } - q->tail = slot; slot->allot = q->scaled_quantum; } if (++sch->q.qlen <= q->limit) @@ -468,12 +464,83 @@ sfq_reset(struct Qdisc *sch) kfree_skb(skb); } +/* + * When q->perturbation is changed, we rehash all queued skbs + * to avoid OOO (Out Of Order) effects. + * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change + * counters. + */ +static void sfq_rehash(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + int i; + struct sfq_slot *slot; + struct sk_buff_head list; + int dropped = 0; + + __skb_queue_head_init(&list); + + for (i = 0; i < q->maxflows; i++) { + slot = &q->slots[i]; + if (!slot->qlen) + continue; + while (slot->qlen) { + skb = slot_dequeue_head(slot); + sfq_dec(q, i); + __skb_queue_tail(&list, skb); + } + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + } + q->tail = NULL; + + while ((skb = __skb_dequeue(&list)) != NULL) { + unsigned int hash = sfq_hash(q, skb); + sfq_index x = q->ht[hash]; + + slot = &q->slots[x]; + if (x == SFQ_EMPTY_SLOT) { + x = q->dep[0].next; /* get a free slot */ + if (x >= SFQ_MAX_FLOWS) { +drop: sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + dropped++; + continue; + } + q->ht[hash] = x; + slot = &q->slots[x]; + slot->hash = hash; + } + if (slot->qlen >= q->maxdepth) + goto drop; + slot_queue_add(slot, skb); + sfq_inc(q, x); + if (slot->qlen == 1) { /* The flow is new */ + if (q->tail == NULL) { /* It is the first flow */ + slot->next = x; + } else { + slot->next = q->tail->next; + q->tail->next = x; + } + q->tail = slot; + slot->allot = q->scaled_quantum; + } + } + sch->q.qlen -= dropped; + qdisc_tree_decrease_qlen(sch, dropped); +} + static void sfq_perturbation(unsigned long arg) { struct Qdisc *sch = (struct Qdisc *)arg; struct sfq_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); q->perturbation = net_random(); + if (!q->filter_list && q->tail) + sfq_rehash(sch); + spin_unlock(root_lock); if (q->perturb_period) mod_timer(&q->perturb_timer, jiffies + q->perturb_period); @@ -483,23 +550,39 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); + struct tc_sfq_qopt_v1 *ctl_v1 = NULL; unsigned int qlen; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; - + if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) + ctl_v1 = nla_data(opt); if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; sch_tree_lock(sch); - q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch)); - q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + if (ctl->quantum) { + q->quantum = ctl->quantum; + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + } q->perturb_period = ctl->perturb_period * HZ; - if (ctl->limit) - q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1); - if (ctl->divisor) + if (ctl->flows) + q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); + if (ctl->divisor) { q->divisor = ctl->divisor; + q->maxflows = min_t(u32, q->maxflows, q->divisor); + } + if (ctl_v1) { + if (ctl_v1->depth) + q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + q->headdrop = ctl_v1->headdrop; + } + if (ctl->limit) { + q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); + q->maxflows = min_t(u32, q->maxflows, q->limit); + } + qlen = sch->q.qlen; while (sch->q.qlen > q->limit) sfq_drop(sch); @@ -514,46 +597,77 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) return 0; } +static void *sfq_alloc(size_t sz) +{ + void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vmalloc(sz); + return ptr; +} + +static void sfq_free(void *addr) +{ + if (addr) { + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); + } +} + +static void sfq_destroy(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + q->perturb_period = 0; + del_timer_sync(&q->perturb_timer); + sfq_free(q->ht); + sfq_free(q->slots); +} + static int sfq_init(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); - size_t sz; int i; q->perturb_timer.function = sfq_perturbation; q->perturb_timer.data = (unsigned long)sch; init_timer_deferrable(&q->perturb_timer); - for (i = 0; i < SFQ_DEPTH; i++) { - q->dep[i].next = i + SFQ_SLOTS; - q->dep[i].prev = i + SFQ_SLOTS; + for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { + q->dep[i].next = i + SFQ_MAX_FLOWS; + q->dep[i].prev = i + SFQ_MAX_FLOWS; } - q->limit = SFQ_DEPTH - 1; + q->limit = SFQ_MAX_DEPTH; + q->maxdepth = SFQ_MAX_DEPTH; q->cur_depth = 0; q->tail = NULL; q->divisor = SFQ_DEFAULT_HASH_DIVISOR; - if (opt == NULL) { - q->quantum = psched_mtu(qdisc_dev(sch)); - q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); - q->perturb_period = 0; - q->perturbation = net_random(); - } else { + q->maxflows = SFQ_DEFAULT_FLOWS; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + q->perturb_period = 0; + q->perturbation = net_random(); + + if (opt) { int err = sfq_change(sch, opt); if (err) return err; } - sz = sizeof(q->ht[0]) * q->divisor; - q->ht = kmalloc(sz, GFP_KERNEL); - if (!q->ht && sz > PAGE_SIZE) - q->ht = vmalloc(sz); - if (!q->ht) + q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); + q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); + if (!q->ht || !q->slots) { + sfq_destroy(sch); return -ENOMEM; + } for (i = 0; i < q->divisor; i++) q->ht[i] = SFQ_EMPTY_SLOT; - for (i = 0; i < SFQ_SLOTS; i++) { + for (i = 0; i < q->maxflows; i++) { slot_queue_init(&q->slots[i]); sfq_link(q, i); } @@ -564,31 +678,20 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) return 0; } -static void sfq_destroy(struct Qdisc *sch) -{ - struct sfq_sched_data *q = qdisc_priv(sch); - - tcf_destroy_chain(&q->filter_list); - q->perturb_period = 0; - del_timer_sync(&q->perturb_timer); - if (is_vmalloc_addr(q->ht)) - vfree(q->ht); - else - kfree(q->ht); -} - static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); - struct tc_sfq_qopt opt; - - opt.quantum = q->quantum; - opt.perturb_period = q->perturb_period / HZ; - - opt.limit = q->limit; - opt.divisor = q->divisor; - opt.flows = q->limit; + struct tc_sfq_qopt_v1 opt; + + memset(&opt, 0, sizeof(opt)); + opt.v0.quantum = q->quantum; + opt.v0.perturb_period = q->perturb_period / HZ; + opt.v0.limit = q->limit; + opt.v0.divisor = q->divisor; + opt.v0.flows = q->maxflows; + opt.depth = q->maxdepth; + opt.headdrop = q->headdrop; NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 1dcfb5223a86..b8e156319d7b 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -346,6 +346,7 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nest; struct tc_tbf_qopt opt; + sch->qstats.backlog = q->qdisc->qstats.backlog; nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 4f4c52c0eeb3..45326599fda3 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -277,7 +277,7 @@ static inline int teql_resolve(struct sk_buff *skb, return 0; rcu_read_lock(); - mn = dst_get_neighbour(dst); + mn = dst_get_neighbour_noref(dst); res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0; rcu_read_unlock(); @@ -310,7 +310,7 @@ restart: if (slave_txq->qdisc_sleeping != q) continue; - if (__netif_subqueue_stopped(slave, subq) || + if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) || !netif_running(slave)) { busy = 1; continue; @@ -321,7 +321,7 @@ restart: if (__netif_tx_trylock(slave_txq)) { unsigned int length = qdisc_pkt_len(skb); - if (!netif_tx_queue_frozen_or_stopped(slave_txq) && + if (!netif_xmit_frozen_or_stopped(slave_txq) && slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { txq_trans_update(slave_txq); __netif_tx_unlock(slave_txq); @@ -333,7 +333,7 @@ restart: } __netif_tx_unlock(slave_txq); } - if (netif_queue_stopped(dev)) + if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0))) busy = 1; break; case 1: |