summaryrefslogtreecommitdiffstats
path: root/net/sched
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-01-06 17:22:09 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2012-01-06 17:22:09 -0800
commit9753dfe19a85e7e45a34a56f4cb2048bb4f50e27 (patch)
treec017a1b4a70b8447c71b01d8b320e071546b5c9d /net/sched
parentedf7c8148ec40c0fd27c0ef3f688defcc65e3913 (diff)
parent9f42f126154786e6e76df513004800c8c633f020 (diff)
downloadlinux-9753dfe19a85e7e45a34a56f4cb2048bb4f50e27.tar.bz2
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1958 commits) net: pack skb_shared_info more efficiently net_sched: red: split red_parms into parms and vars net_sched: sfq: extend limits cnic: Improve error recovery on bnx2x devices cnic: Re-init dev->stats_addr after chip reset net_sched: Bug in netem reordering bna: fix sparse warnings/errors bna: make ethtool_ops and strings const xgmac: cleanups net: make ethtool_ops const vmxnet3" make ethtool ops const xen-netback: make ops structs const virtio_net: Pass gfp flags when allocating rx buffers. ixgbe: FCoE: Add support for ndo_get_fcoe_hbainfo() call netdev: FCoE: Add new ndo_get_fcoe_hbainfo() call igb: reset PHY after recovering from PHY power down igb: add basic runtime PM support igb: Add support for byte queue limits. e1000: cleanup CE4100 MDIO registers access e1000: unmap ce4100_gbe_mdio_base_virt in e1000_remove ...
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/cls_flow.c182
-rw-r--r--net/sched/sch_api.c14
-rw-r--r--net/sched/sch_choke.c161
-rw-r--r--net/sched/sch_generic.c9
-rw-r--r--net/sched/sch_gred.c81
-rw-r--r--net/sched/sch_hfsc.c10
-rw-r--r--net/sched/sch_multiq.c6
-rw-r--r--net/sched/sch_netem.c278
-rw-r--r--net/sched/sch_qfq.c17
-rw-r--r--net/sched/sch_red.c57
-rw-r--r--net/sched/sch_sfb.c17
-rw-r--r--net/sched/sch_sfq.c369
-rw-r--r--net/sched/sch_tbf.c1
-rw-r--r--net/sched/sch_teql.c8
14 files changed, 648 insertions, 562 deletions
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 7b582300d051..1d8bd0dbcd1f 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -26,6 +26,8 @@
#include <net/pkt_cls.h>
#include <net/ip.h>
#include <net/route.h>
+#include <net/flow_keys.h>
+
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netfilter/nf_conntrack.h>
#endif
@@ -66,134 +68,37 @@ static inline u32 addr_fold(void *addr)
return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
}
-static u32 flow_get_src(const struct sk_buff *skb, int nhoff)
+static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
- __be32 *data = NULL, hdata;
-
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- data = skb_header_pointer(skb,
- nhoff + offsetof(struct iphdr,
- saddr),
- 4, &hdata);
- break;
- case htons(ETH_P_IPV6):
- data = skb_header_pointer(skb,
- nhoff + offsetof(struct ipv6hdr,
- saddr.s6_addr32[3]),
- 4, &hdata);
- break;
- }
-
- if (data)
- return ntohl(*data);
+ if (flow->src)
+ return ntohl(flow->src);
return addr_fold(skb->sk);
}
-static u32 flow_get_dst(const struct sk_buff *skb, int nhoff)
+static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
- __be32 *data = NULL, hdata;
-
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- data = skb_header_pointer(skb,
- nhoff + offsetof(struct iphdr,
- daddr),
- 4, &hdata);
- break;
- case htons(ETH_P_IPV6):
- data = skb_header_pointer(skb,
- nhoff + offsetof(struct ipv6hdr,
- daddr.s6_addr32[3]),
- 4, &hdata);
- break;
- }
-
- if (data)
- return ntohl(*data);
+ if (flow->dst)
+ return ntohl(flow->dst);
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
-static u32 flow_get_proto(const struct sk_buff *skb, int nhoff)
+static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
{
- __u8 *data = NULL, hdata;
-
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- data = skb_header_pointer(skb,
- nhoff + offsetof(struct iphdr,
- protocol),
- 1, &hdata);
- break;
- case htons(ETH_P_IPV6):
- data = skb_header_pointer(skb,
- nhoff + offsetof(struct ipv6hdr,
- nexthdr),
- 1, &hdata);
- break;
- }
- if (data)
- return *data;
- return 0;
+ return flow->ip_proto;
}
-/* helper function to get either src or dst port */
-static __be16 *flow_get_proto_common(const struct sk_buff *skb, int nhoff,
- __be16 *_port, int dst)
+static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
- __be16 *port = NULL;
- int poff;
-
- switch (skb->protocol) {
- case htons(ETH_P_IP): {
- struct iphdr *iph, _iph;
-
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
- if (!iph)
- break;
- if (ip_is_fragment(iph))
- break;
- poff = proto_ports_offset(iph->protocol);
- if (poff >= 0)
- port = skb_header_pointer(skb,
- nhoff + iph->ihl * 4 + poff + dst,
- sizeof(*_port), _port);
- break;
- }
- case htons(ETH_P_IPV6): {
- struct ipv6hdr *iph, _iph;
-
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
- if (!iph)
- break;
- poff = proto_ports_offset(iph->nexthdr);
- if (poff >= 0)
- port = skb_header_pointer(skb,
- nhoff + sizeof(*iph) + poff + dst,
- sizeof(*_port), _port);
- break;
- }
- }
-
- return port;
-}
-
-static u32 flow_get_proto_src(const struct sk_buff *skb, int nhoff)
-{
- __be16 _port, *port = flow_get_proto_common(skb, nhoff, &_port, 0);
-
- if (port)
- return ntohs(*port);
+ if (flow->ports)
+ return ntohs(flow->port16[0]);
return addr_fold(skb->sk);
}
-static u32 flow_get_proto_dst(const struct sk_buff *skb, int nhoff)
+static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
- __be16 _port, *port = flow_get_proto_common(skb, nhoff, &_port, 2);
-
- if (port)
- return ntohs(*port);
+ if (flow->ports)
+ return ntohs(flow->port16[1]);
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
@@ -239,7 +144,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
})
#endif
-static u32 flow_get_nfct_src(const struct sk_buff *skb, int nhoff)
+static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
@@ -248,10 +153,10 @@ static u32 flow_get_nfct_src(const struct sk_buff *skb, int nhoff)
return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
}
fallback:
- return flow_get_src(skb, nhoff);
+ return flow_get_src(skb, flow);
}
-static u32 flow_get_nfct_dst(const struct sk_buff *skb, int nhoff)
+static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
@@ -260,21 +165,21 @@ static u32 flow_get_nfct_dst(const struct sk_buff *skb, int nhoff)
return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
}
fallback:
- return flow_get_dst(skb, nhoff);
+ return flow_get_dst(skb, flow);
}
-static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, int nhoff)
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, src.u.all));
fallback:
- return flow_get_proto_src(skb, nhoff);
+ return flow_get_proto_src(skb, flow);
}
-static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, int nhoff)
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, dst.u.all));
fallback:
- return flow_get_proto_dst(skb, nhoff);
+ return flow_get_proto_dst(skb, flow);
}
static u32 flow_get_rtclassid(const struct sk_buff *skb)
@@ -314,21 +219,19 @@ static u32 flow_get_rxhash(struct sk_buff *skb)
return skb_get_rxhash(skb);
}
-static u32 flow_key_get(struct sk_buff *skb, int key)
+static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
{
- int nhoff = skb_network_offset(skb);
-
switch (key) {
case FLOW_KEY_SRC:
- return flow_get_src(skb, nhoff);
+ return flow_get_src(skb, flow);
case FLOW_KEY_DST:
- return flow_get_dst(skb, nhoff);
+ return flow_get_dst(skb, flow);
case FLOW_KEY_PROTO:
- return flow_get_proto(skb, nhoff);
+ return flow_get_proto(skb, flow);
case FLOW_KEY_PROTO_SRC:
- return flow_get_proto_src(skb, nhoff);
+ return flow_get_proto_src(skb, flow);
case FLOW_KEY_PROTO_DST:
- return flow_get_proto_dst(skb, nhoff);
+ return flow_get_proto_dst(skb, flow);
case FLOW_KEY_IIF:
return flow_get_iif(skb);
case FLOW_KEY_PRIORITY:
@@ -338,13 +241,13 @@ static u32 flow_key_get(struct sk_buff *skb, int key)
case FLOW_KEY_NFCT:
return flow_get_nfct(skb);
case FLOW_KEY_NFCT_SRC:
- return flow_get_nfct_src(skb, nhoff);
+ return flow_get_nfct_src(skb, flow);
case FLOW_KEY_NFCT_DST:
- return flow_get_nfct_dst(skb, nhoff);
+ return flow_get_nfct_dst(skb, flow);
case FLOW_KEY_NFCT_PROTO_SRC:
- return flow_get_nfct_proto_src(skb, nhoff);
+ return flow_get_nfct_proto_src(skb, flow);
case FLOW_KEY_NFCT_PROTO_DST:
- return flow_get_nfct_proto_dst(skb, nhoff);
+ return flow_get_nfct_proto_dst(skb, flow);
case FLOW_KEY_RTCLASSID:
return flow_get_rtclassid(skb);
case FLOW_KEY_SKUID:
@@ -361,6 +264,16 @@ static u32 flow_key_get(struct sk_buff *skb, int key)
}
}
+#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \
+ (1 << FLOW_KEY_DST) | \
+ (1 << FLOW_KEY_PROTO) | \
+ (1 << FLOW_KEY_PROTO_SRC) | \
+ (1 << FLOW_KEY_PROTO_DST) | \
+ (1 << FLOW_KEY_NFCT_SRC) | \
+ (1 << FLOW_KEY_NFCT_DST) | \
+ (1 << FLOW_KEY_NFCT_PROTO_SRC) | \
+ (1 << FLOW_KEY_NFCT_PROTO_DST))
+
static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
@@ -372,17 +285,20 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
int r;
list_for_each_entry(f, &head->filters, list) {
- u32 keys[f->nkeys];
+ u32 keys[FLOW_KEY_MAX + 1];
+ struct flow_keys flow_keys;
if (!tcf_em_tree_match(skb, &f->ematches, NULL))
continue;
keymask = f->keymask;
+ if (keymask & FLOW_KEYS_NEEDED)
+ skb_flow_dissect(skb, &flow_keys);
for (n = 0; n < f->nkeys; n++) {
key = ffs(keymask) - 1;
keymask &= ~(1 << key);
- keys[n] = flow_key_get(skb, key);
+ keys[n] = flow_key_get(skb, key, &flow_keys);
}
if (f->mode == FLOW_MODE_HASH)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index dca6c1a576f7..3d8981fde301 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -618,20 +618,24 @@ void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
-/* Allocate an unique handle from space managed by kernel */
-
+/* Allocate an unique handle from space managed by kernel
+ * Possible range is [8000-FFFF]:0000 (0x8000 values)
+ */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
- int i = 0x10000;
+ int i = 0x8000;
static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
do {
autohandle += TC_H_MAKE(0x10000U, 0);
if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
autohandle = TC_H_MAKE(0x80000000U, 0);
- } while (qdisc_lookup(dev, autohandle) && --i > 0);
+ if (!qdisc_lookup(dev, autohandle))
+ return autohandle;
+ cond_resched();
+ } while (--i > 0);
- return i > 0 ? autohandle : 0;
+ return 0;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 3422b25df9e4..e465064d39a3 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -19,10 +19,7 @@
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
-#include <linux/ip.h>
-#include <net/ip.h>
-#include <linux/ipv6.h>
-#include <net/ipv6.h>
+#include <net/flow_keys.h>
/*
CHOKe stateless AQM for fair bandwidth allocation
@@ -60,6 +57,7 @@ struct choke_sched_data {
struct red_parms parms;
/* Variables */
+ struct red_vars vars;
struct tcf_proto *filter_list;
struct {
u32 prob_drop; /* Early probability drops */
@@ -142,85 +140,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
--sch->q.qlen;
}
-/*
- * Compare flow of two packets
- * Returns true only if source and destination address and port match.
- * false for special cases
- */
-static bool choke_match_flow(struct sk_buff *skb1,
- struct sk_buff *skb2)
-{
- int off1, off2, poff;
- const u32 *ports1, *ports2;
- u8 ip_proto;
- __u32 hash1;
-
- if (skb1->protocol != skb2->protocol)
- return false;
-
- /* Use hash value as quick check
- * Assumes that __skb_get_rxhash makes IP header and ports linear
- */
- hash1 = skb_get_rxhash(skb1);
- if (!hash1 || hash1 != skb_get_rxhash(skb2))
- return false;
-
- /* Probably match, but be sure to avoid hash collisions */
- off1 = skb_network_offset(skb1);
- off2 = skb_network_offset(skb2);
-
- switch (skb1->protocol) {
- case __constant_htons(ETH_P_IP): {
- const struct iphdr *ip1, *ip2;
-
- ip1 = (const struct iphdr *) (skb1->data + off1);
- ip2 = (const struct iphdr *) (skb2->data + off2);
-
- ip_proto = ip1->protocol;
- if (ip_proto != ip2->protocol ||
- ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
- return false;
-
- if (ip_is_fragment(ip1) | ip_is_fragment(ip2))
- ip_proto = 0;
- off1 += ip1->ihl * 4;
- off2 += ip2->ihl * 4;
- break;
- }
-
- case __constant_htons(ETH_P_IPV6): {
- const struct ipv6hdr *ip1, *ip2;
-
- ip1 = (const struct ipv6hdr *) (skb1->data + off1);
- ip2 = (const struct ipv6hdr *) (skb2->data + off2);
-
- ip_proto = ip1->nexthdr;
- if (ip_proto != ip2->nexthdr ||
- ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
- ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
- return false;
- off1 += 40;
- off2 += 40;
- }
-
- default: /* Maybe compare MAC header here? */
- return false;
- }
-
- poff = proto_ports_offset(ip_proto);
- if (poff < 0)
- return true;
-
- off1 += poff;
- off2 += poff;
-
- ports1 = (__force u32 *)(skb1->data + off1);
- ports2 = (__force u32 *)(skb2->data + off2);
- return *ports1 == *ports2;
-}
-
struct choke_skb_cb {
- u16 classid;
+ u16 classid;
+ u8 keys_valid;
+ struct flow_keys keys;
};
static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
@@ -241,6 +164,32 @@ static u16 choke_get_classid(const struct sk_buff *skb)
}
/*
+ * Compare flow of two packets
+ * Returns true only if source and destination address and port match.
+ * false for special cases
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+ struct sk_buff *skb2)
+{
+ if (skb1->protocol != skb2->protocol)
+ return false;
+
+ if (!choke_skb_cb(skb1)->keys_valid) {
+ choke_skb_cb(skb1)->keys_valid = 1;
+ skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
+ }
+
+ if (!choke_skb_cb(skb2)->keys_valid) {
+ choke_skb_cb(skb2)->keys_valid = 1;
+ skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
+ }
+
+ return !memcmp(&choke_skb_cb(skb1)->keys,
+ &choke_skb_cb(skb2)->keys,
+ sizeof(struct flow_keys));
+}
+
+/*
* Classify flow using either:
* 1. pre-existing classification result in skb
* 2. fast internal classification
@@ -317,7 +266,7 @@ static bool choke_match_random(const struct choke_sched_data *q,
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
- struct red_parms *p = &q->parms;
+ const struct red_parms *p = &q->parms;
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
if (q->filter_list) {
@@ -326,14 +275,15 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
goto other_drop; /* Packet was eaten by filter */
}
+ choke_skb_cb(skb)->keys_valid = 0;
/* Compute average queue usage (see RED) */
- p->qavg = red_calc_qavg(p, sch->q.qlen);
- if (red_is_idling(p))
- red_end_of_idle_period(p);
+ q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
/* Is queue small? */
- if (p->qavg <= p->qth_min)
- p->qcount = -1;
+ if (q->vars.qavg <= p->qth_min)
+ q->vars.qcount = -1;
else {
unsigned int idx;
@@ -345,8 +295,8 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
/* Queue is large, always mark/drop */
- if (p->qavg > p->qth_max) {
- p->qcount = -1;
+ if (q->vars.qavg > p->qth_max) {
+ q->vars.qcount = -1;
sch->qstats.overlimits++;
if (use_harddrop(q) || !use_ecn(q) ||
@@ -356,10 +306,10 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
q->stats.forced_mark++;
- } else if (++p->qcount) {
- if (red_mark_probability(p, p->qavg)) {
- p->qcount = 0;
- p->qR = red_random(p);
+ } else if (++q->vars.qcount) {
+ if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
+ q->vars.qcount = 0;
+ q->vars.qR = red_random(p);
sch->qstats.overlimits++;
if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
@@ -370,7 +320,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->stats.prob_mark++;
}
} else
- p->qR = red_random(p);
+ q->vars.qR = red_random(p);
}
/* Admit new packet */
@@ -404,8 +354,8 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch)
struct sk_buff *skb;
if (q->head == q->tail) {
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
return NULL;
}
@@ -428,8 +378,8 @@ static unsigned int choke_drop(struct Qdisc *sch)
if (len > 0)
q->stats.other++;
else {
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
}
return len;
@@ -439,12 +389,13 @@ static void choke_reset(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
- red_restart(&q->parms);
+ red_restart(&q->vars);
}
static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
[TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
+ [TCA_CHOKE_MAX_P] = { .type = NLA_U32 },
};
@@ -466,6 +417,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
int err;
struct sk_buff **old = NULL;
unsigned int mask;
+ u32 max_P;
if (opt == NULL)
return -EINVAL;
@@ -478,6 +430,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
tb[TCA_CHOKE_STAB] == NULL)
return -EINVAL;
+ max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
+
ctl = nla_data(tb[TCA_CHOKE_PARMS]);
if (ctl->limit > CHOKE_MAX_QUEUE)
@@ -527,10 +481,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
ctl->Plog, ctl->Scell_log,
- nla_data(tb[TCA_CHOKE_STAB]));
+ nla_data(tb[TCA_CHOKE_STAB]),
+ max_P);
+ red_set_vars(&q->vars);
if (q->head == q->tail)
- red_end_of_idle_period(&q->parms);
+ red_end_of_idle_period(&q->vars);
sch_tree_unlock(sch);
choke_free(old);
@@ -561,6 +517,7 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
goto nla_put_failure;
NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
+ NLA_PUT_U32(skb, TCA_CHOKE_MAX_P, q->parms.max_P);
return nla_nest_end(skb, opts);
nla_put_failure:
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69fca2798804..67fc573e013a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -60,7 +60,7 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
/* check the reason of requeuing without tx lock first */
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
- if (!netif_tx_queue_frozen_or_stopped(txq)) {
+ if (!netif_xmit_frozen_or_stopped(txq)) {
q->gso_skb = NULL;
q->q.qlen--;
} else
@@ -121,7 +121,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
spin_unlock(root_lock);
HARD_TX_LOCK(dev, txq, smp_processor_id());
- if (!netif_tx_queue_frozen_or_stopped(txq))
+ if (!netif_xmit_frozen_or_stopped(txq))
ret = dev_hard_start_xmit(skb, dev, txq);
HARD_TX_UNLOCK(dev, txq);
@@ -143,7 +143,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
ret = dev_requeue_skb(skb, q);
}
- if (ret && netif_tx_queue_frozen_or_stopped(txq))
+ if (ret && netif_xmit_frozen_or_stopped(txq))
ret = 0;
return ret;
@@ -242,10 +242,11 @@ static void dev_watchdog(unsigned long arg)
* old device drivers set dev->trans_start
*/
trans_start = txq->trans_start ? : dev->trans_start;
- if (netif_tx_queue_stopped(txq) &&
+ if (netif_xmit_stopped(txq) &&
time_after(jiffies, (trans_start +
dev->watchdog_timeo))) {
some_queue_timedout = 1;
+ txq->trans_timeout++;
break;
}
}
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 6cd8ddfb512d..0b15236be7b6 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -34,13 +34,14 @@ struct gred_sched;
struct gred_sched_data {
u32 limit; /* HARD maximal queue length */
- u32 DP; /* the drop pramaters */
+ u32 DP; /* the drop parameters */
u32 bytesin; /* bytes seen on virtualQ so far*/
u32 packetsin; /* packets seen on virtualQ so far*/
u32 backlog; /* bytes on the virtualQ */
u8 prio; /* the prio of this vq */
struct red_parms parms;
+ struct red_vars vars;
struct red_stats stats;
};
@@ -55,7 +56,7 @@ struct gred_sched {
u32 red_flags;
u32 DPs;
u32 def;
- struct red_parms wred_set;
+ struct red_vars wred_set;
};
static inline int gred_wred_mode(struct gred_sched *table)
@@ -125,17 +126,17 @@ static inline u16 tc_index_to_dp(struct sk_buff *skb)
return skb->tc_index & GRED_VQ_MASK;
}
-static inline void gred_load_wred_set(struct gred_sched *table,
+static inline void gred_load_wred_set(const struct gred_sched *table,
struct gred_sched_data *q)
{
- q->parms.qavg = table->wred_set.qavg;
- q->parms.qidlestart = table->wred_set.qidlestart;
+ q->vars.qavg = table->wred_set.qavg;
+ q->vars.qidlestart = table->wred_set.qidlestart;
}
static inline void gred_store_wred_set(struct gred_sched *table,
struct gred_sched_data *q)
{
- table->wred_set.qavg = q->parms.qavg;
+ table->wred_set.qavg = q->vars.qavg;
}
static inline int gred_use_ecn(struct gred_sched *t)
@@ -170,7 +171,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
goto drop;
}
- /* fix tc_index? --could be controvesial but needed for
+ /* fix tc_index? --could be controversial but needed for
requeueing */
skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
}
@@ -181,8 +182,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
for (i = 0; i < t->DPs; i++) {
if (t->tab[i] && t->tab[i]->prio < q->prio &&
- !red_is_idling(&t->tab[i]->parms))
- qavg += t->tab[i]->parms.qavg;
+ !red_is_idling(&t->tab[i]->vars))
+ qavg += t->tab[i]->vars.qavg;
}
}
@@ -193,15 +194,17 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (gred_wred_mode(t))
gred_load_wred_set(t, q);
- q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ gred_backlog(t, q, sch));
- if (red_is_idling(&q->parms))
- red_end_of_idle_period(&q->parms);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
if (gred_wred_mode(t))
gred_store_wred_set(t, q);
- switch (red_action(&q->parms, q->parms.qavg + qavg)) {
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) {
case RED_DONT_MARK:
break;
@@ -260,7 +263,7 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch)
q->backlog -= qdisc_pkt_len(skb);
if (!q->backlog && !gred_wred_mode(t))
- red_start_of_idle_period(&q->parms);
+ red_start_of_idle_period(&q->vars);
}
return skb;
@@ -293,7 +296,7 @@ static unsigned int gred_drop(struct Qdisc *sch)
q->stats.other++;
if (!q->backlog && !gred_wred_mode(t))
- red_start_of_idle_period(&q->parms);
+ red_start_of_idle_period(&q->vars);
}
qdisc_drop(skb, sch);
@@ -320,7 +323,7 @@ static void gred_reset(struct Qdisc *sch)
if (!q)
continue;
- red_restart(&q->parms);
+ red_restart(&q->vars);
q->backlog = 0;
}
}
@@ -379,29 +382,31 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
}
static inline int gred_change_vq(struct Qdisc *sch, int dp,
- struct tc_gred_qopt *ctl, int prio, u8 *stab)
+ struct tc_gred_qopt *ctl, int prio,
+ u8 *stab, u32 max_P,
+ struct gred_sched_data **prealloc)
{
struct gred_sched *table = qdisc_priv(sch);
- struct gred_sched_data *q;
+ struct gred_sched_data *q = table->tab[dp];
- if (table->tab[dp] == NULL) {
- table->tab[dp] = kzalloc(sizeof(*q), GFP_ATOMIC);
- if (table->tab[dp] == NULL)
+ if (!q) {
+ table->tab[dp] = q = *prealloc;
+ *prealloc = NULL;
+ if (!q)
return -ENOMEM;
}
- q = table->tab[dp];
q->DP = dp;
q->prio = prio;
q->limit = ctl->limit;
if (q->backlog == 0)
- red_end_of_idle_period(&q->parms);
+ red_end_of_idle_period(&q->vars);
red_set_parms(&q->parms,
ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
- ctl->Scell_log, stab);
-
+ ctl->Scell_log, stab, max_P);
+ red_set_vars(&q->vars);
return 0;
}
@@ -409,6 +414,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
[TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) },
[TCA_GRED_STAB] = { .len = 256 },
[TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
+ [TCA_GRED_MAX_P] = { .type = NLA_U32 },
};
static int gred_change(struct Qdisc *sch, struct nlattr *opt)
@@ -418,6 +424,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
struct nlattr *tb[TCA_GRED_MAX + 1];
int err, prio = GRED_DEF_PRIO;
u8 *stab;
+ u32 max_P;
+ struct gred_sched_data *prealloc;
if (opt == NULL)
return -EINVAL;
@@ -433,6 +441,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
tb[TCA_GRED_STAB] == NULL)
return -EINVAL;
+ max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
+
err = -EINVAL;
ctl = nla_data(tb[TCA_GRED_PARMS]);
stab = nla_data(tb[TCA_GRED_STAB]);
@@ -455,9 +465,10 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
prio = ctl->prio;
}
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
sch_tree_lock(sch);
- err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
+ err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
if (err < 0)
goto errout_locked;
@@ -471,6 +482,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
errout_locked:
sch_tree_unlock(sch);
+ kfree(prealloc);
errout:
return err;
}
@@ -498,6 +510,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
struct gred_sched *table = qdisc_priv(sch);
struct nlattr *parms, *opts = NULL;
int i;
+ u32 max_p[MAX_DPs];
struct tc_gred_sopt sopt = {
.DPs = table->DPs,
.def_DP = table->def,
@@ -509,6 +522,14 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
if (opts == NULL)
goto nla_put_failure;
NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+
+ max_p[i] = q ? q->parms.max_P : 0;
+ }
+ NLA_PUT(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p);
+
parms = nla_nest_start(skb, TCA_GRED_PARMS);
if (parms == NULL)
goto nla_put_failure;
@@ -545,12 +566,12 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.bytesin = q->bytesin;
if (gred_wred_mode(table)) {
- q->parms.qidlestart =
- table->tab[table->def]->parms.qidlestart;
- q->parms.qavg = table->tab[table->def]->parms.qavg;
+ q->vars.qidlestart =
+ table->tab[table->def]->vars.qidlestart;
+ q->vars.qavg = table->tab[table->def]->vars.qavg;
}
- opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
+ opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg);
append_opt:
if (nla_append(skb, sizeof(opt), &opt) < 0)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 6488e6425652..9bdca2e011e9 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1368,6 +1368,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct tc_hfsc_stats xstats;
cl->qstats.qlen = cl->qdisc->q.qlen;
+ cl->qstats.backlog = cl->qdisc->qstats.backlog;
xstats.level = cl->level;
xstats.period = cl->cl_vtperiod;
xstats.work = cl->cl_total;
@@ -1561,6 +1562,15 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
struct hfsc_sched *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_hfsc_qopt qopt;
+ struct hfsc_class *cl;
+ struct hlist_node *n;
+ unsigned int i;
+
+ sch->qstats.backlog = 0;
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode)
+ sch->qstats.backlog += cl->qdisc->qstats.backlog;
+ }
qopt.defcls = q->defcls;
NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index edc1950e0e77..49131d7a7446 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -107,7 +107,8 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
/* Check that target subqueue is available before
* pulling an skb to avoid head-of-line blocking.
*/
- if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) {
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), q->curband))) {
qdisc = q->queues[q->curband];
skb = qdisc->dequeue(qdisc);
if (skb) {
@@ -138,7 +139,8 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch)
/* Check that target subqueue is available before
* pulling an skb to avoid head-of-line blocking.
*/
- if (!__netif_subqueue_stopped(qdisc_dev(sch), curband)) {
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), curband))) {
qdisc = q->queues[curband];
skb = qdisc->ops->peek(qdisc);
if (skb)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index a4ab207cdc59..e7e1d0b57b3d 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -22,6 +22,7 @@
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
+#include <linux/reciprocal_div.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -66,7 +67,11 @@
*/
struct netem_sched_data {
+ /* internal t(ime)fifo qdisc uses sch->q and sch->limit */
+
+ /* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
+
struct qdisc_watchdog watchdog;
psched_tdiff_t latency;
@@ -79,6 +84,11 @@ struct netem_sched_data {
u32 duplicate;
u32 reorder;
u32 corrupt;
+ u32 rate;
+ s32 packet_overhead;
+ u32 cell_size;
+ u32 cell_size_reciprocal;
+ s32 cell_overhead;
struct crndstate {
u32 last;
@@ -111,7 +121,9 @@ struct netem_sched_data {
};
-/* Time stamp put into socket buffer control block */
+/* Time stamp put into socket buffer control block
+ * Only valid when skbs are in our internal t(ime)fifo queue.
+ */
struct netem_skb_cb {
psched_time_t time_to_send;
};
@@ -298,6 +310,51 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
+static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
+{
+ u64 ticks;
+
+ len += q->packet_overhead;
+
+ if (q->cell_size) {
+ u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
+
+ if (len > cells * q->cell_size) /* extra cell needed for remainder */
+ cells++;
+ len = cells * (q->cell_size + q->cell_overhead);
+ }
+
+ ticks = (u64)len * NSEC_PER_SEC;
+
+ do_div(ticks, q->rate);
+ return PSCHED_NS2TICKS(ticks);
+}
+
+static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+ struct sk_buff_head *list = &sch->q;
+ psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
+ struct sk_buff *skb;
+
+ if (likely(skb_queue_len(list) < sch->limit)) {
+ skb = skb_peek_tail(list);
+ /* Optimize for add at tail */
+ if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
+ return qdisc_enqueue_tail(nskb, sch);
+
+ skb_queue_reverse_walk(list, skb) {
+ if (tnext >= netem_skb_cb(skb)->time_to_send)
+ break;
+ }
+
+ __skb_queue_after(list, skb, nskb);
+ sch->qstats.backlog += qdisc_pkt_len(nskb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ return qdisc_reshape_fail(nskb, sch);
+}
+
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -371,9 +428,27 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
&q->delay_cor, q->delay_dist);
now = psched_get_time();
+
+ if (q->rate) {
+ struct sk_buff_head *list = &sch->q;
+
+ delay += packet_len_2_sched_time(skb->len, q);
+
+ if (!skb_queue_empty(list)) {
+ /*
+ * Last packet in queue is reference point (now).
+ * First packet in queue is already in flight,
+ * calculate this time bonus and substract
+ * from delay.
+ */
+ delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
+ now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
+ }
+ }
+
cb->time_to_send = now + delay;
++q->counter;
- ret = qdisc_enqueue(skb, q->qdisc);
+ ret = tfifo_enqueue(skb, sch);
} else {
/*
* Do re-ordering by putting one out of N packets at the front
@@ -382,9 +457,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cb->time_to_send = psched_get_time();
q->counter = 0;
- __skb_queue_head(&q->qdisc->q, skb);
- q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
- q->qdisc->qstats.requeues++;
+ __skb_queue_head(&sch->q, skb);
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ sch->qstats.requeues++;
ret = NET_XMIT_SUCCESS;
}
@@ -395,19 +470,20 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
}
- sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
static unsigned int netem_drop(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
- unsigned int len = 0;
+ unsigned int len;
- if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
- sch->q.qlen--;
+ len = qdisc_queue_drop(sch);
+ if (!len && q->qdisc && q->qdisc->ops->drop)
+ len = q->qdisc->ops->drop(q->qdisc);
+ if (len)
sch->qstats.drops++;
- }
+
return len;
}
@@ -419,16 +495,16 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
if (qdisc_is_throttled(sch))
return NULL;
- skb = q->qdisc->ops->peek(q->qdisc);
+tfifo_dequeue:
+ skb = qdisc_peek_head(sch);
if (skb) {
const struct netem_skb_cb *cb = netem_skb_cb(skb);
- psched_time_t now = psched_get_time();
/* if more time remaining? */
- if (cb->time_to_send <= now) {
- skb = qdisc_dequeue_peeked(q->qdisc);
+ if (cb->time_to_send <= psched_get_time()) {
+ skb = qdisc_dequeue_tail(sch);
if (unlikely(!skb))
- return NULL;
+ goto qdisc_dequeue;
#ifdef CONFIG_NET_CLS_ACT
/*
@@ -439,15 +515,37 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
skb->tstamp.tv64 = 0;
#endif
- sch->q.qlen--;
+ if (q->qdisc) {
+ int err = qdisc_enqueue(skb, q->qdisc);
+
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ sch->qstats.drops++;
+ qdisc_tree_decrease_qlen(sch, 1);
+ }
+ }
+ goto tfifo_dequeue;
+ }
+deliver:
qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
return skb;
}
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
}
+qdisc_dequeue:
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
return NULL;
}
@@ -455,8 +553,9 @@ static void netem_reset(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
- qdisc_reset(q->qdisc);
- sch->q.qlen = 0;
+ qdisc_reset_queue(sch);
+ if (q->qdisc)
+ qdisc_reset(q->qdisc);
qdisc_watchdog_cancel(&q->watchdog);
}
@@ -536,6 +635,19 @@ static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
init_crandom(&q->corrupt_cor, r->correlation);
}
+static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ const struct tc_netem_rate *r = nla_data(attr);
+
+ q->rate = r->rate;
+ q->packet_overhead = r->packet_overhead;
+ q->cell_size = r->cell_size;
+ if (q->cell_size)
+ q->cell_size_reciprocal = reciprocal_value(q->cell_size);
+ q->cell_overhead = r->cell_overhead;
+}
+
static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -549,7 +661,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
case NETEM_LOSS_GI: {
const struct tc_netem_gimodel *gi = nla_data(la);
- if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
+ if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
pr_info("netem: incorrect gi model size\n");
return -EINVAL;
}
@@ -568,8 +680,8 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
case NETEM_LOSS_GE: {
const struct tc_netem_gemodel *ge = nla_data(la);
- if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
- pr_info("netem: incorrect gi model size\n");
+ if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
+ pr_info("netem: incorrect ge model size\n");
return -EINVAL;
}
@@ -595,6 +707,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
+ [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
[TCA_NETEM_LOSS] = { .type = NLA_NESTED },
};
@@ -632,11 +745,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
if (ret < 0)
return ret;
- ret = fifo_set_limit(q->qdisc, qopt->limit);
- if (ret) {
- pr_info("netem: can't set fifo limit\n");
- return ret;
- }
+ sch->limit = qopt->limit;
q->latency = qopt->latency;
q->jitter = qopt->jitter;
@@ -667,6 +776,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_NETEM_CORRUPT])
get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
+ if (tb[TCA_NETEM_RATE])
+ get_rate(sch, tb[TCA_NETEM_RATE]);
+
q->loss_model = CLG_RANDOM;
if (tb[TCA_NETEM_LOSS])
ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
@@ -674,88 +786,6 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
return ret;
}
-/*
- * Special case version of FIFO queue for use by netem.
- * It queues in order based on timestamps in skb's
- */
-struct fifo_sched_data {
- u32 limit;
- psched_time_t oldest;
-};
-
-static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
-{
- struct fifo_sched_data *q = qdisc_priv(sch);
- struct sk_buff_head *list = &sch->q;
- psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
- struct sk_buff *skb;
-
- if (likely(skb_queue_len(list) < q->limit)) {
- /* Optimize for add at tail */
- if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
- q->oldest = tnext;
- return qdisc_enqueue_tail(nskb, sch);
- }
-
- skb_queue_reverse_walk(list, skb) {
- const struct netem_skb_cb *cb = netem_skb_cb(skb);
-
- if (tnext >= cb->time_to_send)
- break;
- }
-
- __skb_queue_after(list, skb, nskb);
-
- sch->qstats.backlog += qdisc_pkt_len(nskb);
-
- return NET_XMIT_SUCCESS;
- }
-
- return qdisc_reshape_fail(nskb, sch);
-}
-
-static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
-{
- struct fifo_sched_data *q = qdisc_priv(sch);
-
- if (opt) {
- struct tc_fifo_qopt *ctl = nla_data(opt);
- if (nla_len(opt) < sizeof(*ctl))
- return -EINVAL;
-
- q->limit = ctl->limit;
- } else
- q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
-
- q->oldest = PSCHED_PASTPERFECT;
- return 0;
-}
-
-static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
- struct fifo_sched_data *q = qdisc_priv(sch);
- struct tc_fifo_qopt opt = { .limit = q->limit };
-
- NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
- return skb->len;
-
-nla_put_failure:
- return -1;
-}
-
-static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
- .id = "tfifo",
- .priv_size = sizeof(struct fifo_sched_data),
- .enqueue = tfifo_enqueue,
- .dequeue = qdisc_dequeue_head,
- .peek = qdisc_peek_head,
- .drop = qdisc_queue_drop,
- .init = tfifo_init,
- .reset = qdisc_reset_queue,
- .change = tfifo_init,
- .dump = tfifo_dump,
-};
-
static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -767,18 +797,9 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
q->loss_model = CLG_RANDOM;
- q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
- TC_H_MAKE(sch->handle, 1));
- if (!q->qdisc) {
- pr_notice("netem: qdisc create tfifo qdisc failed\n");
- return -ENOMEM;
- }
-
ret = netem_change(sch, opt);
- if (ret) {
+ if (ret)
pr_info("netem: change failed\n");
- qdisc_destroy(q->qdisc);
- }
return ret;
}
@@ -787,7 +808,8 @@ static void netem_destroy(struct Qdisc *sch)
struct netem_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
- qdisc_destroy(q->qdisc);
+ if (q->qdisc)
+ qdisc_destroy(q->qdisc);
dist_free(q->delay_dist);
}
@@ -847,6 +869,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
struct tc_netem_corrupt corrupt;
+ struct tc_netem_rate rate;
qopt.latency = q->latency;
qopt.jitter = q->jitter;
@@ -869,6 +892,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
corrupt.correlation = q->corrupt_cor.rho;
NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+ rate.rate = q->rate;
+ rate.packet_overhead = q->packet_overhead;
+ rate.cell_size = q->cell_size;
+ rate.cell_overhead = q->cell_overhead;
+ NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
+
if (dump_loss_model(q, skb) != 0)
goto nla_put_failure;
@@ -884,7 +913,7 @@ static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
{
struct netem_sched_data *q = qdisc_priv(sch);
- if (cl != 1) /* only one class */
+ if (cl != 1 || !q->qdisc) /* only one class */
return -ENOENT;
tcm->tcm_handle |= TC_H_MIN(1);
@@ -898,14 +927,13 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
{
struct netem_sched_data *q = qdisc_priv(sch);
- if (new == NULL)
- new = &noop_qdisc;
-
sch_tree_lock(sch);
*old = q->qdisc;
q->qdisc = new;
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
+ if (*old) {
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ }
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 7b0325459e71..e68cb440756a 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -211,6 +211,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
struct nlattr *tb[TCA_QFQ_MAX + 1];
u32 weight, lmax, inv_w;
int i, err;
+ int delta_w;
if (tca[TCA_OPTIONS] == NULL) {
pr_notice("qfq: no options\n");
@@ -232,9 +233,10 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
inv_w = ONE_FP / weight;
weight = ONE_FP / inv_w;
- if (q->wsum + weight > QFQ_MAX_WSUM) {
+ delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0);
+ if (q->wsum + delta_w > QFQ_MAX_WSUM) {
pr_notice("qfq: total weight out of range (%u + %u)\n",
- weight, q->wsum);
+ delta_w, q->wsum);
return -EINVAL;
}
@@ -256,13 +258,12 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
return err;
}
- sch_tree_lock(sch);
- if (tb[TCA_QFQ_WEIGHT]) {
- q->wsum = weight - ONE_FP / cl->inv_w;
+ if (inv_w != cl->inv_w) {
+ sch_tree_lock(sch);
+ q->wsum += delta_w;
cl->inv_w = inv_w;
+ sch_tree_unlock(sch);
}
- sch_tree_unlock(sch);
-
return 0;
}
@@ -277,7 +278,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
i = qfq_calc_index(cl->inv_w, cl->lmax);
cl->grp = &q->groups[i];
- q->wsum += weight;
cl->qdisc = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, classid);
@@ -294,6 +294,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
return err;
}
}
+ q->wsum += weight;
sch_tree_lock(sch);
qdisc_class_hash_insert(&q->clhash, &cl->common);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index d617161f8dd3..a5cc3012cf42 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -39,7 +39,9 @@
struct red_sched_data {
u32 limit; /* HARD maximal queue length */
unsigned char flags;
+ struct timer_list adapt_timer;
struct red_parms parms;
+ struct red_vars vars;
struct red_stats stats;
struct Qdisc *qdisc;
};
@@ -60,12 +62,14 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
struct Qdisc *child = q->qdisc;
int ret;
- q->parms.qavg = red_calc_qavg(&q->parms, child->qstats.backlog);
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ child->qstats.backlog);
- if (red_is_idling(&q->parms))
- red_end_of_idle_period(&q->parms);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
- switch (red_action(&q->parms, q->parms.qavg)) {
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg)) {
case RED_DONT_MARK:
break;
@@ -116,8 +120,8 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
} else {
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
}
return skb;
}
@@ -143,8 +147,8 @@ static unsigned int red_drop(struct Qdisc *sch)
return len;
}
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
return 0;
}
@@ -155,18 +159,21 @@ static void red_reset(struct Qdisc *sch)
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
- red_restart(&q->parms);
+ red_restart(&q->vars);
}
static void red_destroy(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
+
+ del_timer_sync(&q->adapt_timer);
qdisc_destroy(q->qdisc);
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
[TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_RED_STAB] = { .len = RED_STAB_SIZE },
+ [TCA_RED_MAX_P] = { .type = NLA_U32 },
};
static int red_change(struct Qdisc *sch, struct nlattr *opt)
@@ -176,6 +183,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
struct tc_red_qopt *ctl;
struct Qdisc *child = NULL;
int err;
+ u32 max_P;
if (opt == NULL)
return -EINVAL;
@@ -188,6 +196,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
tb[TCA_RED_STAB] == NULL)
return -EINVAL;
+ max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
+
ctl = nla_data(tb[TCA_RED_PARMS]);
if (ctl->limit > 0) {
@@ -205,22 +215,42 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
q->qdisc = child;
}
- red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
- ctl->Plog, ctl->Scell_log,
- nla_data(tb[TCA_RED_STAB]));
+ red_set_parms(&q->parms,
+ ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_RED_STAB]),
+ max_P);
+ red_set_vars(&q->vars);
+
+ del_timer(&q->adapt_timer);
+ if (ctl->flags & TC_RED_ADAPTATIVE)
+ mod_timer(&q->adapt_timer, jiffies + HZ/2);
if (!q->qdisc->q.qlen)
- red_start_of_idle_period(&q->parms);
+ red_start_of_idle_period(&q->vars);
sch_tree_unlock(sch);
return 0;
}
+static inline void red_adaptative_timer(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc *)arg;
+ struct red_sched_data *q = qdisc_priv(sch);
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+ spin_lock(root_lock);
+ red_adaptative_algo(&q->parms, &q->vars);
+ mod_timer(&q->adapt_timer, jiffies + HZ/2);
+ spin_unlock(root_lock);
+}
+
static int red_init(struct Qdisc *sch, struct nlattr *opt)
{
struct red_sched_data *q = qdisc_priv(sch);
q->qdisc = &noop_qdisc;
+ setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch);
return red_change(sch, opt);
}
@@ -243,6 +273,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
if (opts == NULL)
goto nla_put_failure;
NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+ NLA_PUT_U32(skb, TCA_RED_MAX_P, q->parms.max_P);
return nla_nest_end(skb, opts);
nla_put_failure:
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index e83c272c0325..96e42cae4c7a 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -26,6 +26,7 @@
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
+#include <net/flow_keys.h>
/*
* SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
@@ -286,6 +287,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
u32 minqlen = ~0;
u32 r, slot, salt, sfbhash;
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ struct flow_keys keys;
if (unlikely(sch->q.qlen >= q->limit)) {
sch->qstats.overlimits++;
@@ -309,13 +311,19 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
/* If using external classifiers, get result and record it. */
if (!sfb_classify(skb, q, &ret, &salt))
goto other_drop;
+ keys.src = salt;
+ keys.dst = 0;
+ keys.ports = 0;
} else {
- salt = skb_get_rxhash(skb);
+ skb_flow_dissect(skb, &keys);
}
slot = q->slot;
- sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ sfbhash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports,
+ q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
@@ -347,7 +355,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (unlikely(p_min >= SFB_MAX_PROB)) {
/* Inelastic flow */
if (q->double_buffering) {
- sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ sfbhash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports,
+ q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 4f5510e2bd6f..0a7964009e8c 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -17,14 +17,13 @@
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
-#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/flow_keys.h>
/* Stochastic Fairness Queuing algorithm.
@@ -67,16 +66,18 @@
SFQ is superior for this purpose.
IMPLEMENTATION:
- This implementation limits maximal queue length to 128;
- max mtu to 2^18-1; max 128 flows, number of hash buckets to 1024.
- The only goal of this restrictions was that all data
- fit into one 4K page on 32bit arches.
+ This implementation limits :
+ - maximal queue length per flow to 127 packets.
+ - max mtu to 2^18-1;
+ - max 65408 flows,
+ - number of hash buckets to 65536.
It is easy to increase these values, but not in flight. */
-#define SFQ_DEPTH 128 /* max number of packets per flow */
-#define SFQ_SLOTS 128 /* max number of flows */
-#define SFQ_EMPTY_SLOT 255
+#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
+#define SFQ_DEFAULT_FLOWS 128
+#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
+#define SFQ_EMPTY_SLOT 0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024
/* We use 16 bits to store allot, and want to handle packets up to 64K
@@ -85,13 +86,13 @@
#define SFQ_ALLOT_SHIFT 3
#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
-/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */
-typedef unsigned char sfq_index;
+/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
+typedef u16 sfq_index;
/*
* We dont use pointers to save space.
- * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array
- * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
+ * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
+ * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
* are 'pointers' to dep[] array
*/
struct sfq_head {
@@ -103,28 +104,38 @@ struct sfq_slot {
struct sk_buff *skblist_next;
struct sk_buff *skblist_prev;
sfq_index qlen; /* number of skbs in skblist */
- sfq_index next; /* next slot in sfq chain */
+ sfq_index next; /* next slot in sfq RR chain */
struct sfq_head dep; /* anchor in dep[] chains */
unsigned short hash; /* hash value (index in ht[]) */
short allot; /* credit for this slot */
};
struct sfq_sched_data {
-/* Parameters */
- int perturb_period;
- unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
- int limit;
+/* frequently used fields */
+ int limit; /* limit of total number of packets in this qdisc */
unsigned int divisor; /* number of slots in hash table */
-/* Variables */
- struct tcf_proto *filter_list;
- struct timer_list perturb_timer;
+ unsigned int maxflows; /* number of flows in flows array */
+ int headdrop;
+ int maxdepth; /* limit of packets per flow */
+
u32 perturbation;
+ struct tcf_proto *filter_list;
sfq_index cur_depth; /* depth of longest slot */
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
struct sfq_slot *tail; /* current slot in round */
- sfq_index *ht; /* Hash table (divisor slots) */
- struct sfq_slot slots[SFQ_SLOTS];
- struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */
+ sfq_index *ht; /* Hash table ('divisor' slots) */
+ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
+
+ struct sfq_head dep[SFQ_MAX_DEPTH + 1];
+ /* Linked lists of slots, indexed by depth
+ * dep[0] : list of unused flows
+ * dep[1] : list of flows with 1 packet
+ * dep[X] : list of flows with X packets
+ */
+
+ int perturb_period;
+ unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
+ struct timer_list perturb_timer;
};
/*
@@ -132,66 +143,36 @@ struct sfq_sched_data {
*/
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
- if (val < SFQ_SLOTS)
+ if (val < SFQ_MAX_FLOWS)
return &q->slots[val].dep;
- return &q->dep[val - SFQ_SLOTS];
+ return &q->dep[val - SFQ_MAX_FLOWS];
}
-static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
+/*
+ * In order to be able to quickly rehash our queue when timer changes
+ * q->perturbation, we store flow_keys in skb->cb[]
+ */
+struct sfq_skb_cb {
+ struct flow_keys keys;
+};
+
+static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
- return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
+ BUILD_BUG_ON(sizeof(skb->cb) <
+ sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb));
+ return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}
-static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
+static unsigned int sfq_hash(const struct sfq_sched_data *q,
+ const struct sk_buff *skb)
{
- u32 h, h2;
-
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- {
- const struct iphdr *iph;
- int poff;
-
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- goto err;
- iph = ip_hdr(skb);
- h = (__force u32)iph->daddr;
- h2 = (__force u32)iph->saddr ^ iph->protocol;
- if (ip_is_fragment(iph))
- break;
- poff = proto_ports_offset(iph->protocol);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
- iph = ip_hdr(skb);
- h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
- }
- break;
- }
- case htons(ETH_P_IPV6):
- {
- const struct ipv6hdr *iph;
- int poff;
-
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- goto err;
- iph = ipv6_hdr(skb);
- h = (__force u32)iph->daddr.s6_addr32[3];
- h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
- poff = proto_ports_offset(iph->nexthdr);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
- iph = ipv6_hdr(skb);
- h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
- }
- break;
- }
- default:
-err:
- h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
- h2 = (unsigned long)skb->sk;
- }
+ const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
+ unsigned int hash;
- return sfq_fold_hash(q, h, h2);
+ hash = jhash_3words((__force u32)keys->dst,
+ (__force u32)keys->src ^ keys->ip_proto,
+ (__force u32)keys->ports, q->perturbation);
+ return hash & (q->divisor - 1);
}
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -206,8 +187,10 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
TC_H_MIN(skb->priority) <= q->divisor)
return TC_H_MIN(skb->priority);
- if (!q->filter_list)
+ if (!q->filter_list) {
+ skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
return sfq_hash(q, skb) + 1;
+ }
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
result = tc_classify(skb, q->filter_list, &res);
@@ -228,18 +211,19 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
}
/*
- * x : slot number [0 .. SFQ_SLOTS - 1]
+ * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
*/
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
- int qlen = q->slots[x].qlen;
+ struct sfq_slot *slot = &q->slots[x];
+ int qlen = slot->qlen;
- p = qlen + SFQ_SLOTS;
+ p = qlen + SFQ_MAX_FLOWS;
n = q->dep[qlen].next;
- q->slots[x].dep.next = n;
- q->slots[x].dep.prev = p;
+ slot->dep.next = n;
+ slot->dep.prev = p;
q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
sfq_dep_head(q, n)->prev = x;
@@ -304,6 +288,7 @@ static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
static inline void slot_queue_init(struct sfq_slot *slot)
{
+ memset(slot, 0, sizeof(*slot));
slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}
@@ -334,7 +319,7 @@ static unsigned int sfq_drop(struct Qdisc *sch)
x = q->dep[d].next;
slot = &q->slots[x];
drop:
- skb = slot_dequeue_tail(slot);
+ skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
len = qdisc_pkt_len(skb);
sfq_dec(q, x);
kfree_skb(skb);
@@ -378,16 +363,27 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
slot = &q->slots[x];
if (x == SFQ_EMPTY_SLOT) {
x = q->dep[0].next; /* get a free slot */
+ if (x >= SFQ_MAX_FLOWS)
+ return qdisc_drop(skb, sch);
q->ht[hash] = x;
slot = &q->slots[x];
slot->hash = hash;
}
- /* If selected queue has length q->limit, do simple tail drop,
- * i.e. drop _this_ packet.
- */
- if (slot->qlen >= q->limit)
- return qdisc_drop(skb, sch);
+ if (slot->qlen >= q->maxdepth) {
+ struct sk_buff *head;
+
+ if (!q->headdrop)
+ return qdisc_drop(skb, sch);
+
+ head = slot_dequeue_head(slot);
+ sch->qstats.backlog -= qdisc_pkt_len(head);
+ qdisc_drop(head, sch);
+
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ slot_queue_add(slot, skb);
+ return NET_XMIT_CN;
+ }
sch->qstats.backlog += qdisc_pkt_len(skb);
slot_queue_add(slot, skb);
@@ -395,11 +391,11 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (slot->qlen == 1) { /* The flow is new */
if (q->tail == NULL) { /* It is the first flow */
slot->next = x;
+ q->tail = slot;
} else {
slot->next = q->tail->next;
q->tail->next = x;
}
- q->tail = slot;
slot->allot = q->scaled_quantum;
}
if (++sch->q.qlen <= q->limit)
@@ -468,12 +464,83 @@ sfq_reset(struct Qdisc *sch)
kfree_skb(skb);
}
+/*
+ * When q->perturbation is changed, we rehash all queued skbs
+ * to avoid OOO (Out Of Order) effects.
+ * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
+ * counters.
+ */
+static void sfq_rehash(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ int i;
+ struct sfq_slot *slot;
+ struct sk_buff_head list;
+ int dropped = 0;
+
+ __skb_queue_head_init(&list);
+
+ for (i = 0; i < q->maxflows; i++) {
+ slot = &q->slots[i];
+ if (!slot->qlen)
+ continue;
+ while (slot->qlen) {
+ skb = slot_dequeue_head(slot);
+ sfq_dec(q, i);
+ __skb_queue_tail(&list, skb);
+ }
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ }
+ q->tail = NULL;
+
+ while ((skb = __skb_dequeue(&list)) != NULL) {
+ unsigned int hash = sfq_hash(q, skb);
+ sfq_index x = q->ht[hash];
+
+ slot = &q->slots[x];
+ if (x == SFQ_EMPTY_SLOT) {
+ x = q->dep[0].next; /* get a free slot */
+ if (x >= SFQ_MAX_FLOWS) {
+drop: sch->qstats.backlog -= qdisc_pkt_len(skb);
+ kfree_skb(skb);
+ dropped++;
+ continue;
+ }
+ q->ht[hash] = x;
+ slot = &q->slots[x];
+ slot->hash = hash;
+ }
+ if (slot->qlen >= q->maxdepth)
+ goto drop;
+ slot_queue_add(slot, skb);
+ sfq_inc(q, x);
+ if (slot->qlen == 1) { /* The flow is new */
+ if (q->tail == NULL) { /* It is the first flow */
+ slot->next = x;
+ } else {
+ slot->next = q->tail->next;
+ q->tail->next = x;
+ }
+ q->tail = slot;
+ slot->allot = q->scaled_quantum;
+ }
+ }
+ sch->q.qlen -= dropped;
+ qdisc_tree_decrease_qlen(sch, dropped);
+}
+
static void sfq_perturbation(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc *)arg;
struct sfq_sched_data *q = qdisc_priv(sch);
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spin_lock(root_lock);
q->perturbation = net_random();
+ if (!q->filter_list && q->tail)
+ sfq_rehash(sch);
+ spin_unlock(root_lock);
if (q->perturb_period)
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
@@ -483,23 +550,39 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct tc_sfq_qopt *ctl = nla_data(opt);
+ struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
unsigned int qlen;
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
-
+ if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
+ ctl_v1 = nla_data(opt);
if (ctl->divisor &&
(!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
return -EINVAL;
sch_tree_lock(sch);
- q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
- q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ if (ctl->quantum) {
+ q->quantum = ctl->quantum;
+ q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ }
q->perturb_period = ctl->perturb_period * HZ;
- if (ctl->limit)
- q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
- if (ctl->divisor)
+ if (ctl->flows)
+ q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
+ if (ctl->divisor) {
q->divisor = ctl->divisor;
+ q->maxflows = min_t(u32, q->maxflows, q->divisor);
+ }
+ if (ctl_v1) {
+ if (ctl_v1->depth)
+ q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ q->headdrop = ctl_v1->headdrop;
+ }
+ if (ctl->limit) {
+ q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
+ q->maxflows = min_t(u32, q->maxflows, q->limit);
+ }
+
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit)
sfq_drop(sch);
@@ -514,46 +597,77 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
return 0;
}
+static void *sfq_alloc(size_t sz)
+{
+ void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+
+ if (!ptr)
+ ptr = vmalloc(sz);
+ return ptr;
+}
+
+static void sfq_free(void *addr)
+{
+ if (addr) {
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+ }
+}
+
+static void sfq_destroy(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ q->perturb_period = 0;
+ del_timer_sync(&q->perturb_timer);
+ sfq_free(q->ht);
+ sfq_free(q->slots);
+}
+
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
- size_t sz;
int i;
q->perturb_timer.function = sfq_perturbation;
q->perturb_timer.data = (unsigned long)sch;
init_timer_deferrable(&q->perturb_timer);
- for (i = 0; i < SFQ_DEPTH; i++) {
- q->dep[i].next = i + SFQ_SLOTS;
- q->dep[i].prev = i + SFQ_SLOTS;
+ for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
+ q->dep[i].next = i + SFQ_MAX_FLOWS;
+ q->dep[i].prev = i + SFQ_MAX_FLOWS;
}
- q->limit = SFQ_DEPTH - 1;
+ q->limit = SFQ_MAX_DEPTH;
+ q->maxdepth = SFQ_MAX_DEPTH;
q->cur_depth = 0;
q->tail = NULL;
q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
- if (opt == NULL) {
- q->quantum = psched_mtu(qdisc_dev(sch));
- q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
- q->perturb_period = 0;
- q->perturbation = net_random();
- } else {
+ q->maxflows = SFQ_DEFAULT_FLOWS;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ q->perturb_period = 0;
+ q->perturbation = net_random();
+
+ if (opt) {
int err = sfq_change(sch, opt);
if (err)
return err;
}
- sz = sizeof(q->ht[0]) * q->divisor;
- q->ht = kmalloc(sz, GFP_KERNEL);
- if (!q->ht && sz > PAGE_SIZE)
- q->ht = vmalloc(sz);
- if (!q->ht)
+ q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
+ q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
+ if (!q->ht || !q->slots) {
+ sfq_destroy(sch);
return -ENOMEM;
+ }
for (i = 0; i < q->divisor; i++)
q->ht[i] = SFQ_EMPTY_SLOT;
- for (i = 0; i < SFQ_SLOTS; i++) {
+ for (i = 0; i < q->maxflows; i++) {
slot_queue_init(&q->slots[i]);
sfq_link(q, i);
}
@@ -564,31 +678,20 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
return 0;
}
-static void sfq_destroy(struct Qdisc *sch)
-{
- struct sfq_sched_data *q = qdisc_priv(sch);
-
- tcf_destroy_chain(&q->filter_list);
- q->perturb_period = 0;
- del_timer_sync(&q->perturb_timer);
- if (is_vmalloc_addr(q->ht))
- vfree(q->ht);
- else
- kfree(q->ht);
-}
-
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
- struct tc_sfq_qopt opt;
-
- opt.quantum = q->quantum;
- opt.perturb_period = q->perturb_period / HZ;
-
- opt.limit = q->limit;
- opt.divisor = q->divisor;
- opt.flows = q->limit;
+ struct tc_sfq_qopt_v1 opt;
+
+ memset(&opt, 0, sizeof(opt));
+ opt.v0.quantum = q->quantum;
+ opt.v0.perturb_period = q->perturb_period / HZ;
+ opt.v0.limit = q->limit;
+ opt.v0.divisor = q->divisor;
+ opt.v0.flows = q->maxflows;
+ opt.depth = q->maxdepth;
+ opt.headdrop = q->headdrop;
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 1dcfb5223a86..b8e156319d7b 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -346,6 +346,7 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *nest;
struct tc_tbf_qopt opt;
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 4f4c52c0eeb3..45326599fda3 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -277,7 +277,7 @@ static inline int teql_resolve(struct sk_buff *skb,
return 0;
rcu_read_lock();
- mn = dst_get_neighbour(dst);
+ mn = dst_get_neighbour_noref(dst);
res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0;
rcu_read_unlock();
@@ -310,7 +310,7 @@ restart:
if (slave_txq->qdisc_sleeping != q)
continue;
- if (__netif_subqueue_stopped(slave, subq) ||
+ if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
!netif_running(slave)) {
busy = 1;
continue;
@@ -321,7 +321,7 @@ restart:
if (__netif_tx_trylock(slave_txq)) {
unsigned int length = qdisc_pkt_len(skb);
- if (!netif_tx_queue_frozen_or_stopped(slave_txq) &&
+ if (!netif_xmit_frozen_or_stopped(slave_txq) &&
slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) {
txq_trans_update(slave_txq);
__netif_tx_unlock(slave_txq);
@@ -333,7 +333,7 @@ restart:
}
__netif_tx_unlock(slave_txq);
}
- if (netif_queue_stopped(dev))
+ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
busy = 1;
break;
case 1: