author    Linus Torvalds <torvalds@linux-foundation.org>  2020-06-03 16:27:18 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2020-06-03 16:27:18 -0700
commit    cb8e59cc87201af93dfbb6c3dccc8fcad72a09c2
tree      a334db9022f89654b777bbce8c4c6632e65b9031  /net/sched/sch_fq.c
parent    2e63f6ce7ed2c4ff83ba30ad9ccad422289a6c63
parent    065fcfd49763ec71ae345bb5c5a74f961031e70e
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:

 1) Allow setting bluetooth L2CAP modes via socket option, from Luiz Augusto von Dentz.
 2) Add GSO partial support to igc, from Sasha Neftin.
 3) Several cleanups and improvements to r8169 from Heiner Kallweit.
 4) Add IF_OPER_TESTING link state and use it when ethtool triggers a device self-test. From Andrew Lunn.
 5) Start moving away from custom driver versions, use the globally defined kernel version instead, from Leon Romanovsky.
 6) Support GRO via gro_cells in DSA layer, from Alexander Lobakin.
 7) Allow hard IRQ deferral during NAPI, from Eric Dumazet.
 8) Add sriov and vf support to hinic, from Luo bin.
 9) Support Media Redundancy Protocol (MRP) in the bridging code, from Horatiu Vultur.
10) Support netmap in the nft_nat code, from Pablo Neira Ayuso.
11) Allow UDPv6 encapsulation of ESP in the ipsec code, from Sabrina Dubroca. Also add ipv6 support for espintcp.
12) Lots of ReST conversions of the networking documentation, from Mauro Carvalho Chehab.
13) Support configuration of ethtool rxnfc flows in bcmgenet driver, from Doug Berger.
14) Allow dumping cgroup id and filtering by it in inet_diag code, from Dmitry Yakunin.
15) Add infrastructure to export netlink attribute policies to userspace, from Johannes Berg.
16) Several optimizations to sch_fq scheduler, from Eric Dumazet.
17) Fall back to the default qdisc if qdisc init fails, because otherwise a packet scheduler init failure will make a device inoperative. From Jesper Dangaard Brouer.
18) Several RISCV bpf jit optimizations, from Luke Nelson.
19) Correct the return type of the ->ndo_start_xmit() method in several drivers; it's netdev_tx_t but many drivers were using 'int'. From Yunjian Wang.
20) Add an ethtool interface for PHY master/slave config, from Oleksij Rempel.
21) Add BPF iterators, from Yonghong Song.
22) Add cable test infrastructure, including ethtool interfaces, from Andrew Lunn. Marvell PHY driver is the first to support this facility.
23) Remove zero-length arrays all over, from Gustavo A. R. Silva.
24) Calculate and maintain an explicit frame size in XDP, from Jesper Dangaard Brouer.
25) Add CAP_BPF, from Alexei Starovoitov.
26) Support terse dumps in the packet scheduler, from Vlad Buslov.
27) Support XDP_TX bulking in dpaa2 driver, from Ioana Ciornei.
28) Add devm_register_netdev(), from Bartosz Golaszewski.
29) Minimize qdisc resets, from Cong Wang.
30) Get rid of kernel_getsockopt and kernel_setsockopt in order to eliminate set_fs/get_fs calls. From Christoph Hellwig.
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2517 commits)
  selftests: net: ip_defrag: ignore EPERM
  net_failover: fixed rollback in net_failover_open()
  Revert "tipc: Fix potential tipc_aead refcnt leak in tipc_crypto_rcv"
  Revert "tipc: Fix potential tipc_node refcnt leak in tipc_rcv"
  vmxnet3: allow rx flow hash ops only when rss is enabled
  hinic: add set_channels ethtool_ops support
  selftests/bpf: Add a default $(CXX) value
  tools/bpf: Don't use $(COMPILE.c)
  bpf, selftests: Use bpf_probe_read_kernel
  s390/bpf: Use bcr 0,%0 as tail call nop filler
  s390/bpf: Maintain 8-byte stack alignment
  selftests/bpf: Fix verifier test
  selftests/bpf: Fix sample_cnt shared between two threads
  bpf, selftests: Adapt cls_redirect to call csum_level helper
  bpf: Add csum_level helper for fixing up csum levels
  bpf: Fix up bpf_skb_adjust_room helper's skb csum setting
  sfc: add missing annotation for efx_ef10_try_update_nic_stats_vf()
  crypto/chtls: IPv6 support for inline TLS
  Crypto/chcr: Fixes a coccinile check error
  Crypto/chcr: Fixes compilations warnings
  ...
Diffstat (limited to 'net/sched/sch_fq.c')
-rw-r--r--  net/sched/sch_fq.c | 143
1 file changed, 102 insertions(+), 41 deletions(-)
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4c060134c736..8f06a808c59a 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -66,22 +66,27 @@ static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
* in linear list (head,tail), otherwise are placed in a rbtree (t_root).
*/
struct fq_flow {
+/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
struct rb_root t_root;
struct sk_buff *head; /* list of skbs for this flow : first skb */
union {
struct sk_buff *tail; /* last skb in the list */
- unsigned long age; /* jiffies when flow was emptied, for gc */
+ unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */
};
struct rb_node fq_node; /* anchor in fq_root[] trees */
struct sock *sk;
+ u32 socket_hash; /* sk_hash */
int qlen; /* number of packets in flow queue */
+
+/* Second cache line, used in fq_dequeue() */
int credit;
- u32 socket_hash; /* sk_hash */
- struct fq_flow *next; /* next pointer in RR lists, or &detached */
+ /* 32bit hole on 64bit arches */
+
+ struct fq_flow *next; /* next pointer in RR lists */
struct rb_node rate_node; /* anchor in q->delayed tree */
u64 time_next_packet;
-};
+} ____cacheline_aligned_in_smp;
struct fq_flow_head {
struct fq_flow *first;
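The reshuffle in the hunk above groups fields by access pattern: the fields touched by fq_gc()/fq_enqueue()/fq_dequeue() fill the first cache line, the dequeue-only bookkeeping sits on the second, and ____cacheline_aligned_in_smp keeps adjacent flows from sharing a line. As a standalone sketch (not part of the patch), the same idea in C11, using alignas() and an assumed 64-byte line size:

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

#define CACHELINE 64	/* assumption; the kernel knows this per-arch */

struct flow_sketch {
	/* first cache line: fields touched on every enqueue/dequeue */
	void *head;
	void *tail;
	int qlen;

	/* second cache line: slow-path bookkeeping; alignas() starts a
	 * fresh line, so hot-path writes never dirty these fields
	 */
	alignas(CACHELINE) int credit;
	unsigned long long time_next_packet;
};

int main(void)
{
	printf("credit at offset %zu, struct is %zu bytes\n",
	       offsetof(struct flow_sketch, credit),
	       sizeof(struct flow_sketch));	/* 64 and 128 with 64B lines */
	return 0;
}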
@@ -95,6 +100,7 @@ struct fq_sched_data {
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
+ u64 ktime_cache; /* copy of last ktime_get_ns() */
unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
@@ -104,12 +110,13 @@ struct fq_sched_data {
u32 flow_plimit; /* max packets per flow */
unsigned long flow_max_rate; /* optional max rate per flow */
u64 ce_threshold;
+ u64 horizon; /* horizon in ns */
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
-
+ u8 horizon_drop;
u32 flows;
u32 inactive_flows;
u32 throttled_flows;
@@ -118,6 +125,8 @@ struct fq_sched_data {
u64 stat_internal_packets;
u64 stat_throttled;
u64 stat_ce_mark;
+ u64 stat_horizon_drops;
+ u64 stat_horizon_caps;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
@@ -126,20 +135,25 @@ struct fq_sched_data {
struct qdisc_watchdog watchdog;
};
-/* special value to mark a detached flow (not on old/new list) */
-static struct fq_flow detached, throttled;
-
+/*
+ * f->tail and f->age share the same location.
+ * We can use the low order bit to differentiate if this location points
+ * to a sk_buff or contains a jiffies value, if we force this value to be odd.
+ * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
+ */
static void fq_flow_set_detached(struct fq_flow *f)
{
- f->next = &detached;
- f->age = jiffies;
+ f->age = jiffies | 1UL;
}
static bool fq_flow_is_detached(const struct fq_flow *f)
{
- return f->next == &detached;
+ return !!(f->age & 1UL);
}
+/* special value to mark a throttled flow (not on old/new list) */
+static struct fq_flow throttled;
+
static bool fq_flow_is_throttled(const struct fq_flow *f)
{
return f->next == &throttled;
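With this change the static 'detached' sentinel disappears entirely: struct sk_buff is at least 2-byte aligned, so f->tail can never be odd, and an odd value in the shared tail/age slot therefore unambiguously means "detached, value is jiffies". A standalone sketch of the tagged-union trick (all names here are illustrative, not from the patch):

#include <assert.h>
#include <stdio.h>

struct item { int payload; };	/* alignof(struct item) >= 2 */

union slot {
	struct item   *tail;	/* even: pointer to the last item */
	unsigned long  age;	/* odd:  timestamp | 1 */
};

static void slot_detach(union slot *s, unsigned long now)
{
	s->age = now | 1UL;	/* force the low bit so the value can
				 * never be mistaken for a pointer */
}

static int slot_is_detached(const union slot *s)
{
	return s->age & 1UL;
}

int main(void)
{
	struct item it = { 42 };
	union slot s = { .tail = &it };

	assert(!slot_is_detached(&s));	/* pointer: low bit is 0 */
	slot_detach(&s, 1000);		/* pretend 1000 is jiffies */
	assert(slot_is_detached(&s));
	printf("age (odd): %lu\n", s.age);
	return 0;
}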
@@ -204,9 +218,10 @@ static void fq_gc(struct fq_sched_data *q,
struct rb_root *root,
struct sock *sk)
{
- struct fq_flow *f, *tofree[FQ_GC_MAX];
struct rb_node **p, *parent;
- int fcnt = 0;
+ void *tofree[FQ_GC_MAX];
+ struct fq_flow *f;
+ int i, fcnt = 0;
p = &root->rb_node;
parent = NULL;
@@ -229,15 +244,18 @@ static void fq_gc(struct fq_sched_data *q,
p = &parent->rb_left;
}
+ if (!fcnt)
+ return;
+
+ for (i = fcnt; i > 0; ) {
+ f = tofree[--i];
+ rb_erase(&f->fq_node, root);
+ }
q->flows -= fcnt;
q->inactive_flows -= fcnt;
q->stat_gc_flows += fcnt;
- while (fcnt) {
- struct fq_flow *f = tofree[--fcnt];
- rb_erase(&f->fq_node, root);
- kmem_cache_free(fq_flow_cachep, f);
- }
+ kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
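Note the reordering in fq_gc(): rb_erase() now waits until the rbtree walk has finished (erasing rebalances the tree under the iterator), and the frees are batched into a single kmem_cache_free_bulk() call instead of up to FQ_GC_MAX individual kmem_cache_free() calls. A userspace approximation of the collect-then-bulk-release pattern, with free_bulk() standing in for the kernel API and a singly linked list standing in for the rbtree:

#include <stdio.h>
#include <stdlib.h>

#define GC_MAX 8

struct flow { int dead; struct flow *next; };

/* stand-in for kmem_cache_free_bulk(): one call frees the whole batch */
static void free_bulk(size_t cnt, void **objs)
{
	for (size_t i = 0; i < cnt; i++)
		free(objs[i]);
}

/* collect dead flows during the walk, free them afterwards in one batch */
static void gc(struct flow **head)
{
	void *tofree[GC_MAX];
	size_t fcnt = 0;

	for (struct flow **pp = head; *pp && fcnt < GC_MAX; ) {
		struct flow *f = *pp;

		if (f->dead) {
			*pp = f->next;		/* unlink (the rb_erase step) */
			tofree[fcnt++] = f;	/* defer the free */
		} else {
			pp = &(*pp)->next;
		}
	}
	if (!fcnt)
		return;
	free_bulk(fcnt, tofree);
}

int main(void)
{
	struct flow *head = NULL;

	for (int i = 0; i < 4; i++) {
		struct flow *f = malloc(sizeof(*f));

		f->dead = i & 1;	/* mark every other flow dead */
		f->next = head;
		head = f;
	}
	gc(&head);
	for (struct flow *f = head; f; f = f->next)
		printf("live flow %p\n", (void *)f);
	return 0;
}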
@@ -370,19 +388,17 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
}
}
-/* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+/* Remove one skb from flow queue.
+ * This skb must be the return value of prior fq_peek().
+ */
+static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
+ struct sk_buff *skb)
{
- struct sk_buff *skb = fq_peek(flow);
-
- if (skb) {
- fq_erase_head(sch, flow, skb);
- skb_mark_not_on_list(skb);
- flow->qlen--;
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
- }
- return skb;
+ fq_erase_head(sch, flow, skb);
+ skb_mark_not_on_list(skb);
+ flow->qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
}
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
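fq_dequeue_head() is replaced by this split: callers now fq_peek() first, inspect the skb (pacing time, CE threshold), and only commit the removal with fq_dequeue_skb() once they decide to send it; a packet that turns out not to be ready is simply left queued rather than removed and re-added. A generic sketch of the peek-then-commit shape (queue and field names are made up):

#include <stdio.h>

struct pkt { struct pkt *next; long deadline; };
struct queue { struct pkt *head; int qlen; };

static struct pkt *peek(struct queue *q)
{
	return q->head;
}

/* commit removal of a packet previously returned by peek() */
static void dequeue_pkt(struct queue *q, struct pkt *p)
{
	q->head = p->next;
	p->next = NULL;
	q->qlen--;
}

int main(void)
{
	struct pkt p = { NULL, 5 };
	struct queue q = { &p, 1 };
	long now = 3;

	struct pkt *head = peek(&q);

	if (head && head->deadline <= now) {	/* inspect before committing */
		dequeue_pkt(&q, head);
		printf("sent, qlen=%d\n", q.qlen);
	} else {
		printf("not ready, leave queued\n");	/* nothing to undo */
	}
	return 0;
}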
@@ -390,8 +406,6 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
struct rb_node **p, *parent;
struct sk_buff *head, *aux;
- fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
-
head = flow->head;
if (!head ||
fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
@@ -419,6 +433,12 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
rb_insert_color(&skb->rbnode, &flow->t_root);
}
+static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
+ const struct fq_sched_data *q)
+{
+ return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
+}
+
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
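fq_packet_beyond_horizon() deliberately compares as signed 64-bit values: skb->tstamp is a ktime_t (s64), so an unset or negative value must sort before "now" instead of turning into a huge unsigned future time. A quick standalone check of the predicate (the values are assumptions, including the 10-second default horizon set later in the patch):

#include <stdint.h>
#include <stdio.h>

/* returns nonzero if tstamp is later than now + horizon; the signed
 * casts mirror the kernel's (s64) comparison, so a negative tstamp
 * sorts before 'now' rather than wrapping to a far-future value
 */
static int beyond_horizon(uint64_t tstamp, uint64_t now, uint64_t horizon)
{
	return (int64_t)tstamp > (int64_t)(now + horizon);
}

int main(void)
{
	uint64_t now = 1000000000ULL;		/* 1 s in ns */
	uint64_t horizon = 10000000000ULL;	/* 10 s, the fq default */

	printf("%d\n", beyond_horizon(now + 5000000000ULL, now, horizon));	/* 0: within */
	printf("%d\n", beyond_horizon(now + 20000000000ULL, now, horizon));	/* 1: beyond */
	printf("%d\n", beyond_horizon((uint64_t)-1, now, horizon));		/* 0: negative as s64 */
	return 0;
}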
@@ -428,6 +448,28 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch, to_free);
+ if (!skb->tstamp) {
+ fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+ } else {
+ /* Check if packet timestamp is too far in the future.
+ * Try first if our cached value, to avoid ktime_get_ns()
+ * cost in most cases.
+ */
+ if (fq_packet_beyond_horizon(skb, q)) {
+ /* Refresh our cache and check another time */
+ q->ktime_cache = ktime_get_ns();
+ if (fq_packet_beyond_horizon(skb, q)) {
+ if (q->horizon_drop) {
+ q->stat_horizon_drops++;
+ return qdisc_drop(skb, sch, to_free);
+ }
+ q->stat_horizon_caps++;
+ skb->tstamp = q->ktime_cache + q->horizon;
+ }
+ }
+ fq_skb_cb(skb)->time_to_send = skb->tstamp;
+ }
+
f = fq_classify(skb, q);
if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
q->stat_flows_plimit++;
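This hunk is the enqueue-side payoff of q->ktime_cache: packets without a tstamp refresh the cache essentially for free, while timestamped packets are first tested against the cached clock and ktime_get_ns() is re-read only when a packet looks beyond the horizon, with a re-check before dropping or capping so a stale cache cannot penalize it. A userspace sketch of this refresh-on-suspicion pattern, with clock_gettime(CLOCK_MONOTONIC) standing in for ktime_get_ns():

#define _POSIX_C_SOURCE 199309L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t ktime_cache;	/* last clock read, in ns */

static uint64_t read_clock_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* decide a packet's fate using the cached clock when possible;
 * returns 1 to drop, 0 to accept (possibly capping the tstamp)
 */
static int check_horizon(uint64_t *tstamp, uint64_t horizon, int drop)
{
	if ((int64_t)*tstamp > (int64_t)(ktime_cache + horizon)) {
		/* suspicious: refresh the cache and test again, so a
		 * stale cache can't cause a spurious drop
		 */
		ktime_cache = read_clock_ns();
		if ((int64_t)*tstamp > (int64_t)(ktime_cache + horizon)) {
			if (drop)
				return 1;
			*tstamp = ktime_cache + horizon;	/* cap instead */
		}
	}
	return 0;
}

int main(void)
{
	uint64_t t;

	ktime_cache = read_clock_ns();
	t = ktime_cache + 20000000000ULL;	/* asks for 20 s out */
	printf("%s\n", check_horizon(&t, 10000000000ULL, 1) ?
	       "dropped" : "accepted");		/* dropped */
	return 0;
}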
@@ -494,11 +536,13 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
if (!sch->q.qlen)
return NULL;
- skb = fq_dequeue_head(sch, &q->internal);
- if (skb)
+ skb = fq_peek(&q->internal);
+ if (unlikely(skb)) {
+ fq_dequeue_skb(sch, &q->internal, skb);
goto out;
+ }
- now = ktime_get_ns();
+ q->ktime_cache = now = ktime_get_ns();
fq_check_throttled(q, now);
begin:
head = &q->new_flows;
@@ -532,14 +576,13 @@ begin:
fq_flow_set_throttled(q, f);
goto begin;
}
+ prefetch(&skb->end);
if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
}
- }
-
- skb = fq_dequeue_head(sch, f);
- if (!skb) {
+ fq_dequeue_skb(sch, f, skb);
+ } else {
head->first = f->next;
/* force a pass through old_flows to prevent starvation */
if ((head == &q->new_flows) && q->old_flows.first) {
@@ -550,7 +593,6 @@ begin:
}
goto begin;
}
- prefetch(&skb->end);
plen = qdisc_pkt_len(skb);
f->credit -= plen;
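The prefetch(&skb->end) moves up so it is issued as soon as the skb is chosen, letting the cache fill overlap the CE-mark test and the credit accounting instead of starting right before qdisc_pkt_len() needs the data. Illustratively, with the GCC/Clang builtin that the kernel's prefetch() typically expands to:

/* toy example: start the fetch early, do independent work meanwhile */
static long process(const long *cold_field, long credit, long plen)
{
	__builtin_prefetch(cold_field, 0, 1);	/* read, low temporal locality */

	credit -= plen;		/* independent work hides the latency */

	return credit + *cold_field;	/* line is (hopefully) resident now */
}

int main(void)
{
	long field = 7;

	return ((int)process(&field, 10, 4) == 13) ? 0 : 1;
}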
@@ -753,6 +795,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
+ [TCA_FQ_HORIZON] = { .type = NLA_U32 },
+ [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -842,7 +886,15 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_FQ_TIMER_SLACK])
q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
+ if (tb[TCA_FQ_HORIZON])
+ q->horizon = (u64)NSEC_PER_USEC *
+ nla_get_u32(tb[TCA_FQ_HORIZON]);
+
+ if (tb[TCA_FQ_HORIZON_DROP])
+ q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
+
if (!err) {
+
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
sch_tree_lock(sch);
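One detail worth noting above: TCA_FQ_HORIZON carries microseconds in a u32, and the (u64)NSEC_PER_USEC cast widens the multiplication before it happens; a 32-bit multiply would wrap for any horizon beyond roughly 4.29 seconds' worth of nanoseconds. A standalone demonstration:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	uint32_t horizon_usec = 10000000;	/* 10 s, the fq default */

	/* widen before multiplying, as the patch does */
	uint64_t ok = (uint64_t)NSEC_PER_USEC * horizon_usec;

	/* a 32-bit multiply wraps (assuming 32-bit unsigned int) */
	uint32_t wrapped = 1000U * horizon_usec;

	printf("widened: %" PRIu64 " ns\n", ok);	/* 10000000000 */
	printf("wrapped: %u ns\n", wrapped);		/* 1410065408 */
	return 0;
}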
@@ -895,6 +947,9 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
+ q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
+ q->horizon_drop = 1; /* by default, drop packets beyond horizon */
+
/* Default ce_threshold of 4294 seconds */
q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
@@ -912,6 +967,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
u64 ce_threshold = q->ce_threshold;
+ u64 horizon = q->horizon;
struct nlattr *opts;
opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
@@ -921,6 +977,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
do_div(ce_threshold, NSEC_PER_USEC);
+ do_div(horizon, NSEC_PER_USEC);
if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
@@ -936,7 +993,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
- nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack))
+ nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
+ nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
+ nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
goto nla_put_failure;
return nla_nest_end(skb, opts);
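do_div() divides its 64-bit first argument in place and returns the remainder, which is why the dump works on local copies: q->ce_threshold and q->horizon keep their nanosecond values while the copies are reduced to microseconds for the u32 netlink attributes. A userspace stand-in for the same in-place idiom (uses a GNU C statement expression, so build with gcc or clang):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* stand-in for the kernel's do_div(): divide n in place, hand back the
 * remainder (the kernel macro exists because some 32-bit arches lack a
 * native 64/32 divide)
 */
#define do_div_sketch(n, base) ({		\
	uint64_t rem = (n) % (base);		\
	(n) /= (base);				\
	rem;					\
})

int main(void)
{
	uint64_t horizon = 10000000000ULL;	/* 10 s in ns */

	do_div_sketch(horizon, 1000);		/* now usec, fits a u32 */
	printf("%" PRIu64 " us\n", horizon);	/* 10000000 */
	return 0;
}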
@@ -967,6 +1026,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.unthrottle_latency_ns = min_t(unsigned long,
q->unthrottle_latency_ns, ~0U);
st.ce_mark = q->stat_ce_mark;
+ st.horizon_drops = q->stat_horizon_drops;
+ st.horizon_caps = q->stat_horizon_caps;
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
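For reference, once both the kernel and iproute2 understand these attributes, the horizon can be tuned and its counters read from userspace. The exact tc keywords are an assumption here (check tc-fq(8) for the version in use), but the shape should be along these lines:

# cap timestamps instead of dropping, with a 2 s horizon
tc qdisc replace dev eth0 root fq horizon 2s horizon_cap

# the horizon drop/cap counters appear in the stats dump
tc -s qdisc show dev eth0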