From 8380c81d5c4fced6f4397795a5ae65758272bbfd Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 12 May 2021 23:43:24 +0200
Subject: net: Treat __napi_schedule_irqoff() as __napi_schedule() on PREEMPT_RT

__napi_schedule_irqoff() is an optimized version of __napi_schedule()
which can be used where it is known that interrupts are disabled,
e.g. in interrupt handlers, spin_lock_irq() sections or hrtimer
callbacks.

On PREEMPT_RT enabled kernels this assumption is not true. Force-threaded
interrupt handlers and spinlocks are not disabling interrupts and the
NAPI hrtimer callback is forced into softirq context, which runs with
interrupts enabled as well.

Chasing all usage sites of __napi_schedule_irqoff() is a whack-a-mole
game so make __napi_schedule_irqoff() invoke __napi_schedule() for
PREEMPT_RT kernels.

The callers of ____napi_schedule() in the networking core have been
audited and are correct on PREEMPT_RT kernels as well.

Reported-by: Juri Lelli
Signed-off-by: Sebastian Andrzej Siewior
Reviewed-by: Thomas Gleixner
Reviewed-by: Juri Lelli
Signed-off-by: David S. Miller
---
 net/core/dev.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 222b1d322c96..febb23708184 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6501,11 +6501,18 @@ EXPORT_SYMBOL(napi_schedule_prep);
  * __napi_schedule_irqoff - schedule for receive
  * @n: entry to schedule
  *
- * Variant of __napi_schedule() assuming hard irqs are masked
+ * Variant of __napi_schedule() assuming hard irqs are masked.
+ *
+ * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
+ * because the interrupt disabled assumption might not be true
+ * due to force-threaded interrupts and spinlock substitution.
  */
 void __napi_schedule_irqoff(struct napi_struct *n)
 {
-	____napi_schedule(this_cpu_ptr(&softnet_data), n);
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		____napi_schedule(this_cpu_ptr(&softnet_data), n);
+	else
+		__napi_schedule(n);
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 
--
cgit v1.2.3
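
For context, not part of the patch above: the typical caller of the _irqoff variant is a driver's hard interrupt handler, which on a non-RT kernel really does run with hard interrupts disabled. Below is a minimal sketch of that calling pattern; "my_dev" and "my_irq_handler" are made-up names, and the NAPI instance is assumed to have been registered with netif_napi_add() elsewhere.

#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct my_dev {
	struct napi_struct napi;
};

static irqreturn_t my_irq_handler(int irq, void *data)
{
	struct my_dev *priv = data;

	/* On !PREEMPT_RT this handler runs with hard interrupts disabled,
	 * which is the assumption __napi_schedule_irqoff() is optimized for.
	 * On PREEMPT_RT the handler is force-threaded and runs with
	 * interrupts enabled, which is what the patch accounts for.
	 */
	if (napi_schedule_prep(&priv->napi))
		__napi_schedule_irqoff(&priv->napi);

	return IRQ_HANDLED;
}

Such callers do not need to change; the redirection to __napi_schedule() happens inside the core.
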
From 2b4cd14fd995e0a863b2ced4cba0bcd804d89ebc Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Thu, 17 Jun 2021 09:38:17 +0200
Subject: net/netif_receive_skb_core: Use migrate_disable()

The preempt disable around do_xdp_generic() was introduced in commit
bbbe211c295ff ("net: rcu lock and preempt disable missing around generic
xdp").

For BPF it is enough to use migrate_disable(), and the code was updated
accordingly, as can be seen in commit 3c58482a382ba ("bpf: Provide
bpf_prog_run_pin_on_cpu() helper").

This is a leftover which was not converted. Use migrate_disable() before
invoking do_xdp_generic().

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index ef8cf7619baf..439faadab0c2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5277,9 +5277,9 @@ another_round:
 	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 		int ret2;
 
-		preempt_disable();
+		migrate_disable();
 		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
-		preempt_enable();
+		migrate_enable();
 
 		if (ret2 != XDP_PASS) {
 			ret = NET_RX_DROP;
--
cgit v1.2.3
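
For context, not part of the patch above: migrate_disable() only pins the current task to its CPU; it does not disable preemption, so sleeping locks (which is what spinlock_t becomes on PREEMPT_RT) are still allowed inside the section. A minimal sketch of the pattern with a hypothetical per-CPU statistics structure; "my_stats" and "my_account_packet" are made-up names and the spinlock is assumed to be initialized at probe time.

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct my_stats {
	spinlock_t lock;	/* becomes a sleeping lock on PREEMPT_RT */
	u64 packets;
};

static DEFINE_PER_CPU(struct my_stats, my_pcpu_stats);

static void my_account_packet(void)
{
	struct my_stats *s;

	/* preempt_disable() would make the spin_lock() below illegal on
	 * PREEMPT_RT; migrate_disable() only pins the task to this CPU,
	 * which is all that is needed to keep using this CPU's instance.
	 */
	migrate_disable();
	s = this_cpu_ptr(&my_pcpu_stats);
	spin_lock(&s->lock);		/* initialized elsewhere, e.g. at probe */
	s->packets++;
	spin_unlock(&s->lock);
	migrate_enable();
}

This is why migrate_disable() is sufficient around do_xdp_generic(): the BPF program only needs CPU stability for its per-CPU data, not a non-preemptible section.
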
From c4fef01ba4793a85b2d38a472bddd1e3b56d9585 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin
Date: Tue, 22 Jun 2021 14:49:56 +0800
Subject: net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

Currently pfifo_fast has both the TCQ_F_CAN_BYPASS and TCQ_F_NOLOCK
flags set, but qdisc bypass does not work for a lockless qdisc because
the skb is always enqueued to the qdisc even when the qdisc is empty,
see __dev_xmit_skb().

This patch calls sch_direct_xmit() to transmit the skb directly to the
driver for an empty lockless qdisc, which avoids the enqueue and
dequeue operations.

qdisc->empty is not reliable for indicating an empty qdisc because
there is a time window between enqueuing and setting qdisc->empty. So
we use the MISSED state added in commit a90c57f2cedd ("net: sched: fix
packet stuck problem for lockless qdisc"), which indicates lock
contention, suggesting that it is better not to do the qdisc bypass in
order to avoid a packet out-of-order problem.

In order to make the MISSED state a reliable indication of an empty
qdisc, we need to ensure that testing and clearing of the MISSED state
happen within the protection of qdisc->seqlock; only setting the MISSED
state can be done without that protection. A MISSED state test is added
without the protection of qdisc->seqlock to avoid an unnecessary
spin_trylock() in the contention case.

As the enqueuing is not within the protection of qdisc->seqlock, there
is still a potential data race as mentioned by Jakub [1]:

      thread1               thread2               thread3
qdisc_run_begin() # true
                      qdisc_run_begin(q)
                       set(MISSED)
pfifo_fast_dequeue
  clear(MISSED)
  # recheck the queue
qdisc_run_end()
                      enqueue skb1
                                            qdisc empty # true
                                            qdisc_run_begin() # true
                                            sch_direct_xmit() # skb2
                       qdisc_run_begin()
                        set(MISSED)

When the above happens, skb1 enqueued by thread2 is transmitted after
skb2 is transmitted by thread3, because the MISSED state setting and
the enqueuing are not under qdisc->seqlock. If qdisc bypass is
disabled, skb1 has a better chance to be transmitted quicker than skb2.

This patch does not take care of the above data race, because we view
it as similar to the following: even when CPU1 and CPU2 write skbs to
two sockets heading to the same qdisc at the same time, there is no
guarantee which skb will hit the qdisc first, because there are a lot
of factors like interrupts/softirqs/cache misses/scheduling affecting
that.

The following cases need special handling:

1. The MISSED state is cleared before another round of dequeuing in
   pfifo_fast_dequeue(), and __qdisc_run() might not be able to dequeue
   all skbs in one round and call __netif_schedule(), which might
   result in a non-empty qdisc without MISSED set. In order to avoid
   this, the MISSED state is set for the lockless qdisc and
   __netif_schedule() will be called at the end of qdisc_run_end().

2. The MISSED state also needs to be set for the lockless qdisc instead
   of calling __netif_schedule() directly when requeuing an skb, for a
   similar reason.

3. For the netdev-queue-stopped case, the MISSED state needs to be
   cleared while the netdev queue is stopped, otherwise there may be
   unnecessary __netif_schedule() calls. So a new DRAINING state is
   added to indicate this case, which also indicates a non-empty qdisc.

4. There is already a netif_xmit_frozen_or_stopped() check in
   dequeue_skb() and sch_direct_xmit(), both within the protection of
   qdisc->seqlock, but the same check in __dev_xmit_skb() is without
   that protection, which might make the empty indication of a lockless
   qdisc unreliable. So remove the check in __dev_xmit_skb(); the
   checks under the protection of qdisc->seqlock seem enough to avoid
   the CPU consumption problem for the netdev-queue-stopped case.

[1] https://lkml.org/lkml/2021/5/29/215

Acked-by: Jakub Kicinski
Tested-by: Vladimir Oltean # flexcan
Signed-off-by: Yunsheng Lin
Signed-off-by: David S. Miller
---
 include/net/sch_generic.h | 16 +++++++++++++---
 net/core/dev.c            | 27 +++++++++++++++++++++++++--
 net/sched/sch_generic.c   | 20 ++++++++++++++++----
 3 files changed, 54 insertions(+), 9 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3ed6bcc4be72..177f240d59d4 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -37,8 +37,15 @@ enum qdisc_state_t {
 	__QDISC_STATE_SCHED,
 	__QDISC_STATE_DEACTIVATED,
 	__QDISC_STATE_MISSED,
+	__QDISC_STATE_DRAINING,
 };
 
+#define QDISC_STATE_MISSED	BIT(__QDISC_STATE_MISSED)
+#define QDISC_STATE_DRAINING	BIT(__QDISC_STATE_DRAINING)
+
+#define QDISC_STATE_NON_EMPTY	(QDISC_STATE_MISSED | \
+				 QDISC_STATE_DRAINING)
+
 struct qdisc_size_table {
 	struct rcu_head rcu;
 	struct list_head list;
@@ -145,6 +152,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
+static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
+{
+	return !(READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY);
+}
+
 static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
 {
 	return q->flags & TCQ_F_CPUSTATS;
@@ -206,10 +218,8 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
 		spin_unlock(&qdisc->seqlock);
 
 		if (unlikely(test_bit(__QDISC_STATE_MISSED,
-				      &qdisc->state))) {
-			clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+				      &qdisc->state)))
 			__netif_schedule(qdisc);
-		}
 	} else {
 		write_seqcount_end(&qdisc->running);
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index 50531a2d0b20..991d09b67bd9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3852,10 +3852,33 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	qdisc_calculate_pkt_len(skb, q);
 
 	if (q->flags & TCQ_F_NOLOCK) {
+		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
+		    qdisc_run_begin(q)) {
+			/* Retest nolock_qdisc_is_empty() within the protection
+			 * of q->seqlock to protect from racing with requeuing.
+			 */
+			if (unlikely(!nolock_qdisc_is_empty(q))) {
+				rc = q->enqueue(skb, q, &to_free) &
+					NET_XMIT_MASK;
+				__qdisc_run(q);
+				qdisc_run_end(q);
+
+				goto no_lock_out;
+			}
+
+			qdisc_bstats_cpu_update(q, skb);
+			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
+			    !nolock_qdisc_is_empty(q))
+				__qdisc_run(q);
+
+			qdisc_run_end(q);
+			return NET_XMIT_SUCCESS;
+		}
+
 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
-		if (likely(!netif_xmit_frozen_or_stopped(txq)))
-			qdisc_run(q);
+		qdisc_run(q);
 
+no_lock_out:
 		if (unlikely(to_free))
 			kfree_skb_list(to_free);
 		return rc;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index e9c0afc8becc..9984ccc45946 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -52,6 +52,8 @@ static void qdisc_maybe_clear_missed(struct Qdisc *q,
 	 */
 	if (!netif_xmit_frozen_or_stopped(txq))
 		set_bit(__QDISC_STATE_MISSED, &q->state);
+	else
+		set_bit(__QDISC_STATE_DRAINING, &q->state);
 }
 
 /* Main transmission queue. */
@@ -164,9 +166,13 @@ static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 
 		skb = next;
 	}
-	if (lock)
+
+	if (lock) {
 		spin_unlock(lock);
-	__netif_schedule(q);
+		set_bit(__QDISC_STATE_MISSED, &q->state);
+	} else {
+		__netif_schedule(q);
+	}
 }
 
 static void try_bulk_dequeue_skb(struct Qdisc *q,
@@ -409,7 +415,11 @@ void __qdisc_run(struct Qdisc *q)
 	while (qdisc_restart(q, &packets)) {
 		quota -= packets;
 		if (quota <= 0) {
-			__netif_schedule(q);
+			if (q->flags & TCQ_F_NOLOCK)
+				set_bit(__QDISC_STATE_MISSED, &q->state);
+			else
+				__netif_schedule(q);
+
 			break;
 		}
 	}
@@ -698,13 +708,14 @@ retry:
 	if (likely(skb)) {
 		qdisc_update_stats_at_dequeue(qdisc, skb);
 	} else if (need_retry &&
-		   test_bit(__QDISC_STATE_MISSED, &qdisc->state)) {
+		   READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) {
 		/* Delay clearing the STATE_MISSED here to reduce
 		 * the overhead of the second spin_trylock() in
 		 * qdisc_run_begin() and __netif_schedule() calling
 		 * in qdisc_run_end().
 		 */
 		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
 
 		/* Make sure dequeuing happens after clearing
 		 * STATE_MISSED.
@@ -1222,6 +1233,7 @@ static void dev_reset_queue(struct net_device *dev,
 	spin_unlock_bh(qdisc_lock(qdisc));
 	if (nolock) {
 		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
 		spin_unlock_bh(&qdisc->seqlock);
 	}
 }
--
cgit v1.2.3
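
For illustration, not part of the patch: a condensed rendering of the bypass fast path that the __dev_xmit_skb() hunk above adds. The helper name and its -EBUSY fallback convention are invented here, and statistics updates are simplified, so treat this as a sketch of the control flow rather than a drop-in replacement.

#include <linux/netdevice.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>

static int my_xmit_bypass(struct sk_buff *skb, struct Qdisc *q,
			  struct net_device *dev, struct netdev_queue *txq)
{
	struct sk_buff *to_free = NULL;
	int rc;

	/* Cheap test first: MISSED or DRAINING set means "not empty", so do
	 * not even try to take q->seqlock via qdisc_run_begin().
	 */
	if (!(q->flags & TCQ_F_CAN_BYPASS) || !nolock_qdisc_is_empty(q) ||
	    !qdisc_run_begin(q))
		return -EBUSY;	/* caller falls back to the normal enqueue path */

	/* Retest under q->seqlock: a concurrent requeue may have set MISSED
	 * in the meantime, so the qdisc is not really empty.
	 */
	if (unlikely(!nolock_qdisc_is_empty(q))) {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		__qdisc_run(q);
		qdisc_run_end(q);
		kfree_skb_list(to_free);
		return rc;
	}

	/* Truly empty: hand the skb straight to the driver.  If more can be
	 * sent and someone queued behind us in the meantime, run the qdisc
	 * so nothing is left stranded.
	 */
	if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
	    !nolock_qdisc_is_empty(q))
		__qdisc_run(q);

	qdisc_run_end(q);
	return NET_XMIT_SUCCESS;
}

The cheap nolock_qdisc_is_empty() test keeps the contended case away from q->seqlock entirely; the second test under qdisc_run_begin() is what makes the MISSED/DRAINING based empty indication reliable.
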
From 127d7355abb355b05ff4b42d6e18cc97aa9d1d11 Mon Sep 17 00:00:00 2001
From: Tanner Love
Date: Mon, 28 Jun 2021 09:50:07 -0400
Subject: net: update netdev_rx_csum_fault() print dump only once

Printing this stack dump multiple times does not provide additional
useful information, and consumes time in the data path. Printing once
is sufficient.

Changes
v2: Format indentation properly

Signed-off-by: Tanner Love
Acked-by: Eric Dumazet
Acked-by: Mahesh Bandewar
Signed-off-by: David S. Miller
---
 net/core/dev.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 991d09b67bd9..d609366da95c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -148,6 +148,7 @@
 #include <net/devlink.h>
 #include <linux/pm_runtime.h>
 #include <linux/prandom.h>
+#include <linux/once_lite.h>
 
 #include "net-sysfs.h"
 
@@ -3487,13 +3488,16 @@ EXPORT_SYMBOL(__skb_gso_segment);
 
 /* Take action when hardware reception checksum errors are detected. */
 #ifdef CONFIG_BUG
dev->name : ""); + skb_dump(KERN_ERR, skb, true); + dump_stack(); +} + void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - if (net_ratelimit()) { - pr_err("%s: hw csum failure\n", dev ? dev->name : ""); - skb_dump(KERN_ERR, skb, true); - dump_stack(); - } + DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif -- cgit v1.2.3