From cfe43f478b79ba45573ca22d52d0d8823be068fa Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 12 Nov 2021 18:52:01 +0000 Subject: preempt/dynamic: Introduce preemption model accessors CONFIG_PREEMPT{_NONE, _VOLUNTARY} designate either: o The build-time preemption model when !PREEMPT_DYNAMIC o The default boot-time preemption model when PREEMPT_DYNAMIC IOW, using those on PREEMPT_DYNAMIC kernels is meaningless - the actual model could have been set to something else by the "preempt=foo" cmdline parameter. Same problem applies to CONFIG_PREEMPTION. Introduce a set of helpers to determine the actual preemption model used by the live kernel. Suggested-by: Marco Elver Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20211112185203.280040-3-valentin.schneider@arm.com --- kernel/sched/core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d575b4914925..068c088e9584 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8409,6 +8409,18 @@ static void __init preempt_dynamic_init(void) } } +#define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ + return preempt_dynamic_mode == preempt_dynamic_##mode; \ + } \ + EXPORT_SYMBOL_GPL(preempt_model_##mode) + +PREEMPT_MODEL_ACCESSOR(none); +PREEMPT_MODEL_ACCESSOR(voluntary); +PREEMPT_MODEL_ACCESSOR(full); + #else /* !CONFIG_PREEMPT_DYNAMIC */ static inline void preempt_dynamic_init(void) { } -- cgit v1.2.3 From 8ed00760203d8018bee042fbfe8e076579be2c2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jan 2022 09:52:44 -0800 Subject: srcu: Tighten cleanup_srcu_struct() GP checks Currently, cleanup_srcu_struct() checks for a grace period in progress, but it does not check for a grace period that has not yet started but which might start at any time. Such a situation could result in a use-after-free bug, so this commit adds a check for a grace period that is needed but not yet started to cleanup_srcu_struct(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6833d8887181..d30e4db04506 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -382,9 +382,11 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Forgot srcu_barrier(), so just leak it! */ } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), + rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(ssp->sda); -- cgit v1.2.3 From 7b9e9b5856e188c1b3ff51185f3600ee79b4ab41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Jan 2022 13:39:33 -0800 Subject: srcu: Make srcu_funnel_gp_start() cache ->mynode in snp_leaf Currently, the srcu_funnel_gp_start() walks its local variable snp up the tree and reloads sdp->mynode whenever it is necessary to check whether it is still at the leaf srcu_node level. This works, but is a bit more obtuse than absolutely necessary. In addition, upcoming commits will dynamically size srcu_struct structures, in which case sdp->mynode will no longer necessarily be a constant, and this commit helps prepare for that dynamic sizing. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d30e4db04506..7d13e35e5d27 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -632,20 +632,21 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); - struct srcu_node *snp = sdp->mynode; + struct srcu_node *snp; + struct srcu_node *snp_leaf = sdp->mynode; unsigned long snp_seq; /* Each pass through the loop does one level of the srcu_node tree. */ - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) + for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; - if (snp == sdp->mynode && snp_seq == s) + if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore_rcu_node(snp, flags); - if (snp == sdp->mynode && snp_seq != s) { + if (snp == snp_leaf && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); @@ -656,7 +657,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, return; } snp->srcu_have_cbs[idx] = s; - if (snp == sdp->mynode) + if (snp == snp_leaf) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); -- cgit v1.2.3 From 994f706872e6ce080506bd795ecf783d5b617de6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jan 2022 09:46:57 -0800 Subject: srcu: Make Tree SRCU able to operate without snp_node array This commit makes Tree SRCU able to operate without an snp_node array, that is, when the srcu_data structures' ->mynode pointers are NULL. This can result in high contention on the srcu_struct structure's ->lock, but only when there are lots of call_srcu(), synchronize_srcu(), and synchronize_srcu_expedited() calls. Note that when there is no snp_node array, all SRCU callbacks use CPU 0's callback queue. This is optimal in the common case of low update-side load because it removes the need to search each CPU for the single callback that made the grace period happen. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 14 +++- kernel/rcu/srcutree.c | 203 ++++++++++++++++++++++++++--------------------- 2 files changed, 124 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 4025840ba9a3..8d1da136a93a 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -63,8 +63,9 @@ struct srcu_struct { struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ + int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - spinlock_t __private lock; /* Protect counters */ + spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ @@ -83,6 +84,17 @@ struct srcu_struct { struct lockdep_map dep_map; }; +/* Values for size state variable (->srcu_size_state). */ +#define SRCU_SIZE_SMALL 0 +#define SRCU_SIZE_ALLOC 1 +#define SRCU_SIZE_WAIT_BARRIER 2 +#define SRCU_SIZE_WAIT_CALL 3 +#define SRCU_SIZE_WAIT_CBS1 4 +#define SRCU_SIZE_WAIT_CBS2 5 +#define SRCU_SIZE_WAIT_CBS3 6 +#define SRCU_SIZE_WAIT_CBS4 7 +#define SRCU_SIZE_BIG 8 + /* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 7d13e35e5d27..e23696edd43b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -152,16 +152,17 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); } /* * Initialize non-compile-time initialized fields, including the - * associated srcu_node and srcu_data structures. The is_static - * parameter is passed through to init_srcu_struct_nodes(), and - * also tells us that ->sda has already been wired up to srcu_data. + * associated srcu_node and srcu_data structures. The is_static parameter + * tells us that ->sda has already been wired up to srcu_data. */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { + ssp->srcu_size_state = SRCU_SIZE_SMALL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -175,6 +176,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda) return -ENOMEM; init_srcu_struct_nodes(ssp); + ssp->srcu_size_state = SRCU_SIZE_BIG; ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ @@ -391,6 +393,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } free_percpu(ssp->sda); ssp->sda = NULL; + ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -439,6 +442,10 @@ static void srcu_gp_start(struct srcu_struct *ssp) struct srcu_data *sdp = this_cpu_ptr(ssp->sda); int state; + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ @@ -539,38 +546,40 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); - cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; - if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; - snp->srcu_have_cbs[idx] = gpseq; - rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); - mask = snp->srcu_data_have_cbs[idx]; - snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); - if (cbs) - srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); - - /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check) && last_lvl) - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed + 100)) - sdp->srcu_gp_seq_needed = gpseq; - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed_exp + 100)) - sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); - } + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) { + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); + } else { + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + srcu_for_each_node_breadth_first(ssp, snp) { + spin_lock_irq_rcu_node(snp); + cbs = false; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + if (last_lvl) + cbs = snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq_rcu_node(snp); + if (cbs) + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); + } } + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irqrestore_rcu_node(sdp, flags); + } + /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&ssp->srcu_cb_mutex); @@ -599,18 +608,19 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp { unsigned long flags; - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) || - ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) - return; - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + if (snp) + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || + ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + return; + spin_lock_irqsave_rcu_node(snp, flags); + if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + spin_unlock_irqrestore_rcu_node(snp, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); - return; } - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } spin_lock_irqsave_rcu_node(ssp, flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); @@ -633,36 +643,37 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); struct srcu_node *snp; - struct srcu_node *snp_leaf = sdp->mynode; + struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode); unsigned long snp_seq; - /* Each pass through the loop does one level of the srcu_node tree. */ - for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) - return; /* GP already done and CBs recorded. */ - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { - snp_seq = snp->srcu_have_cbs[idx]; - if (snp == snp_leaf && snp_seq == s) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); - if (snp == snp_leaf && snp_seq != s) { - srcu_schedule_cbs_sdp(sdp, do_norm - ? SRCU_INTERVAL - : 0); + if (snp_leaf) + /* Each pass through the loop does one level of the srcu_node tree. */ + for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave_rcu_node(snp, flags); + if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + snp_seq = snp->srcu_have_cbs[idx]; + if (snp == snp_leaf && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + spin_unlock_irqrestore_rcu_node(snp, flags); + if (snp == snp_leaf && snp_seq != s) { + srcu_schedule_cbs_sdp(sdp, do_norm + ? SRCU_INTERVAL + : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(ssp, snp, s); return; } - if (!do_norm) - srcu_funnel_exp_start(ssp, snp, s); - return; + snp->srcu_have_cbs[idx] = s; + if (snp == snp_leaf) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(snp, flags); } - snp->srcu_have_cbs[idx] = s; - if (snp == snp_leaf) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } /* Top of tree, must ensure the grace period will be started. */ spin_lock_irqsave_rcu_node(ssp, flags); @@ -820,7 +831,10 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_CALL) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); @@ -840,7 +854,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (needgp) srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_funnel_exp_start(ssp, smp_load_acquire(&sdp->mynode), s); srcu_read_unlock(ssp, idx); return s; } @@ -1100,6 +1114,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp) complete(&ssp->srcu_barrier_completion); } +/* + * Enqueue an srcu_barrier() callback on the specified srcu_data + * structure's ->cblist. but only if that ->cblist already has at least one + * callback enqueued. Note that if a CPU already has callbacks enqueue, + * it must have already registered the need for a future grace period, + * so all we need do is enqueue a callback that will use the same grace + * period as the last callback already in the queue. + */ +static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) +{ + spin_lock_irq_rcu_node(sdp); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); + } + spin_unlock_irq_rcu_node(sdp); +} + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @ssp: srcu_struct on which to wait for in-flight callbacks. @@ -1107,7 +1143,6 @@ static void srcu_barrier_cb(struct rcu_head *rhp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; - struct srcu_data *sdp; unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); check_init_srcu_struct(ssp); @@ -1123,27 +1158,11 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); - /* - * Each pass through this loop enqueues a callback, but only - * on CPUs already having callbacks enqueued. Note that if - * a CPU already has callbacks enqueue, it must have already - * registered the need for a future grace period, so all we - * need do is enqueue a callback that will use the same - * grace period as the last callback already in the queue. - */ - for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); - sdp->srcu_barrier_head.func = srcu_barrier_cb; - debug_rcu_head_queue(&sdp->srcu_barrier_head); - if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head)) { - debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); - } - spin_unlock_irq_rcu_node(sdp); - } + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); + else + for_each_possible_cpu(cpu) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) -- cgit v1.2.3 From 2ec303113d978931ef368886c4c6bc854493e8bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jan 2022 16:13:52 -0800 Subject: srcu: Dynamically allocate srcu_node array This commit shrinks the srcu_struct structure by converting its ->node field from a fixed-size compile-time array to a pointer to a dynamically allocated array. In kernels built with large values of NR_CPUS that boot on systems with smaller numbers of CPUs, this can save significant memory. [ paulmck: Apply kernel test robot feedback. ] Reported-by: A cast of thousands Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 +- kernel/rcu/srcutree.c | 65 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8d1da136a93a..8501b6b45941 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -60,7 +60,7 @@ struct srcu_node { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *node; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e23696edd43b..e98cc218e42b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "rcu.h" @@ -75,12 +76,42 @@ do { \ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* - * Initialize SRCU combining tree. Note that statically allocated + * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *ssp) +static void init_srcu_struct_data(struct srcu_struct *ssp) +{ + int cpu; + struct srcu_data *sdp; + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->mynode = NULL; + sdp->cpu = cpu; + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); + sdp->ssp = ssp; + } +} + +/* + * Allocated and initialize SRCU combining tree. Returns @true if + * allocation succeeded and @false otherwise. + */ +static bool init_srcu_struct_nodes(struct srcu_struct *ssp) { int cpu; int i; @@ -92,6 +123,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); + ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), GFP_ATOMIC); + if (!ssp->node) + return false; /* Work out the overall tree geometry. */ ssp->level[0] = &ssp->node[0]; @@ -129,30 +163,20 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); - rcu_segcblist_init(&sdp->srcu_cblist); - sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) snp->grplo = cpu; snp->grphi = cpu; } - sdp->cpu = cpu; - INIT_WORK(&sdp->work, srcu_invoke_callbacks); - timer_setup(&sdp->delay_work, srcu_delay_timer, 0); - sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + return true; } /* @@ -163,6 +187,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { ssp->srcu_size_state = SRCU_SIZE_SMALL; + ssp->node = NULL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -175,8 +200,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) return -ENOMEM; - init_srcu_struct_nodes(ssp); - ssp->srcu_size_state = SRCU_SIZE_BIG; + init_srcu_struct_data(ssp); + if (!init_srcu_struct_nodes(ssp)) { + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + return -ENOMEM; + } + } else { + ssp->srcu_size_state = SRCU_SIZE_BIG; + } ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ @@ -393,6 +426,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } free_percpu(ssp->sda); ssp->sda = NULL; + kfree(ssp->node); + ssp->node = NULL; ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -- cgit v1.2.3 From e2f638365dd6283b7df1cb5e82f5b2746359f062 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jan 2022 15:41:32 -0800 Subject: srcu: Add size-state transitioning code This is just dead code at the moment, and will be used once the state-transition code is activated. Because srcu_barrier() must be aware of transition before call_srcu(), the state machine waits for an SRCU grace period before callbacks are queued to the non-CPU-0 queues. This requres that portions of srcu_barrier() be enclosed in an SRCU read-side critical section. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e98cc218e42b..00f9aed6f7b9 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -562,6 +562,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) unsigned long mask; struct srcu_data *sdp; struct srcu_node *snp; + int ss_state; /* Prevent more than one additional grace period. */ mutex_lock(&ssp->srcu_cb_mutex); @@ -629,6 +630,15 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { spin_unlock_irq_rcu_node(ssp); } + + /* Transition to big if needed. */ + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { + if (ss_state == SRCU_SIZE_ALLOC) + init_srcu_struct_nodes(ssp); + else + smp_store_release(&ssp->srcu_size_state, ss_state + 1); + } } /* @@ -1178,6 +1188,7 @@ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; + int idx; unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); check_init_srcu_struct(ssp); @@ -1193,11 +1204,13 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); + idx = srcu_read_lock(ssp); if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); else for_each_possible_cpu(cpu) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); + srcu_read_unlock(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) -- cgit v1.2.3 From 3bedebcf63c2ad7396f1be138bbef91a402f33cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jan 2022 17:05:51 -0800 Subject: srcu: Make rcutorture dump the SRCU size state This commit adds the numeric and string version of ->srcu_size_state to the Tree-SRCU-specific portion of the rcutorture output. [ paulmck: Apply feedback from kernel test robot and Dan Carpenter. ] [ quic_neeraju: Apply feedback from Jiapeng Chong. ] Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 00f9aed6f7b9..0874e3be6a5a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1407,15 +1407,33 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +static const char * const srcu_size_state_name[] = { + "SRCU_SIZE_SMALL", + "SRCU_SIZE_ALLOC", + "SRCU_SIZE_WAIT_BARRIER", + "SRCU_SIZE_WAIT_CALL", + "SRCU_SIZE_WAIT_CBS1", + "SRCU_SIZE_WAIT_CBS2", + "SRCU_SIZE_WAIT_CBS3", + "SRCU_SIZE_WAIT_CBS4", + "SRCU_SIZE_BIG", + "SRCU_SIZE_???", +}; + void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; + int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); + if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) + ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; + pr_alert("%s%s Tree SRCU g%ld state %d (%s) per-CPU(idx=%d):", + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + srcu_size_state_name[ss_state_idx], idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; -- cgit v1.2.3 From aeb9b39b8f4aac1233302c53a1fd99a73fd2c262 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Jan 2022 16:01:26 -0800 Subject: srcu: Compute snp_seq earlier in srcu_funnel_gp_start() Currently, srcu_funnel_gp_start() tests snp->srcu_have_cbs[idx] and then separately assigns it to the snp_seq local variable. This commit does the assignment earlier to simplify the code a bit. While in the area, this commit also takes advantage of the 100-character line limit to put the call to srcu_schedule_cbs_sdp() on a single line. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0874e3be6a5a..fec608f69962 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -697,15 +697,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { - snp_seq = snp->srcu_have_cbs[idx]; + snp_seq = snp->srcu_have_cbs[idx]; + if (ULONG_CMP_GE(snp_seq, s)) { if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == snp_leaf && snp_seq != s) { - srcu_schedule_cbs_sdp(sdp, do_norm - ? SRCU_INTERVAL - : 0); + srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); return; } if (!do_norm) -- cgit v1.2.3 From cbdc98e93efa7bbf6f2fcd68c73df82c37b5fa65 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Jan 2022 17:03:06 -0800 Subject: srcu: Use invalid initial value for srcu_node GP sequence numbers Currently, tree SRCU relies on the srcu_node structures being initialized at the same time that the srcu_struct itself is initialized, and thus use the initial grace-period sequence number as the initial value for the srcu_node structure's ->srcu_have_cbs[] and ->srcu_gp_seq_needed_exp fields. Although this has a high probability of also working when the srcu_node array is allocated and initialized at some random later time, it would be better to avoid leaving such things to chance. This commit therefore initializes these fields with 0x2, which is a recognizable invalid value. It then adds the required checks for this invalid value in order to avoid confusion on long-running kernels (especially those on 32-bit systems) that allocate and initialize srcu_node arrays late in life. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fec608f69962..155c430c6a73 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -107,6 +107,18 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) } } +/* Invalid seq state, used during snp node initialization */ +#define SRCU_SNP_INIT_SEQ 0x2 + +/* + * Check whether sequence number corresponding to snp node, + * is invalid. + */ +static inline bool srcu_invl_snp_seq(unsigned long s) +{ + return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; +} + /* * Allocated and initialize SRCU combining tree. Returns @true if * allocation succeeded and @false otherwise. @@ -139,10 +151,10 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp) WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { - snp->srcu_have_cbs[i] = 0; + snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; snp->srcu_data_have_cbs[i] = 0; } - snp->srcu_gp_seq_needed_exp = 0; + snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; if (snp == &ssp->node[0]) { @@ -386,8 +398,7 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), - READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) return 0; return SRCU_INTERVAL; } @@ -561,6 +572,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) int idx; unsigned long mask; struct srcu_data *sdp; + unsigned long sgsne; struct srcu_node *snp; int ss_state; @@ -594,7 +606,8 @@ static void srcu_gp_end(struct srcu_struct *ssp) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + sgsne = snp->srcu_gp_seq_needed_exp; + if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; @@ -652,14 +665,17 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp unsigned long s) { unsigned long flags; + unsigned long sgsne; if (snp) for (; snp != NULL; snp = snp->srcu_parent) { + sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); if (rcu_seq_done(&ssp->srcu_gp_seq, s) || - ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + sgsne = snp->srcu_gp_seq_needed_exp; + if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { spin_unlock_irqrestore_rcu_node(snp, flags); return; } @@ -687,6 +703,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); + unsigned long sgsne; struct srcu_node *snp; struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode); unsigned long snp_seq; @@ -698,7 +715,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; - if (ULONG_CMP_GE(snp_seq, s)) { + if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore_rcu_node(snp, flags); @@ -713,7 +730,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, snp->srcu_have_cbs[idx] = s; if (snp == snp_leaf) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) + sgsne = snp->srcu_gp_seq_needed_exp; + if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } -- cgit v1.2.3 From 0b56f953908a751716f2c8f907942674b60d8db5 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 22 Feb 2022 11:39:01 +0530 Subject: srcu: Ensure snp nodes tree is fully initialized before traversal For configurations where snp node tree is not initialized at init time (added in subsequent commits), srcu_funnel_gp_start() and srcu_funnel_exp_start() can potential traverse and observe the snp nodes' transient (uninitialized) states. This can potentially happen, when init_srcu_struct_nodes() initialization of sdp->mynode races with srcu_funnel_gp_start() and srcu_funnel_exp_start() Consider the case below where srcu_funnel_gp_start() observes sdp->mynode to be not NULL and uses an uninitialized sdp->grpmask P1 P2 init_srcu_struct_nodes() void srcu_funnel_gp_start(...) { for_each_possible_cpu(cpu) { ... sdp->mynode = &snp_first[...]; for (snp = sdp->mynode;...) struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode) ... if (snp_leaf) { for (snp = snp_leaf; ...) ... if (snp == snp_leaf) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } } Similarly, init_srcu_struct_nodes() and srcu_funnel_exp_start() can race, where srcu_funnel_exp_start() could observe state of snp lock before spin_lock_init(). P1 P2 init_srcu_struct_nodes() void srcu_funnel_exp_start(...) { srcu_for_each_node_breadth_first(ssp, snp) { for (; ...) { spin_lock_...(snp, ) spin_lock_init(&ACCESS_PRIVATE(snp, lock)); ... } for_each_possible_cpu(cpu) { ... sdp->mynode = &snp_first[...]; To avoid these issues, ensure that snp node tree initialization is complete i.e. after SRCU_SIZE_WAIT_BARRIER srcu_size_state is reached, before traversing the tree. Given that srcu_funnel_gp_start() and srcu_funnel_exp_start() are called within SRCU read side critical sections, this check is safe, in the sense that all callbacks are enqueued on CPU0 srcu_cblist until SRCU_SIZE_WAIT_CALL is entered, and these read side critical sections (containing srcu_funnel_gp_start() and srcu_funnel_exp_start()) need to complete, before SRCU_SIZE_WAIT_CALL is reached. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 155c430c6a73..2e7ed67646db 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -705,9 +705,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); unsigned long sgsne; struct srcu_node *snp; - struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode); + struct srcu_node *snp_leaf; unsigned long snp_seq; + /* Ensure that snp node tree is fully initialized before traversing it */ + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + snp_leaf = NULL; + else + snp_leaf = sdp->mynode; + if (snp_leaf) /* Each pass through the loop does one level of the srcu_node tree. */ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { @@ -889,10 +895,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, bool needgp = false; unsigned long s; struct srcu_data *sdp; + struct srcu_node *sdp_mynode; + int ss_state; check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_CALL) + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_CALL) sdp = per_cpu_ptr(ssp->sda, 0); else sdp = raw_cpu_ptr(ssp->sda); @@ -912,10 +921,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, needexp = true; } spin_unlock_irqrestore_rcu_node(sdp, flags); + + /* Ensure that snp node tree is fully initialized before traversing it */ + if (ss_state < SRCU_SIZE_WAIT_BARRIER) + sdp_mynode = NULL; + else + sdp_mynode = sdp->mynode; + if (needgp) srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(ssp, smp_load_acquire(&sdp->mynode), s); + srcu_funnel_exp_start(ssp, sdp_mynode, s); srcu_read_unlock(ssp, idx); return s; } -- cgit v1.2.3 From c69a00a12e26cf4faffdcdb340cb2d059b61d57e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Jan 2022 15:41:10 -0800 Subject: srcu: Add boot-time control over srcu_node array allocation This commit adds an srcu_tree.convert_to_big kernel parameter that either refuses to convert at all (0), converts immediately at init_srcu_struct() time (1), or lets rcutorture convert it (2). An addition contention-based dynamic conversion choice will be added, along with documentation. [ paulmck: Apply callback-scanning feedback from Neeraj Upadhyay. ] Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 13 +++++++ kernel/rcu/srcutree.c | 48 ++++++++++++++++--------- 2 files changed, 45 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..1f1fcac7777d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5608,6 +5608,19 @@ off: Disable mitigation and remove performance impact to RDRAND and RDSEED + srcutree.convert_to_big [KNL] + Specifies under what conditions an SRCU tree + srcu_struct structure will be converted to big + form, that is, with an rcu_node tree: + + 0: Never. + 1: At init_srcu_struct() time. + 2: When rcutorture decides to. + + Either way, the srcu_node tree will be sized based + on the actual runtime number of CPUs (nr_cpu_ids) + instead of the compile-time CONFIG_NR_CPUS. + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2e7ed67646db..9dd048989027 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -39,6 +39,15 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* + * Control conversion to SRCU_SIZE_BIG: + * 0: Don't convert at all (default). + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + */ +static int convert_to_big; +module_param(convert_to_big, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -123,7 +132,7 @@ static inline bool srcu_invl_snp_seq(unsigned long s) * Allocated and initialize SRCU combining tree. Returns @true if * allocation succeeded and @false otherwise. */ -static bool init_srcu_struct_nodes(struct srcu_struct *ssp) +static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) { int cpu; int i; @@ -135,7 +144,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); - ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), GFP_ATOMIC); + ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); if (!ssp->node) return false; @@ -213,17 +222,19 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda) return -ENOMEM; init_srcu_struct_data(ssp); - if (!init_srcu_struct_nodes(ssp)) { - if (!is_static) { - free_percpu(ssp->sda); - ssp->sda = NULL; - return -ENOMEM; - } - } else { - ssp->srcu_size_state = SRCU_SIZE_BIG; - } ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) { + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + return -ENOMEM; + } + } else { + WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + } + } smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -594,7 +605,8 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) { + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_BARRIER) { srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); } else { idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); @@ -603,13 +615,16 @@ static void srcu_gp_end(struct srcu_struct *ssp) cbs = false; last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; + cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); sgsne = snp->srcu_gp_seq_needed_exp; if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); - mask = snp->srcu_data_have_cbs[idx]; + if (ss_state < SRCU_SIZE_BIG) + mask = ~0; + else + mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); if (cbs) @@ -645,10 +660,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) } /* Transition to big if needed. */ - ss_state = smp_load_acquire(&ssp->srcu_size_state); if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { if (ss_state == SRCU_SIZE_ALLOC) - init_srcu_struct_nodes(ssp); + init_srcu_struct_nodes(ssp, GFP_KERNEL); else smp_store_release(&ssp->srcu_size_state, ss_state + 1); } @@ -1494,6 +1508,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) s1 += c1; } pr_cont(" T(%ld,%ld)\n", s0, s1); + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2) + WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); -- cgit v1.2.3 From 4a230f8046454df18139ed1232f1a1e8a6dd36c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 11:43:11 -0800 Subject: srcu: Avoid NULL dereference in srcu_torture_stats_print() You really shouldn't invoke srcu_torture_stats_print() after invoking cleanup_srcu_struct(), but there is really no reason to get a compiler-obfuscated per-CPU-variable NULL pointer dereference as the diagnostic. This commit therefore checks for NULL ->sda and makes a more polite console-message complaint in that case. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 62 ++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9dd048989027..b7138dbe1a2d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1477,37 +1477,43 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) idx = ssp->srcu_idx & 0x1; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; - pr_alert("%s%s Tree SRCU g%ld state %d (%s) per-CPU(idx=%d):", + pr_alert("%s%s Tree SRCU g%ld state %d (%s)", tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, - srcu_size_state_name[ss_state_idx], idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *sdp; - - sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(sdp->srcu_unlock_count[!idx]); - u1 = data_race(sdp->srcu_unlock_count[idx]); + srcu_size_state_name[ss_state_idx]); + if (!ssp->sda) { + // Called after cleanup_srcu_struct(), perhaps. + pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n"); + } else { + pr_cont(" per-CPU(idx=%d):", idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *sdp; - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = data_race(sdp->srcu_lock_count[!idx]); - l1 = data_race(sdp->srcu_lock_count[idx]); - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %c)", - cpu, c0, c1, - "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); - s0 += c0; - s1 += c1; + sdp = per_cpu_ptr(ssp->sda, cpu); + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); } - pr_cont(" T(%ld,%ld)\n", s0, s1); if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2) WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC); } -- cgit v1.2.3 From 46470cf85d2b61abd37c6f66c4dacc1bc510d10f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 13:20:49 -0800 Subject: srcu: Prevent cleanup_srcu_struct() from freeing non-dynamic ->sda When an srcu_struct structure is created (but not in a kernel module) by DEFINE_SRCU() and friends, the per-CPU srcu_data structure is statically allocated. In all other cases, that structure is obtained from alloc_percpu(), in which case cleanup_srcu_struct() must invoke free_percpu() on the resulting ->sda pointer in the srcu_struct pointer. Which it does. Except that it also invokes free_percpu() on the ->sda pointer referencing the statically allocated per-CPU srcu_data structures. Which free_percpu() is surprisingly OK with. This commit nevertheless stops cleanup_srcu_struct() from freeing statically allocated per-CPU srcu_data structures. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 1 + kernel/rcu/srcutree.c | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 44e998643f48..44bd204498a1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -73,6 +73,7 @@ struct srcu_struct { unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ struct completion srcu_barrier_completion; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b7138dbe1a2d..7209fd95dde9 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -217,6 +217,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_barrier_mutex); atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->work, process_srcu); + ssp->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) @@ -226,7 +227,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!is_static) { + if (!ssp->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; return -ENOMEM; @@ -446,8 +447,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(ssp->sda); - ssp->sda = NULL; + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } kfree(ssp->node); ssp->node = NULL; ssp->srcu_size_state = SRCU_SIZE_SMALL; -- cgit v1.2.3 From ee5e2448bceb9400aa27207f0c0220f9dedd85eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 13:47:42 -0800 Subject: srcu: Explain srcu_funnel_gp_start() call to list_add() is safe This commit adds a comment explaining why an unprotected call to list_add() from srcu_funnel_gp_start() can be safe. TL;DR: It is only called during very early boot when we don't have no steeking concurrency! Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 7209fd95dde9..64993a172cff 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -776,6 +776,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); srcu_gp_start(ssp); + + // And how can that list_add() in the "else" clause + // possibly be safe for concurrent execution? Well, + // it isn't. And it does not have to be. After all, it + // can only be executed during early boot when there is only + // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) queue_delayed_work(rcu_gp_wq, &ssp->work, srcu_get_delay(ssp)); -- cgit v1.2.3 From 99659f64b14e55cfa48980f5396f83820bafd028 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 14:56:39 -0800 Subject: srcu: Create concurrency-safe helper for initiating size transition Once there are contention-initiated size transitions, it will be possible for rcutorture to initiate a transition at the same time as a contention-initiated transition. This commit therefore creates a concurrency-safe helper function named srcu_transition_to_big() to safely initiate size transitions. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 64993a172cff..c9460374d437 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -272,6 +272,25 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* + * Initiate an idempotent transition to SRCU_SIZE_BIG. + */ +static void srcu_transition_to_big(struct srcu_struct *ssp) +{ + unsigned long flags; + + /* Double-checked locking on ->srcu_size-state. */ + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + return; + spin_lock_irqsave_rcu_node(ssp, flags); + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp, flags); + return; + } + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); + spin_unlock_irqrestore_rcu_node(ssp, flags); +} + /* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be @@ -1523,8 +1542,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) } pr_cont(" T(%ld,%ld)\n", s0, s1); } - if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2) - WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC); + if (convert_to_big == 2) + srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); -- cgit v1.2.3 From 9f2e91d94c91558e3764fe4e01c5e6281a90f239 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 20:32:05 -0800 Subject: srcu: Add contention-triggered addition of srcu_node tree This commit instruments the acquisitions of the srcu_struct structure's ->lock, enabling the initiation of a transition from SRCU_SIZE_SMALL to SRCU_SIZE_BIG when sufficient contention is experienced. The instrumentation counts the number of trylock failures within the confines of a single jiffy. If that number exceeds the value specified by the srcutree.small_contention_lim kernel boot parameter (which defaults to 100), and if the value specified by the srcutree.convert_to_big kernel boot parameter has the 0x10 bit set (defaults to 0), then a transition will be automatically initiated. By default, there will never be any transitions, so that none of the srcu_struct structures ever gains an srcu_node array. The useful values for srcutree.convert_to_big are: 0x00: Never convert. 0x01: Always convert at init_srcu_struct() time. 0x02: Convert when rcutorture prints its first round of statistics. 0x03: Decide conversion approach at boot given system size. 0x10: Convert if contention is encountered. 0x12: Convert if contention is encountered or when rcutorture prints its first round of statistics, whichever comes first. The value 0x11 acts the same as 0x01 because the conversion happens before there is any chance of contention. [ paulmck: Apply "static" feedback from kernel test robot. ] Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 9 ++ include/linux/srcutree.h | 2 + kernel/rcu/srcutree.c | 107 ++++++++++++++++++------ 3 files changed, 94 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1f1fcac7777d..177e688768c0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5616,6 +5616,7 @@ 0: Never. 1: At init_srcu_struct() time. 2: When rcutorture decides to. + 0x1X: Above plus if high contention. Either way, the srcu_node tree will be sized based on the actual runtime number of CPUs (nr_cpu_ids) @@ -5638,6 +5639,14 @@ expediting. Set to zero to disable automatic expediting. + srcutree.small_contention_lim [KNL] + Specifies the number of update-side contention + events per jiffy will be tolerated before + initiating a conversion of an srcu_struct + structure to big form. Note that the value of + srcutree.convert_to_big must have the 0x10 bit + set for contention-based conversions to occur. + ssbd= [ARM64,HW] Speculative Store Bypass Disable control diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 44bd204498a1..1b9ff4ed37e4 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -72,6 +72,8 @@ struct srcu_struct { unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ + unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ + unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c9460374d437..0bc6a0a3edee 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -41,13 +41,29 @@ module_param(counter_wrap_check, ulong, 0444); /* * Control conversion to SRCU_SIZE_BIG: - * 0: Don't convert at all (default). - * 1: Convert at init_srcu_struct() time. - * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 0: Don't convert at all (default). + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 3: Decide at boot time based on system shape. + * 0x1x: Convert when excessive contention encountered. */ -static int convert_to_big; +#define SRCU_SIZING_NONE 0 +#define SRCU_SIZING_INIT 1 +#define SRCU_SIZING_TORTURE 2 +#define SRCU_SIZING_AUTO 3 +#define SRCU_SIZING_CONTEND 0x10 +#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x) +#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE)) +#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) +#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) +#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) +static int convert_to_big = SRCU_SIZING_NONE; module_param(convert_to_big, int, 0444); +/* Contention events per jiffy to initiate transition to big. */ +static int small_contention_lim __read_mostly = 100; +module_param(small_contention_lim, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -58,31 +74,40 @@ static void process_srcu(struct work_struct *work); static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irq_rcu_node(p) \ +#define spin_unlock_irq_rcu_node(p) \ spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ +#define spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* * Initialize SRCU per-CPU data. Note that statically allocated @@ -225,7 +250,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) init_srcu_struct_data(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) { + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { if (!ssp->sda_is_static) { free_percpu(ssp->sda); @@ -272,6 +297,15 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* + * Initiate a transition to SRCU_SIZE_BIG with lock held. + */ +static void __srcu_transition_to_big(struct srcu_struct *ssp) +{ + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); +} + /* * Initiate an idempotent transition to SRCU_SIZE_BIG. */ @@ -287,10 +321,35 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) spin_unlock_irqrestore_rcu_node(ssp, flags); return; } - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); + __srcu_transition_to_big(ssp); spin_unlock_irqrestore_rcu_node(ssp, flags); } +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + unsigned long j; + + if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + return; + j = jiffies; + if (ssp->srcu_size_jiffies != j) { + ssp->srcu_size_jiffies = j; + ssp->srcu_n_lock_retries = 0; + } + if (++ssp->srcu_n_lock_retries <= small_contention_lim) + return; + __srcu_transition_to_big(ssp); +} + /* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be @@ -718,7 +777,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); @@ -779,7 +838,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -1542,7 +1601,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) } pr_cont(" T(%ld,%ld)\n", s0, s1); } - if (convert_to_big == 2) + if (SRCU_SIZING_IS_TORTURE()) srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); -- cgit v1.2.3 From beb84099f1cf51e005e5df77d05b1644e490409e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Feb 2022 16:42:04 +0100 Subject: rcu: Remove rcu_is_nocb_cpu() The rcu_is_nocb_cpu() function is no longer used, so this commmit removes it. Reported-by: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 -- kernel/rcu/tree_nocb.h | 8 -------- 2 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..4c53a2ccf711 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -538,10 +538,8 @@ extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU -bool rcu_is_nocb_cpu(int cpu); void rcu_bind_current_to_nocb(void); #else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 636d0546a4e9..02e1d05a11fc 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -215,14 +215,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - static bool __wake_nocb_gp(struct rcu_data *rdp_gp, struct rcu_data *rdp, bool force, unsigned long flags) -- cgit v1.2.3 From 8d2aaa9b7c290e766a41f29c71ec72192851d538 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 14 Feb 2022 14:23:39 +0100 Subject: rcu/nocb: Move rcu_nocb_is_setup to rcu_state This commit moves the RCU nocb initialization witness within rcu_state to consolidate RCU's global state. Reported-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_nocb.h | 13 +++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..f6a3d54585c9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -364,6 +364,7 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ + int nocb_is_setup; /* nocb is setup from boot */ }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 02e1d05a11fc..3c00240833d6 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,9 +60,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ - -static bool rcu_nocb_is_setup; - static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); @@ -72,7 +69,7 @@ static int __init rcu_nocb_setup(char *str) cpumask_setall(rcu_nocb_mask); } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; return 1; } __setup("rcu_nocbs", rcu_nocb_setup); @@ -1172,10 +1169,10 @@ void __init rcu_init_nohz(void) return; } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; } - if (!rcu_nocb_is_setup) + if (!rcu_state.nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1233,7 +1230,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) struct task_struct *t; struct sched_param sp; - if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup) return; /* If there already is an rcuo kthread, then nothing to do. */ @@ -1279,7 +1276,7 @@ static void __init rcu_spawn_nocb_kthreads(void) { int cpu; - if (rcu_nocb_is_setup) { + if (rcu_state.nocb_is_setup) { for_each_online_cpu(cpu) rcu_spawn_cpu_nocb_kthread(cpu); } -- cgit v1.2.3 From 2eed973adc6e749439730e53e6220b122398d319 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Feb 2022 16:42:06 +0100 Subject: rcu: Assume rcu_init() is called before smp The rcu_init() function is called way before SMP is initialized and therefore only the boot CPU should be online at this stage. Simplify the boot per-cpu initialization accordingly. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..e6a9e5744e45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4782,7 +4782,7 @@ static void __init kfree_rcu_batch_init(void) void __init rcu_init(void) { - int cpu; + int cpu = smp_processor_id(); rcu_early_boot_tests(); @@ -4802,11 +4802,10 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) { - rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); - rcutree_online_cpu(cpu); - } + WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. + rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); -- cgit v1.2.3 From 3352911fa9b47a90165e5c6fed440048c55146d1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Feb 2022 16:42:07 +0100 Subject: rcu: Initialize boost kthread only for boot node prior SMP initialization The rcu_spawn_gp_kthread() function is called as an early initcall, which means that SMP initialization hasn't happened yet and only the boot CPU is online. Therefore, create only the boost kthread for the leaf node of the boot CPU. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 16 ---------------- 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e6a9e5744e45..70b33c55d39a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4480,6 +4480,7 @@ static int __init rcu_spawn_gp_kthread(void) struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); @@ -4498,7 +4499,9 @@ static int __init rcu_spawn_gp_kthread(void) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); - rcu_spawn_boost_kthreads(); + /* This is a pre-SMP initcall, we expect a single CPU */ + WARN_ON(num_online_cpus() > 1); + rcu_spawn_one_boost_kthread(rdp->mynode); rcu_spawn_core_kthreads(); return 0; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f6a3d54585c9..b2a0f2613ab9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -422,7 +422,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); -static void __init rcu_spawn_boost_kthreads(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8360d86db1c0..b139635f33bd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1226,18 +1226,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -/* - * Spawn boost kthreads -- called as soon as the scheduler is running. - */ -static void __init rcu_spawn_boost_kthreads(void) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) - if (rcu_rnp_online_cpus(rnp)) - rcu_spawn_one_boost_kthread(rnp); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1263,10 +1251,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static void __init rcu_spawn_boost_kthreads(void) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* -- cgit v1.2.3 From 87c5adf06bfbf14c9d13e59d5d174ff5f2aafc0e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Feb 2022 16:42:08 +0100 Subject: rcu/nocb: Initialize nocb kthreads only for boot CPU prior SMP initialization The rcu_spawn_gp_kthread() function is called as an early initcall, which means that SMP initialization hasn't happened yet and only the boot CPU is online. Therefore, create only the NOCB kthreads related to the boot CPU. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++++- kernel/rcu/tree.h | 1 - kernel/rcu/tree_nocb.h | 20 -------------------- 3 files changed, 5 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 70b33c55d39a..9f7441a78f90 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4498,9 +4498,13 @@ static int __init rcu_spawn_gp_kthread(void) smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); - rcu_spawn_nocb_kthreads(); /* This is a pre-SMP initcall, we expect a single CPU */ WARN_ON(num_online_cpus() > 1); + /* + * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu() + * due to rcu_scheduler_fully_active. + */ + rcu_spawn_cpu_nocb_kthread(smp_processor_id()); rcu_spawn_one_boost_kthread(rdp->mynode); rcu_spawn_core_kthreads(); return 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b2a0f2613ab9..25dc4166f218 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -439,7 +439,6 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); -static void __init rcu_spawn_nocb_kthreads(void); static void show_rcu_nocb_state(struct rcu_data *rdp); static void rcu_nocb_lock(struct rcu_data *rdp); static void rcu_nocb_unlock(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3c00240833d6..46694e13398a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1266,22 +1266,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - if (rcu_state.nocb_is_setup) { - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); - } -} - /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ static int rcu_nocb_gp_stride = -1; module_param(rcu_nocb_gp_stride, int, 0444); @@ -1538,10 +1522,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) { } -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - static void show_rcu_nocb_state(struct rcu_data *rdp) { } -- cgit v1.2.3 From f75fd4b9221d93177c50dcfde671b2e907f53e86 Mon Sep 17 00:00:00 2001 From: Padmanabha Srinivasaiah Date: Thu, 17 Feb 2022 16:25:19 +0100 Subject: rcu-tasks: Fix race in schedule and flush work While booting secondary CPUs, cpus_read_[lock/unlock] is not keeping online cpumask stable. The transient online mask results in below calltrace. [ 0.324121] CPU1: Booted secondary processor 0x0000000001 [0x410fd083] [ 0.346652] Detected PIPT I-cache on CPU2 [ 0.347212] CPU2: Booted secondary processor 0x0000000002 [0x410fd083] [ 0.377255] Detected PIPT I-cache on CPU3 [ 0.377823] CPU3: Booted secondary processor 0x0000000003 [0x410fd083] [ 0.379040] ------------[ cut here ]------------ [ 0.383662] WARNING: CPU: 0 PID: 10 at kernel/workqueue.c:3084 __flush_work+0x12c/0x138 [ 0.384850] Modules linked in: [ 0.385403] CPU: 0 PID: 10 Comm: rcu_tasks_rude_ Not tainted 5.17.0-rc3-v8+ #13 [ 0.386473] Hardware name: Raspberry Pi 4 Model B Rev 1.4 (DT) [ 0.387289] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 0.388308] pc : __flush_work+0x12c/0x138 [ 0.388970] lr : __flush_work+0x80/0x138 [ 0.389620] sp : ffffffc00aaf3c60 [ 0.390139] x29: ffffffc00aaf3d20 x28: ffffffc009c16af0 x27: ffffff80f761df48 [ 0.391316] x26: 0000000000000004 x25: 0000000000000003 x24: 0000000000000100 [ 0.392493] x23: ffffffffffffffff x22: ffffffc009c16b10 x21: ffffffc009c16b28 [ 0.393668] x20: ffffffc009e53861 x19: ffffff80f77fbf40 x18: 00000000d744fcc9 [ 0.394842] x17: 000000000000000b x16: 00000000000001c2 x15: ffffffc009e57550 [ 0.396016] x14: 0000000000000000 x13: ffffffffffffffff x12: 0000000100000000 [ 0.397190] x11: 0000000000000462 x10: ffffff8040258008 x9 : 0000000100000000 [ 0.398364] x8 : 0000000000000000 x7 : ffffffc0093c8bf4 x6 : 0000000000000000 [ 0.399538] x5 : 0000000000000000 x4 : ffffffc00a976e40 x3 : ffffffc00810444c [ 0.400711] x2 : 0000000000000004 x1 : 0000000000000000 x0 : 0000000000000000 [ 0.401886] Call trace: [ 0.402309] __flush_work+0x12c/0x138 [ 0.402941] schedule_on_each_cpu+0x228/0x278 [ 0.403693] rcu_tasks_rude_wait_gp+0x130/0x144 [ 0.404502] rcu_tasks_kthread+0x220/0x254 [ 0.405264] kthread+0x174/0x1ac [ 0.405837] ret_from_fork+0x10/0x20 [ 0.406456] irq event stamp: 102 [ 0.406966] hardirqs last enabled at (101): [] _raw_spin_unlock_irq+0x78/0xb4 [ 0.408304] hardirqs last disabled at (102): [] el1_dbg+0x24/0x5c [ 0.409410] softirqs last enabled at (54): [] local_bh_enable+0xc/0x2c [ 0.410645] softirqs last disabled at (50): [] local_bh_disable+0xc/0x2c [ 0.411890] ---[ end trace 0000000000000000 ]--- [ 0.413000] smp: Brought up 1 node, 4 CPUs [ 0.413762] SMP: Total of 4 processors activated. [ 0.414566] CPU features: detected: 32-bit EL0 Support [ 0.415414] CPU features: detected: 32-bit EL1 Support [ 0.416278] CPU features: detected: CRC32 instructions [ 0.447021] Callback from call_rcu_tasks_rude() invoked. [ 0.506693] Callback from call_rcu_tasks() invoked. This commit therefore fixes this issue by applying a single-CPU optimization to the RCU Tasks Rude grace-period process. The key point here is that the purpose of this RCU flavor is to force a schedule on each online CPU since some past event. But the rcu_tasks_rude_wait_gp() function runs in the context of the RCU Tasks Rude's grace-period kthread, so there must already have been a context switch on the current CPU since the call to either synchronize_rcu_tasks_rude() or call_rcu_tasks_rude(). So if there is only a single CPU online, RCU Tasks Rude's grace-period kthread does not need to anything at all. It turns out that the rcu_tasks_rude_wait_gp() function's call to schedule_on_each_cpu() causes problems during early boot. During that time, there is only one online CPU, namely the boot CPU. Therefore, applying this single-CPU optimization fixes early-boot instances of this problem. Link: https://lore.kernel.org/lkml/20220210184319.25009-1-treasure4paddy@gmail.com/T/ Suggested-by: Paul E. McKenney Signed-off-by: Padmanabha Srinivasaiah Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 99cf3a13954c..b43320b149d2 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -950,6 +950,9 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { + if (num_online_cpus() <= 1) + return; // Fastpath for only one CPU. + rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } -- cgit v1.2.3 From f25390033fa2445cdc4d6cf8243a9b85d942845f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Feb 2022 16:01:12 -0800 Subject: rcu-tasks: Print pre-stall-warning informational messages RCU-tasks stall-warning messages are printed after the grace period is ten minutes old. Unfortunately, most of us will have rebooted the system in response to an apparently-hung command long before the ten minutes is up, and will thus see what looks to be a silent hang. This commit therefore adds pr_info() messages that are printed earlier. These should avoid being classified as errors, but should give impatient users a hint. These are controlled by new rcupdate.rcu_task_stall_info and rcupdate.rcu_task_stall_info_mult kernel-boot parameters. The former defines the initial delay in jiffies (defaulting to 10 seconds) and the latter defines the multiplier (defaulting to 3). Thus, by default, the first message will appear 10 seconds into the RCU-tasks grace period, the second 40 seconds in, and the third 160 seconds in. There would be a fourth at 640 seconds in, but the stall warning message appears 600 seconds in, and once a stall warning is printed for a given grace period, no further informational messages are printed. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 30 +++++++++++++++++-- kernel/rcu/tasks.h | 40 +++++++++++++++++++++---- 2 files changed, 62 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..babc701d4864 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4955,10 +4955,34 @@ number avoids disturbing real-time workloads, but lengthens grace periods. + rcupdate.rcu_task_stall_info= [KNL] + Set initial timeout in jiffies for RCU task stall + informational messages, which give some indication + of the problem for those not patient enough to + wait for ten minutes. Informational messages are + only printed prior to the stall-warning message + for a given grace period. Disable with a value + less than or equal to zero. Defaults to ten + seconds. A change in value does not take effect + until the beginning of the next grace period. + + rcupdate.rcu_task_stall_info_mult= [KNL] + Multiplier for time interval between successive + RCU task stall informational messages for a given + RCU tasks grace period. This value is clamped + to one through ten, inclusive. It defaults to + the value three, so that the first informational + message is printed 10 seconds into the grace + period, the second at 40 seconds, the third at + 160 seconds, and then the stall warning at 600 + seconds would prevent a fourth at 640 seconds. + rcupdate.rcu_task_stall_timeout= [KNL] - Set timeout in jiffies for RCU task stall warning - messages. Disable with a value less than or equal - to zero. + Set timeout in jiffies for RCU task stall + warning messages. Disable with a value less + than or equal to zero. Defaults to ten minutes. + A change in value does not take effect until + the beginning of the next grace period. rcupdate.rcu_self_test= [KNL] Run the RCU early boot self tests diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b43320b149d2..76799c81d4be 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -143,6 +143,11 @@ module_param(rcu_task_ipi_delay, int, 0644); #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +#define RCU_TASK_STALL_INFO (HZ * 10) +static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO; +module_param(rcu_task_stall_info, int, 0644); +static int rcu_task_stall_info_mult __read_mostly = 3; +module_param(rcu_task_stall_info_mult, int, 0444); static int rcu_task_enqueue_lim __read_mostly = -1; module_param(rcu_task_enqueue_lim, int, 0444); @@ -548,8 +553,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) static void __init rcu_tasks_bootup_oddness(void) { #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + int rtsimc; + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + rtsimc = clamp(rcu_task_stall_info_mult, 1, 10); + if (rtsimc != rcu_task_stall_info_mult) { + pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc); + rcu_task_stall_info_mult = rtsimc; + } #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RCU pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); @@ -592,10 +604,15 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t); /* Wait for one RCU-tasks grace period. */ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) { - struct task_struct *g, *t; - unsigned long lastreport; - LIST_HEAD(holdouts); + struct task_struct *g; int fract; + LIST_HEAD(holdouts); + unsigned long j; + unsigned long lastinfo; + unsigned long lastreport; + bool reported = false; + int rtsi; + struct task_struct *t; set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); rtp->pregp_func(); @@ -621,6 +638,8 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * is empty, we are done. */ lastreport = jiffies; + lastinfo = lastreport; + rtsi = READ_ONCE(rcu_task_stall_info); // Start off with initial wait and slowly back off to 1 HZ wait. fract = rtp->init_fract; @@ -630,7 +649,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) bool needreport; int rtst; - /* Slowly back off waiting for holdouts */ + // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); schedule_timeout_idle(fract); @@ -639,12 +658,23 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); - if (needreport) + if (needreport) { lastreport = jiffies; + reported = true; + } firstreport = true; WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); rtp->holdouts_func(&holdouts, needreport, &firstreport); + + // Print pre-stall informational messages if needed. + j = jiffies; + if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) { + lastinfo = j; + rtsi = rtsi * rcu_task_stall_info_mult; + pr_info("%s: %s grace period %lu is %lu jiffies old.\n", + __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start); + } } set_tasks_gp_state(rtp, RTGS_POST_GP); -- cgit v1.2.3 From 88db792bbe9b140680c74e9f2f801ac00f54e05e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Mar 2022 12:07:25 +0100 Subject: rcu-tasks: Use rcuwait for the rcu_tasks_kthread() The waitqueue used by rcu_tasks_kthread() has always only one waiter. With a guaranteed only one waiter, this can be replaced with rcuwait which is smaller and simpler. With rcuwait based wake counterpart, the irqwork function (call_rcu_tasks_iw_wakeup()) can be invoked hardirq context because it is only a wake up and no sleeping locks are involved (unlike the wait_queue_head). As a side effect, this is also one piece of the puzzle to pass the RCU selftest at early boot on PREEMPT_RT. Replace wait_queue_head with rcuwait and let the irqwork run in hardirq context on PREEMPT_RT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 76799c81d4be..4b91cb214ca7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -46,7 +46,7 @@ struct rcu_tasks_percpu { /** * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. - * @cbs_wq: Wait queue allowing new callback to get kthread's attention. + * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. @@ -77,7 +77,7 @@ struct rcu_tasks_percpu { * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct wait_queue_head cbs_wq; + struct rcuwait cbs_wait; raw_spinlock_t cbs_gbl_lock; int gp_state; int gp_sleep; @@ -113,11 +113,11 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); #define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ - .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ + .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \ }; \ static struct rcu_tasks rt_name = \ { \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ .gp_func = gp, \ .call_func = call, \ @@ -266,7 +266,7 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); rtp = rtpcp->rtpp; - wake_up(&rtp->cbs_wq); + rcuwait_wake_up(&rtp->cbs_wait); } // Enqueue a callback for the specified flavor of Tasks RCU. @@ -514,7 +514,9 @@ static int __noreturn rcu_tasks_kthread(void *arg) set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* If there were none, wait a bit and start over. */ - wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); if (needgpcb & 0x2) { // Wait for one grace period. -- cgit v1.2.3 From 5d90070816534882b9158f14154b7e2cdef1194a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Mar 2022 10:41:44 -0800 Subject: rcu-tasks: Make Tasks RCU account for userspace execution The main Tasks RCU quiescent state is voluntary context switch. However, userspace execution is also a valid quiescent state, and is a valuable one for userspace applications that spin repeatedly executing light-weight non-sleeping system calls. Currently, such an application can delay a Tasks RCU grace period for many tens of seconds. This commit therefore enlists the aid of the scheduler-clock interrupt to provide a Tasks RCU quiescent state when it interrupted a task executing in userspace. [ paulmck: Apply feedback from kernel test robot. ] Cc: Martin KaFai Lau Cc: Neil Spring Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 1 + kernel/rcu/tree.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e7c39c200e2b..1a32036c918c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -196,6 +196,7 @@ void synchronize_rcu_tasks_rude(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +#define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..8dbfb63f0391 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2624,6 +2624,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + if (user) + rcu_tasks_classic_qs(current, false); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); -- cgit v1.2.3 From 777570d9ef820e470736fa9e02b8e3e48891c050 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 8 Mar 2022 09:54:13 -0800 Subject: rcu-tasks: Use schedule_hrtimeout_range() to wait for grace periods The synchronous RCU-tasks grace-period-wait primitives invoke schedule_timeout_idle() to give readers a chance to exit their read-side critical sections. Unfortunately, this fails during early boot on PREEMPT_RT because PREEMPT_RT relies solely on ksoftirqd to run timer handlers. Because ksoftirqd cannot operate until its kthreads are spawned, there is a brief period of time following scheduler initialization where PREEMPT_RT cannot run the timer handlers that schedule_timeout_idle() relies on, resulting in a hang. To avoid this boot-time hang, this commit replaces schedule_timeout_idle() with schedule_hrtimeout(), so that the timer expires in hardirq context. This is ensures that the timer fires even on PREEMPT_RT throughout the irqs-enabled portions of boot as well as during runtime. The timer is set to expire between fract and fract + HZ / 2 jiffies in order to align with any other timers that might expire during that time, thus reducing the number of wakeups. Note that RCU-tasks grace periods are infrequent, so the use of hrtimer should be fine. In contrast, in common-case code, user of hrtimer could result in performance issues. Cc: Martin KaFai Lau Cc: Andrii Nakryiko Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4b91cb214ca7..71fe340ab82a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -647,13 +647,16 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) fract = rtp->init_fract; while (!list_empty(&holdouts)) { + ktime_t exp; bool firstreport; bool needreport; int rtst; // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_idle(fract); + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); if (fract < HZ) fract++; -- cgit v1.2.3 From bddf7122f7e321d5a677a695e8597064d987482c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Mar 2022 08:08:14 -0700 Subject: rcu-tasks: Restore use of timers for non-RT kernels The use of hrtimers for RCU-tasks grace-period delays works well in general, but can result in excessive grace-period delays for some corner-case workloads. This commit therefore reverts to the use of timers for non-RT kernels to mitigate those grace-period delays. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 71fe340ab82a..405614039515 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -654,9 +654,13 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - exp = jiffies_to_nsecs(fract); - __set_current_state(TASK_IDLE); - schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + schedule_timeout_idle(fract); + } else { + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + } if (fract < HZ) fract++; -- cgit v1.2.3 From 10b3742f939c51d53619a31a5c03055c5e0952b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Mar 2022 16:00:05 -0700 Subject: rcu-tasks: Make show_rcu_tasks_generic_gp_kthread() check all CPUs Currently, the show_rcu_tasks_generic_gp_kthread() function only looks at CPU 0's callback lists. Although this is not fatal, it can confuse debugging efforts in cases where any of the Tasks RCU flavors are in per-CPU queueing mode. This commit therefore causes this function to scan all CPUs' callback queues. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 405614039515..3aad0dfbfaf4 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -582,7 +582,17 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... + int cpu; + bool havecbs = false; + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + havecbs = true; + break; + } + } pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), @@ -590,7 +600,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], + ".C"[havecbs], s); } #endif // #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From 07d95c34e8125a7bf833a94bc3c9d51992d92c45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Apr 2022 12:30:18 -0700 Subject: rcu-tasks: Handle sparse cpu_possible_mask If the rcupdate.rcu_task_enqueue_lim kernel boot parameter is set to something greater than 1 and less than nr_cpu_ids, the code attempts to use a subset of the CPU's RCU Tasks callback lists. This works, but only if the cpu_possible_mask is contiguous. If there are "holes" in this mask, the callback-enqueue code might attempt to access a non-existent per-CPU ->rtcpu variable for a non-existent CPU. For example, if only CPUs 0, 4, 8, 12, 16 and so on are in cpu_possible_mask, specifying rcupdate.rcu_task_enqueue_lim=4 would cause the code to attempt to use callback queues for non-existent CPUs 1, 2, and 3. Because such systems have existed in the past and might still exist, the code needs to gracefully handle this situation. This commit therefore checks to see whether the desired CPU is present in cpu_possible_mask, and, if not, searches for the next CPU. This means that the systems administrator of a system with a sparse cpu_possible_mask will need to account for this sparsity when specifying the value of the rcupdate.rcu_task_enqueue_lim kernel boot parameter. For example, setting this parameter to the value 4 will use only CPUs 0 and 4, which CPU 4 getting three times the callback load of CPU 0. This commit assumes that bit (nr_cpu_ids - 1) is always set in cpu_possible_mask. Link: https://lore.kernel.org/lkml/CANn89iKaNEwyNZ=L_PQnkH0LP_XjLYrr_dpyRKNNoDJaWKdrmg@mail.gmail.com/ Signed-off-by: Eric Dumazet Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 3aad0dfbfaf4..fd70d86eb7cd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -273,7 +273,9 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { + int chosen_cpu; unsigned long flags; + int ideal_cpu; unsigned long j; bool needadjust = false; bool needwake; @@ -283,8 +285,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->func = func; local_irq_save(flags); rcu_read_lock(); - rtpcp = per_cpu_ptr(rtp->rtpcpu, - smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); + ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); + chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. j = jiffies; -- cgit v1.2.3 From ab2756ea6b74987849b44ad0e33c3cfec159033b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Apr 2022 09:21:50 -0700 Subject: rcu-tasks: Handle sparse cpu_possible_mask in rcu_tasks_invoke_cbs() If the cpu_possible_mask is sparse (for example, if bits are set only for CPUs 0, 4, 8, ...), then rcu_tasks_invoke_cbs() will access per-CPU data for a CPU not in cpu_possible_mask. It makes these accesses while doing a workqueue-based binary search for non-empty callback lists. Although this search must pass through CPUs not represented in cpu_possible_mask, it has no need to check the callback list for such CPUs. This commit therefore changes the rcu_tasks_invoke_cbs() function's binary search so as to only check callback lists for CPUs present in cpu_possible_mask. Reported-by: Eric Dumazet Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fd70d86eb7cd..3925e32159b5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -468,7 +468,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu } } - if (rcu_segcblist_empty(&rtpcp->cblist)) + if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); -- cgit v1.2.3 From 99d6a2acb8955f12489bfba04f2db22bc0b57726 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Feb 2022 12:45:18 -0800 Subject: rcutorture: Suppress debugging grace period delays during flooding Tree RCU supports grace-period delays using the rcutree.gp_cleanup_delay, rcutree.gp_init_delay, and rcutree.gp_preinit_delay kernel boot parameters. These delays are strictly for debugging purposes, and have proven quite effective at exposing bugs involving race with CPU-hotplug operations. However, these delays can result in false positives when used in conjunction with callback flooding, for example, those generated by the rcutorture.fwd_progress kernel boot parameter. This commit therefore suppresses grace-period delays while callback flooding is in progress. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ++++ kernel/rcu/rcutorture.c | 4 ++++ kernel/rcu/tree.c | 32 +++++++++++++++++++++++++++++--- 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..7a221393fcdb 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -523,6 +523,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } +static inline void rcu_gp_slow_register(atomic_t *rgssp) { } +static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); @@ -535,6 +537,8 @@ void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; extern struct workqueue_struct *rcu_par_gp_wq; +void rcu_gp_slow_register(atomic_t *rgssp); +void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 55d049c39608..f37b7a01dcd0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2916,10 +2916,12 @@ rcu_torture_cleanup(void) pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); } + rcu_gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); + rcu_gp_slow_unregister(NULL); return; } @@ -3016,6 +3018,7 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); + rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -3320,6 +3323,7 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); torture_init_end(); + rcu_gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..db67dae8ed88 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1705,11 +1705,37 @@ static void note_gp_changes(struct rcu_data *rdp) rcu_gp_kthread_wake(); } +static atomic_t *rcu_gp_slow_suppress; + +/* Register a counter to suppress debugging grace-period delays. */ +void rcu_gp_slow_register(atomic_t *rgssp) +{ + WARN_ON_ONCE(rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, rgssp); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_register); + +/* Unregister a counter, with NULL for not caring which. */ +void rcu_gp_slow_unregister(atomic_t *rgssp) +{ + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, NULL); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister); + +static bool rcu_gp_slow_is_suppressed(void) +{ + atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress); + + return rgssp && atomic_read(rgssp); +} + static void rcu_gp_slow(int delay) { - if (delay > 0 && - !(rcu_seq_ctr(rcu_state.gp_seq) % - (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + if (!rcu_gp_slow_is_suppressed() && delay > 0 && + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_idle(delay); } -- cgit v1.2.3 From 8106bddbab5f0ba180e6d693c7c1fc6926d57caa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Feb 2022 17:40:49 -0800 Subject: scftorture: Fix distribution of short handler delays The scftorture test module's scf_handler() function is supposed to provide three different distributions of short delays (including "no delay") and one distribution of long delays, if specified by the scftorture.longwait module parameter. However, the second of the two non-zero-wait short delays is disabled due to the first such delay's "goto out" not being enclosed in the "then" clause with the "udelay()". This commit therefore adjusts the code to provide the intended set of delays. Fixes: e9d338a0b179 ("scftorture: Add smp_call_function() torture test") Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index dcb0410950e4..5d113aa59e77 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -267,9 +267,10 @@ static void scf_handler(void *scfc_in) } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { - if (!(r & 0xffc0)) + if (!(r & 0xffc0)) { udelay(r & 0x3f); - goto out; + goto out; + } } if (r & 0xfff) goto out; -- cgit v1.2.3 From 39b3cab92d3754e18b1f9b5de8158642145b2405 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 7 Mar 2022 14:46:55 -0800 Subject: rcutorture: Avoid corner-case #DE with nsynctypes check The rcutorture module is used to run torture tests that validate RCU. rcutorture takes a variety of module parameters that configure the functionality of the test. Amongst these parameters are the types of synchronization mechanisms that the rcu_torture_writer and rcu_torture_fakewriter tasks may use, and the torture_type of the run which determines what read and sync operations are used by the various writer and reader tasks that run throughout the test. When the module is configured to only use sync types for which the specified torture_type does not implement the necessary operations, we can end up in a state where nsynctypes is 0. This is not an erroneous state, but it currently crashes the kernel with a #DE due to nsynctypes being used with a modulo operator in rcu_torture_fakewriter(). Here is an example of such a #DE: $ insmod ./rcutorture.ko gp_cond=1 gp_cond_exp=0 gp_exp=0 gp_poll_exp=0 gp_normal=0 gp_poll=0 gp_poll_exp=0 verbose=9999 torture_type=trivial ... [ 8536.525096] divide error: 0000 [#1] PREEMPT SMP PTI [ 8536.525101] CPU: 30 PID: 392138 Comm: rcu_torture_fak Kdump: loaded Tainted: G S 5.17.0-rc1-00179-gc8c42c80febd #24 [ 8536.525105] Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A23 12/08/2020 [ 8536.525106] RIP: 0010:rcu_torture_fakewriter+0xf1/0x2d0 [rcutorture] [ 8536.525121] Code: 00 31 d2 8d 0c f5 00 00 00 00 48 63 c9 48 f7 f1 48 85 d2 0f 84 79 ff ff ff 48 89 e7 e8 78 78 01 00 48 63 0d 29 ca 00 00 31 d2 <48> f7 f1 8b 04 95 00 05 4e a0 83 f8 06 0f 84 ad 00 00 00 7f 1f 83 [ 8536.525124] RSP: 0018:ffffc9000777fef0 EFLAGS: 00010246 [ 8536.525127] RAX: 00000000223d006e RBX: cccccccccccccccd RCX: 0000000000000000 [ 8536.525130] RDX: 0000000000000000 RSI: ffffffff824315b9 RDI: ffffc9000777fef0 [ 8536.525132] RBP: ffffc9000487bb30 R08: 0000000000000002 R09: 000000000002a580 [ 8536.525134] R10: ffffffff82c5f920 R11: 0000000000000000 R12: ffff8881a2c35d00 [ 8536.525136] R13: ffff8881540c8d00 R14: ffffffffa04d39d0 R15: 0000000000000000 [ 8536.525137] FS: 0000000000000000(0000) GS:ffff88903ff80000(0000) knlGS:0000000000000000 [ 8536.525140] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8536.525142] CR2: 00007f839f022000 CR3: 0000000002c0a006 CR4: 00000000007706e0 [ 8536.525144] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 8536.525145] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 8536.525147] PKRU: 55555554 [ 8536.525148] Call Trace: [ 8536.525150] [ 8536.525153] kthread+0xe8/0x110 [ 8536.525161] ? kthread_complete_and_exit+0x20/0x20 [ 8536.525167] ret_from_fork+0x22/0x30 [ 8536.525174] The solution is to gracefully handle the case of nsynctypes being 0 in rcu_torture_fakewriter() by not performing any work. This is already being done in rcu_torture_writer(), though there is a missing return on that path which will be fixed in a subsequent patch. Signed-off-by: David Vernet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f37b7a01dcd0..d5105fb6c980 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1322,6 +1322,17 @@ rcu_torture_fakewriter(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, MAX_NICE); + if (WARN_ONCE(nsynctypes == 0, + "%s: No update-side primitives.\n", __func__)) { + /* + * No updates primitives, so don't try updating. + * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ + torture_kthread_stopping("rcu_torture_fakewriter"); + return 0; + } + do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && -- cgit v1.2.3 From 80dcee695143255261f30c7cc2a041ba413717a4 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 7 Mar 2022 14:46:57 -0800 Subject: rcutorture: Add missing return and use __func__ in warning The rcutorture module has an rcu_torture_writer task that repeatedly performs writes, synchronizations, and deletes. There is a corner-case check in rcu_torture_writer() wherein if nsynctypes is 0, a warning is issued and the task waits to be stopped via a call to torture_kthread_stopping() rather than performing any work. There should be a return statement following this call to torture_kthread_stopping(), as the intention with issuing the call to torture_kthread_stopping() in the first place is to avoid the rcu_torture_writer task from performing any work. Some of the work may even be dangerous to perform, such as potentially causing a #DE due to nsynctypes being used in a modulo operator when querying for sync updates to issue. This patch adds the missing return call. As a bonus, it also fixes a checkpatch warning that was emitted due to the WARN_ONCE() call using the name of the function rather than __func__. Signed-off-by: David Vernet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d5105fb6c980..f1292d9e86b5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1178,7 +1178,7 @@ rcu_torture_writer(void *arg) " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, - "rcu_torture_writer: No update-side primitives.\n")) { + "%s: No update-side primitives.\n", __func__)) { /* * No updates primitives, so don't try updating. * The resulting test won't be testing much, hence the @@ -1186,6 +1186,7 @@ rcu_torture_writer(void *arg) */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); + return 0; } do { -- cgit v1.2.3 From 46e861be589881e0905b9ade3d8439883858721c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 09:30:10 -0700 Subject: rcu: Make TASKS_RUDE_RCU select IRQ_WORK The TASKS_RUDE_RCU does not select IRQ_WORK, which can result in build failures for kernels that do not otherwise select IRQ_WORK. This commit therefore causes the TASKS_RUDE_RCU Kconfig option to select IRQ_WORK. Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..f559870fbf8b 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -86,6 +86,7 @@ config TASKS_RCU config TASKS_RUDE_RCU def_bool 0 + select IRQ_WORK help This option enables a task-based RCU implementation that uses only context switch (including preemption) and user-mode -- cgit v1.2.3 From d22959aa93528c6cf4583560696856cf6bba6b72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Feb 2022 09:10:04 -0800 Subject: rcu: Clarify fill-the-gap comment in rcu_segcblist_advance() Reported-by: Frederic Weisbecker Reported-by: Neeraj Upadhyay Reported-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 81145c3ece25..c54ea2b6a36b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -505,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. + * Callbacks moved, so there might be an empty RCU_WAIT_TAIL + * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the + * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap + * created by the now-ready-to-invoke segments. */ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) -- cgit v1.2.3 From 90d2efe7bdbde5371b6122174af0718843f805c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Feb 2022 09:54:56 -0800 Subject: rcu: Fix rcu_preempt_deferred_qs_irqrestore() strict QS reporting Suppose we have a kernel built with both CONFIG_RCU_STRICT_GRACE_PERIOD=y and CONFIG_PREEMPT=y. Suppose further that an RCU reader from which RCU core needs a quiescent state ends in rcu_preempt_deferred_qs_irqrestore(). This function will then invoke rcu_report_qs_rdp() in order to immediately report that quiescent state. Unfortunately, it will not have cleared that reader's CPU's rcu_data structure's ->cpu_no_qs.b.norm field. As a result, rcu_report_qs_rdp() will take an early exit because it will believe that this CPU has not yet encountered a quiescent state, and there will be no reporting of the current quiescent state. This commit therefore causes rcu_preempt_deferred_qs_irqrestore() to clear the ->cpu_no_qs.b.norm field before invoking rcu_report_qs_rdp(). Kudos to Boqun Feng and Neeraj Upadhyay for helping with analysis of this issue! Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8360d86db1c0..176639c6215f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -486,6 +486,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { -- cgit v1.2.3 From c708b08c65a0dfae127b9ee33b0fb73535a5e066 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Feb 2022 17:29:37 -0800 Subject: rcu: Check for jiffies going backwards A report of a 12-jiffy normal RCU CPU stall warning raises interesting questions about the nature of time on the offending system. This commit instruments rcu_sched_clock_irq(), which is RCU's hook into the scheduling-clock interrupt, checking for the jiffies counter going backwards. Reported-by: Saravanan D Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++++ kernel/rcu/tree.h | 1 + 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..a5ea67454640 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1679,6 +1679,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); + if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -2609,6 +2611,13 @@ static void rcu_do_batch(struct rcu_data *rdp) */ void rcu_sched_clock_irq(int user) { + unsigned long j; + + if (IS_ENABLED(CONFIG_PROVE_RCU)) { + j = jiffies; + WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock))); + __this_cpu_write(rcu_data.last_sched_clock, j); + } trace_rcu_utilization(TPS("Start scheduler-tick")); lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -4179,6 +4188,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..94b55f669915 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -254,6 +254,7 @@ struct rcu_data { unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ short rcu_onl_gp_flags; /* ->gp_flags at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ + unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ int cpu; }; -- cgit v1.2.3 From 3791a22374715b36ad806db13d8b2afb1b57fd36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Feb 2022 18:08:33 -0800 Subject: kernel/smp: Provide boot-time timeout for CSD lock diagnostics Debugging of problems involving insanely long-running SMI handlers proceeds better if the CSD-lock timeout can be adjusted. This commit therefore provides a new smp.csd_lock_timeout kernel boot parameter that specifies the timeout in milliseconds. The default remains at the previously hard-coded value of five seconds. [ paulmck: Apply feedback from Juergen Gross. ] Cc: Rik van Riel Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Reviewed-by: Juergen Gross Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 11 +++++++++++ kernel/smp.c | 7 +++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..645c4c001b16 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5377,6 +5377,17 @@ smart2= [HW] Format: [,[,...,]] + smp.csd_lock_timeout= [KNL] + Specify the period of time in milliseconds + that smp_call_function() and friends will wait + for a CPU to release the CSD lock. This is + useful when diagnosing bugs involving CPUs + disabling interrupts for extended periods + of time. Defaults to 5,000 milliseconds, and + setting a value of zero disables this feature. + This feature may be more efficiently disabled + using the csdlock_debug- kernel parameter. + smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port smsc-ircc2.ircc_sir= [HW] SIR base I/O port diff --git a/kernel/smp.c b/kernel/smp.c index 01a7c1706a58..6a1f1daa3dc4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -183,7 +183,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); -#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ +module_param(csd_lock_timeout, ulong, 0444); + static atomic_t csd_bug_count = ATOMIC_INIT(0); static u64 cfd_seq; @@ -329,6 +331,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); + unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) @@ -341,7 +344,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * ts2 = sched_clock(); ts_delta = ts2 - *ts1; - if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; firsttime = !*bug_id; -- cgit v1.2.3 From 75182a4eaaf8b697f66d68ad039f021f461dd2a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Mar 2022 11:01:37 -0800 Subject: rcu: Add comments to final rcu_gp_cleanup() "if" statement The final "if" statement in rcu_gp_cleanup() has proven to be rather confusing, straightforward though it might have seemed when initially written. This commit therefore adds comments to its "then" and "else" clauses to at least provide a more elevated form of confusion. Reported-by: Boqun Feng Reported-by: Frederic Weisbecker Reported-by: Neeraj Upadhyay Reported-by: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a5ea67454640..29669070348e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2098,14 +2098,29 @@ static noinline void rcu_gp_cleanup(void) /* Advance CBs to reduce false positives below. */ offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { + + // We get here if a grace period was needed (“needgp”) + // and the above call to rcu_accelerate_cbs() did not set + // the RCU_GP_FLAG_INIT bit in ->gp_state (which records + // the need for another grace period).  The purpose + // of the “offloaded” check is to avoid invoking + // rcu_accelerate_cbs() on an offloaded CPU because we do not + // hold the ->nocb_lock needed to safely access an offloaded + // ->cblist.  We do not want to acquire that lock because + // it can be heavily contended during callback floods. + WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); - trace_rcu_grace_period(rcu_state.name, - rcu_state.gp_seq, - TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); } else { - WRITE_ONCE(rcu_state.gp_flags, - rcu_state.gp_flags & RCU_GP_FLAG_INIT); + + // We get here either if there is no need for an + // additional grace period or if rcu_accelerate_cbs() has + // already set the RCU_GP_FLAG_INIT bit in ->gp_flags.  + // So all we need to do is to clear all of the other + // ->gp_flags bits. + + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); -- cgit v1.2.3 From 80d530b47da41642fab317a9485d58dfbe1e8896 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Mar 2022 10:10:35 -0800 Subject: rcu: Print number of online CPUs in RCU CPU stall-warning messages RCU's synchronous grace periods act quite differently when there is only one online CPU, especially in the no-op case in kernels built with CONFIG_PREEMPTION=n. This change in behavior can be important debugging information, so this commit adds the number of online CPUs to the RCU CPU stall warning messages. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0c5d8516516a..268dd79c58e7 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -565,9 +565,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); if (ndetected) { rcu_dump_cpu_stacks(); @@ -626,9 +626,9 @@ static void print_cpu_stall(unsigned long gps) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", + pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); -- cgit v1.2.3 From 70ae7b0ce03347fab35d6d8df81e1165d7ea8045 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 14 Mar 2022 14:37:38 +0100 Subject: rcu: Fix preemption mode check on synchronize_rcu[_expedited]() An early check on synchronize_rcu[_expedited]() tries to determine if the current CPU is in UP mode on an SMP no-preempt kernel, in which case there is no need to start a grace period since the current assumed quiescent state is all we need. However the preemption mode doesn't take into account the boot selected preemption mode under CONFIG_PREEMPT_DYNAMIC=y, missing a possible early return if the running flavour is "none" or "voluntary". Use the shiny new preempt mode accessors to fix this. However, avoid invoking them during early boot because doing so triggers a WARN_ON_ONCE(). [ paulmck: Update for mainlined API. ] Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Peter Zijlstra Cc: Neeraj Upadhyay Cc: Valentin Schneider Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 29669070348e..d3caa82b9954 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3741,7 +3741,9 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPTION)) + // Invoking preempt_model_*() too early gets a splat. + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE || + preempt_model_full() || preempt_model_rt()) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); -- cgit v1.2.3 From 88ca472f80604c070526eb58b977ea0a9c3c2e1f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 24 Mar 2022 19:15:15 +0800 Subject: rcu: Check for successful spawn of ->boost_kthread_task For the spawning of the priority-boost kthreads can fail, improbable though this might seem. This commit therefore refrains from attemoting to initiate RCU priority boosting when The ->boost_kthread_task pointer is NULL. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 176639c6215f..5c23aceecd62 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1125,7 +1125,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { raw_lockdep_assert_held_rcu_node(rnp); - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + if (!rnp->boost_kthread_task || + (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } -- cgit v1.2.3 From f1efe84d6fd2af163989025bc285ff9b3b0c764f Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 29 Mar 2022 15:26:13 -0700 Subject: rcu_sync: Fix comment to properly reflect rcu_sync_exit() behavior The rcu_sync_enter() function is used by updaters to force RCU readers (e.g. percpu-rwsem) to use their slow paths during an update. This is accomplished by setting the ->gp_state of the rcu_sync structure to GP_ENTER. In the case of percpu-rwsem, the readers' slow path waits on a semaphore instead of just incrementing a reader count. Each updater invokes the rcu_sync_exit() function to signal to readers that they may again take their fastpaths. The rcu_sync_exit() function sets the ->gp_state of the rcu_sync structure to GP_EXIT, and if all goes well, after a grace period the ->gp_state reverts back to GP_IDLE. Unfortunately, the rcu_sync_enter() function currently has a comment incorrectly stating that rcu_sync_exit() (by an updater) will re-enable reader "slowpaths". This patch changes the comment to state that this function re-enables reader fastpaths. Signed-off-by: David Vernet Signed-off-by: Paul E. McKenney --- kernel/rcu/sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 33d896d85902..5cefc702158f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * a slowpath during the update. After this function returns, all * subsequent calls to rcu_sync_is_idle() will return false, which * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. + * rcu_sync_exit() re-enables reader fastpaths. * * When called in isolation, rcu_sync_enter() must wait for a grace * period, however, closely spaced calls to rcu_sync_enter() can -- cgit v1.2.3 From f596e2ce1c0f250bb3ecc179f611be37e862635f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 4 Apr 2022 07:59:32 +0800 Subject: rcu: Use IRQ_WORK_INIT_HARD() to avoid rcu_read_unlock() hangs When booting kernels built with both CONFIG_RCU_STRICT_GRACE_PERIOD=y and CONFIG_PREEMPT_RT=y, the rcu_read_unlock_special() function's invocation of irq_work_queue_on() the init_irq_work() causes the rcu_preempt_deferred_qs_handler() function to work execute in SCHED_FIFO irq_work kthreads. Because rcu_read_unlock_special() is invoked on each rcu_read_unlock() in such kernels, the amount of work just keeps piling up, resulting in a boot-time hang. This commit therefore avoids this hang by using IRQ_WORK_INIT_HARD() instead of init_irq_work(), but only in kernels built with both CONFIG_PREEMPT_RT=y and CONFIG_RCU_STRICT_GRACE_PERIOD=y. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5c23aceecd62..2a3715419073 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -661,7 +661,13 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + IS_ENABLED(CONFIG_PREEMPT_RT)) + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( + rcu_preempt_deferred_qs_handler); + else + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } -- cgit v1.2.3 From 835f14ed53076384f0e1dad2fddb4881315f124f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 11:05:09 -0700 Subject: rcu: Make the TASKS_RCU Kconfig option be selected Currently, any kernel built with CONFIG_PREEMPTION=y also gets CONFIG_TASKS_RCU=y, which is not helpful to people trying to build preemptible kernels of minimal size. Because CONFIG_TASKS_RCU=y is needed only in kernels doing tracing of one form or another, this commit moves from TASKS_RCU deciding when it should be enabled to the tracing Kconfig options explicitly selecting it. This allows building preemptible kernels without TASKS_RCU, if desired. This commit also updates the SRCU-N and TREE09 rcutorture scenarios in order to avoid Kconfig errors that would otherwise result from CONFIG_TASKS_RCU being selected without its CONFIG_RCU_EXPERT dependency being met. [ paulmck: Apply BPF_SYSCALL feedback from Andrii Nakryiko. ] Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Zhouyi Zhou Cc: Andrii Nakryiko Cc: Alexei Starovoitov Cc: Steven Rostedt Cc: Mathieu Desnoyers Acked-by: Masami Hiramatsu Signed-off-by: Paul E. McKenney --- arch/Kconfig | 1 + kernel/bpf/Kconfig | 1 + kernel/rcu/Kconfig | 3 ++- kernel/trace/Kconfig | 1 + tools/testing/selftests/rcutorture/configs/rcu/SRCU-N | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/TREE09 | 2 ++ 6 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 29b0167c088b..1bf29ce754af 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -35,6 +35,7 @@ config KPROBES depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select TASKS_RCU if PREEMPTION help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d56ee177d5f8..2dfe1079f772 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -27,6 +27,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_RCU if PREEMPTION select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index f559870fbf8b..4f665ae0cf55 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -78,7 +78,8 @@ config TASKS_RCU_GENERIC task-based RCU implementations. Not for manual selection. config TASKS_RCU - def_bool PREEMPTION + def_bool 0 + select IRQ_WORK help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2c43e327a619..bf5da6c4e999 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -144,6 +144,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK + select TASKS_RCU if PREEMPTION config GENERIC_TRACER bool diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N index 2da8b49589a0..07f5e0a70ae7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N @@ -6,3 +6,5 @@ CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index 8523a7515cbf..fc45645bb5f4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -13,3 +13,5 @@ CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n -- cgit v1.2.3 From 40c1278aa7cd51d4f8627f7adc66aa73e01aff81 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 13:29:59 -0700 Subject: rcutorture: Allow rcutorture without RCU Tasks Trace Unless a kernel builds rcutorture, whether built-in or as a module, that kernel is also built with CONFIG_TASKS_TRACE_RCU, whether anything else needs Tasks Trace RCU or not. This unnecessarily increases kernel size. This commit therefore decouples the presence of rcutorture from the presence of RCU Tasks Trace. However, there is a need to select CONFIG_TASKS_TRACE_RCU for testing purposes. Except that casual users must not be bothered with questions -- for them, this needs to be fully automated. There is thus a CONFIG_FORCE_TASKS_TRACE_RCU that selects CONFIG_TASKS_TRACE_RCU, is user-selectable, but which depends on CONFIG_RCU_EXPERT. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 22 +++-- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/rcutorture.c | 101 ++++++++++++--------- .../selftests/rcutorture/configs/rcu/TRACE01 | 2 + .../selftests/rcutorture/configs/rcu/TRACE02 | 2 + 5 files changed, 75 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 4f665ae0cf55..2befd328e6a0 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -95,15 +95,23 @@ config TASKS_RUDE_RCU switches on all online CPUs, including idle ones, so use with caution. -config TASKS_TRACE_RCU - def_bool 0 - select IRQ_WORK +config FORCE_TASKS_TRACE_RCU + bool "Force selection of Tasks Trace RCU" + depends on RCU_EXPERT + select TASKS_TRACE_RCU + default n help This option enables a task-based RCU implementation that uses explicit rcu_read_lock_trace() read-side markers, and allows - these readers to appear in the idle loop as well as on the CPU - hotplug code paths. It can force IPIs on online CPUs, including - idle ones, so use with caution. + these readers to appear in the idle loop as well as on the + CPU hotplug code paths. It can force IPIs on online CPUs, + including idle ones, so use with caution. Not for manual + selection in most cases. + +config TASKS_TRACE_RCU + bool + default n + select IRQ_WORK config RCU_STALL_COMMON def_bool TREE_RCU @@ -227,7 +235,7 @@ config RCU_NOCB_CPU config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT + depends on RCU_EXPERT && TASKS_TRACE_RCU default PREEMPT_RT || NR_CPUS < 8 help Use this option to further reduce the number of IPIs sent diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4fd64999300f..d7f4bb1c4979 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -49,7 +49,6 @@ config RCU_TORTURE_TEST select SRCU select TASKS_RCU select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 55d049c39608..7dd3e14ec907 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -737,6 +737,48 @@ static struct rcu_torture_ops busted_srcud_ops = { .name = "busted_srcud" }; +/* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + /* * Definitions for RCU-tasks torture testing. */ @@ -780,48 +822,6 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -/* - * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. - */ - -static void synchronize_rcu_trivial(void) -{ - int cpu; - - for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); - WARN_ON_ONCE(raw_smp_processor_id() != cpu); - } -} - -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) -{ - preempt_disable(); - return 0; -} - -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) -{ - preempt_enable(); -} - -static struct rcu_torture_ops trivial_ops = { - .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock_trivial, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock_trivial, - .readlock_held = torture_readlock_not_held, - .get_gp_seq = rcu_no_completed, - .sync = synchronize_rcu_trivial, - .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, - .irq_capable = 1, - .name = "trivial" -}; - /* * Definitions for rude RCU-tasks torture testing. */ @@ -851,6 +851,8 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for tracing RCU-tasks torture testing. */ @@ -893,6 +895,15 @@ static struct rcu_torture_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU + + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -3096,9 +3107,9 @@ rcu_torture_init(void) int flags = 0; unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, - &tasks_tracing_ops, &trivial_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, + &tasks_ops, &tasks_rude_ops, TASKS_TRACING_OPS + &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index e4d74e5fc1d0..0f5605ed1e48 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -7,5 +7,7 @@ CONFIG_PREEMPT=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n #CHECK#CONFIG_PROVE_RCU=n +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index 77541eeb4e9f..093ea6e8e65c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -7,5 +7,7 @@ CONFIG_PREEMPT=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y -- cgit v1.2.3 From 3b6e1dd42317ec366dab3205f99280e2ab1ad85a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 15:18:27 -0700 Subject: rcutorture: Allow rcutorture without RCU Tasks Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks. Unless that kernel builds rcutorture, whether built-in or as a module, in which case RCU Tasks is (unnecessarily) used. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of rcutorture from the presence of RCU Tasks. However, there is a need to select CONFIG_TASKS_RCU for testing purposes. Except that casual users must not be bothered with questions -- for them, this needs to be fully automated. There is thus a CONFIG_FORCE_TASKS_RCU that selects CONFIG_TASKS_RCU, is user-selectable, but which depends on CONFIG_RCU_EXPERT. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 18 +++++++++++++----- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/rcutorture.c | 13 ++++++++++++- tools/testing/selftests/rcutorture/configs/rcu/TASKS01 | 1 + tools/testing/selftests/rcutorture/configs/rcu/TASKS02 | 3 +++ tools/testing/selftests/rcutorture/configs/rcu/TASKS03 | 2 ++ 6 files changed, 31 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 2befd328e6a0..8eac165db09f 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -77,13 +77,21 @@ config TASKS_RCU_GENERIC This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. +config FORCE_TASKS_RCU + bool "Force selection of TASKS_RCU" + depends on RCU_EXPERT + select TASKS_RCU + default n + help + This option force-enables a task-based RCU implementation + that uses only voluntary context switch (not preemption!), + idle, and user-mode execution as quiescent states. Not for + manual selection in most cases. + config TASKS_RCU - def_bool 0 + bool + default n select IRQ_WORK - help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. Not for manual selection. config TASKS_RUDE_RCU def_bool 0 diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index d7f4bb1c4979..c217a5e655a4 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -47,7 +47,6 @@ config RCU_TORTURE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU select TASKS_RUDE_RCU default n help diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7dd3e14ec907..65d045ff9766 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -779,6 +779,8 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +#ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks torture testing. */ @@ -822,6 +824,15 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else #ifdef CONFIG_TASKS_RCU + + /* * Definitions for rude RCU-tasks torture testing. */ @@ -3108,7 +3119,7 @@ rcu_torture_init(void) unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, - &tasks_ops, &tasks_rude_ops, TASKS_TRACING_OPS + TASKS_OPS &tasks_rude_ops, TASKS_TRACING_OPS &trivial_ops, }; diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 index 3ca112444ce7..d84801b9a7ae 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 @@ -7,4 +7,5 @@ CONFIG_PREEMPT=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y +CONFIG_TASKS_RCU=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 index ad2be91e5ee7..d333b69bc831 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 @@ -2,3 +2,6 @@ CONFIG_SMP=n CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +#CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_RCU=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 index dc02083803ce..dea26c568678 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 @@ -7,3 +7,5 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_TASKS_RCU=y +CONFIG_RCU_EXPERT=y -- cgit v1.2.3 From 4c3f7b0e1e880e892d4bc4f50bf627b251b6e2cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 16:16:45 -0700 Subject: rcutorture: Allow rcutorture without RCU Tasks Rude Unless a kernel builds rcutorture, whether built-in or as a module, that kernel is also built with CONFIG_TASKS_RUDE_RCU, whether anything else needs Tasks Rude RCU or not. This unnecessarily increases kernel size. This commit therefore decouples the presence of rcutorture from the presence of RCU Tasks Rude. However, there is a need to select CONFIG_TASKS_RUDE_RCU for testing purposes. Except that casual users must not be bothered with questions -- for them, this needs to be fully automated. There is thus a CONFIG_FORCE_TASKS_RUDE_RCU that selects CONFIG_TASKS_RUDE_RCU, is user-selectable, but which depends on CONFIG_RCU_EXPERT. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 21 ++++++++++++++------- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/rcutorture.c | 13 ++++++++++++- .../testing/selftests/rcutorture/configs/rcu/RUDE01 | 2 ++ 4 files changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 8eac165db09f..65d45c00fd1b 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -93,15 +93,22 @@ config TASKS_RCU default n select IRQ_WORK +config FORCE_TASKS_RUDE_RCU + bool "Force selection of Tasks Rude RCU" + depends on RCU_EXPERT + select TASKS_RUDE_RCU + default n + help + This option force-enables a task-based RCU implementation + that uses only context switch (including preemption) and + user-mode execution as quiescent states. It forces IPIs and + context switches on all online CPUs, including idle ones, + so use with caution. Not for manual selection in most cases. + config TASKS_RUDE_RCU - def_bool 0 + bool + default n select IRQ_WORK - help - This option enables a task-based RCU implementation that uses - only context switch (including preemption) and user-mode - execution as quiescent states. It forces IPIs and context - switches on all online CPUs, including idle ones, so use - with caution. config FORCE_TASKS_TRACE_RCU bool "Force selection of Tasks Trace RCU" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index c217a5e655a4..f4a4468cbf03 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -47,7 +47,6 @@ config RCU_TORTURE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RUDE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 65d045ff9766..d528245108c2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -833,6 +833,8 @@ static struct rcu_torture_ops tasks_ops = { #endif // #else #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_RUDE_RCU + /* * Definitions for rude RCU-tasks torture testing. */ @@ -862,6 +864,15 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU + + #ifdef CONFIG_TASKS_TRACE_RCU /* @@ -3119,7 +3130,7 @@ rcu_torture_init(void) unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, - TASKS_OPS &tasks_rude_ops, TASKS_TRACING_OPS + TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS &trivial_ops, }; diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 index 7093422050f6..6fd6acb94518 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 @@ -8,3 +8,5 @@ CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_RCU_EXPERT=y +CONFIG_FORCE_TASKS_RUDE_RCU=y +#CHECK#CONFIG_TASKS_RUDE_RCU=y -- cgit v1.2.3 From 5f654af150fd5aeb9fff138c7cbd72cea016b863 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Mar 2022 14:39:54 -0700 Subject: refscale: Allow refscale without RCU Tasks Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks. Unless that kernel builds refscale, whether built-in or as a module, in which case RCU Tasks is (unnecessarily) built in. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of refscale from the presence of RCU Tasks. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/refscale.c | 12 +++++++++++- tools/testing/selftests/rcutorture/configs/refscale/CFcommon | 2 ++ .../testing/selftests/rcutorture/configs/refscale/NOPREEMPT | 2 ++ 4 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index f4a4468cbf03..454924e03ef3 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -63,7 +63,6 @@ config RCU_REF_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU select TASKS_RUDE_RCU select TASKS_TRACE_RCU default n diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 5489ff7f478e..5079e47b3d18 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -207,6 +207,8 @@ static struct ref_scale_ops srcu_ops = { .name = "srcu" }; +#ifdef CONFIG_TASKS_RCU + // Definitions for RCU Tasks ref scale testing: Empty read markers. // These definitions also work for RCU Rude readers. static void rcu_tasks_ref_scale_read_section(const int nloops) @@ -232,6 +234,14 @@ static struct ref_scale_ops rcu_tasks_ops = { .name = "rcu-tasks" }; +#define RCU_TASKS_OPS &rcu_tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define RCU_TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + // Definitions for RCU Tasks Trace ref scale testing. static void rcu_trace_ref_scale_read_section(const int nloops) { @@ -790,7 +800,7 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon index a98b58b54bb1..14fdafc576ce 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon @@ -1,2 +1,4 @@ CONFIG_RCU_REF_SCALE_TEST=y CONFIG_PRINTK_TIME=y +CONFIG_FORCE_TASKS_RCU=y +#CHECK#CONFIG_TASKS_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT index 7f06838a91e6..ef2b501a6971 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT +++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT @@ -15,3 +15,5 @@ CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n -- cgit v1.2.3 From dec86781a54f4a527386a0b86b22e99e2ac67a09 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Mar 2022 15:21:07 -0700 Subject: refscale: Allow refscale without RCU Tasks Rude/Trace Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks Rude and RCU Tasks Trace. Unless that kernel builds refscale, whether built-in or as a module, in which case these RCU Tasks flavors are (unnecessarily) built in. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of refscale from the presence of RCU Tasks Rude and RCU Tasks Trace. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 -- kernel/rcu/refscale.c | 12 +++++++++++- tools/testing/selftests/rcutorture/configs/refscale/CFcommon | 2 ++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 454924e03ef3..dceaa3e754e5 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -63,8 +63,6 @@ config RCU_REF_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance tests diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 5079e47b3d18..909644abee67 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -242,6 +242,8 @@ static struct ref_scale_ops rcu_tasks_ops = { #endif // #else // #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for RCU Tasks Trace ref scale testing. static void rcu_trace_ref_scale_read_section(const int nloops) { @@ -271,6 +273,14 @@ static struct ref_scale_ops rcu_trace_ops = { .name = "rcu-trace" }; +#define RCU_TRACE_OPS &rcu_trace_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define RCU_TRACE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for reference count static atomic_t refcnt; @@ -800,7 +810,7 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, + &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon index 14fdafc576ce..fbea3b13baba 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon @@ -2,3 +2,5 @@ CONFIG_RCU_REF_SCALE_TEST=y CONFIG_PRINTK_TIME=y CONFIG_FORCE_TASKS_RCU=y #CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y -- cgit v1.2.3 From 4df002d908796c1ff87b985af1d31a0e36e6c66f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Mar 2022 16:39:01 -0700 Subject: rcuscale: Allow rcuscale without RCU Tasks Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks. Unless that kernel builds rcuscale, whether built-in or as a module, in which case RCU Tasks is (unnecessarily) built. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of rcuscale from the presence of RCU Tasks. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/rcuscale.c | 12 +++++++++++- tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon | 4 ++-- tools/testing/selftests/rcutorture/configs/rcuscale/TREE | 2 ++ 4 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index dceaa3e754e5..71e73fceff87 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -28,7 +28,6 @@ config RCU_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU select TASKS_RUDE_RCU select TASKS_TRACE_RCU default n diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 5e4f1f83d38e..311dbcb064ed 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -268,6 +268,8 @@ static struct rcu_scale_ops srcud_ops = { .name = "srcud" }; +#ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks scalability testing. */ @@ -295,6 +297,14 @@ static struct rcu_scale_ops tasks_ops = { .name = "tasks" }; +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks-trace scalability testing. */ @@ -797,7 +807,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS &tasks_tracing_ops }; if (!torture_init_begin(scale_type, verbose)) diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon index 90942bb5bebc..2ed3b46a9c37 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -1,5 +1,5 @@ CONFIG_RCU_SCALE_TEST=y CONFIG_PRINTK_TIME=y CONFIG_TASKS_RCU_GENERIC=y -CONFIG_TASKS_RCU=y -CONFIG_TASKS_TRACE_RCU=y +CONFIG_FORCE_TASKS_RCU=y +#CHECK#CONFIG_TASKS_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE index f110d9ffbe4c..b10706fd03a4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE @@ -16,3 +16,5 @@ CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y CONFIG_RCU_TRACE=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n -- cgit v1.2.3 From 5ce027f4cd0e2f28ea5574ede9eef290e2ede5c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Mar 2022 17:05:40 -0700 Subject: rcuscale: Allow rcuscale without RCU Tasks Rude/Trace Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks Rude and RCU Tasks Trace. Unless that kernel builds rcuscale, whether built-in or as a module, in which case these RCU Tasks flavors are (unnecessarily) built in. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of rcuscale from the presence of RCU Tasks Rude and RCU Tasks Trace. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 -- kernel/rcu/rcuscale.c | 12 +++++++++++- tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon | 3 ++- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 71e73fceff87..68092e1db64b 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -28,8 +28,6 @@ config RCU_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 311dbcb064ed..277a5bfb37d4 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -305,6 +305,8 @@ static struct rcu_scale_ops tasks_ops = { #endif // #else // #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for RCU-tasks-trace scalability testing. */ @@ -334,6 +336,14 @@ static struct rcu_scale_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -807,7 +817,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS &tasks_tracing_ops + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon index 2ed3b46a9c37..6a00157bee5b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -1,5 +1,6 @@ CONFIG_RCU_SCALE_TEST=y CONFIG_PRINTK_TIME=y -CONFIG_TASKS_RCU_GENERIC=y CONFIG_FORCE_TASKS_RCU=y #CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y -- cgit v1.2.3 From a57ffb3c6b67e59e8632f731414b792eacc6cca0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Jan 2022 11:21:30 -0800 Subject: srcu: Automatically determine size-transition strategy at boot This commit adds a srcutree.convert_to_big option of zero that causes SRCU to decide at boot whether to wait for contention (small systems) or immediately expand to large (large systems). A new srcutree.big_cpu_lim (defaulting to 128) defines how many CPUs constitute a large system. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 10 ++++++++++ kernel/rcu/srcutree.c | 23 ++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 177e688768c0..0a094bb2d722 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5608,6 +5608,15 @@ off: Disable mitigation and remove performance impact to RDRAND and RDSEED + srcutree.big_cpu_lim [KNL] + Specifies the number of CPUs constituting a + large system, such that srcu_struct structures + should immediately allocate an srcu_node array. + This kernel-boot parameter defaults to 128, + but takes effect only when the low-order four + bits of srcutree.convert_to_big is equal to 3 + (decide at boot). + srcutree.convert_to_big [KNL] Specifies under what conditions an SRCU tree srcu_struct structure will be converted to big @@ -5616,6 +5625,7 @@ 0: Never. 1: At init_srcu_struct() time. 2: When rcutorture decides to. + 3: Decide at boot time (default). 0x1X: Above plus if high contention. Either way, the srcu_node tree will be sized based diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0bc6a0a3edee..b9dec26245e0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -41,10 +41,10 @@ module_param(counter_wrap_check, ulong, 0444); /* * Control conversion to SRCU_SIZE_BIG: - * 0: Don't convert at all (default). + * 0: Don't convert at all. * 1: Convert at init_srcu_struct() time. * 2: Convert when rcutorture invokes srcu_torture_stats_print(). - * 3: Decide at boot time based on system shape. + * 3: Decide at boot time based on system shape (default). * 0x1x: Convert when excessive contention encountered. */ #define SRCU_SIZING_NONE 0 @@ -57,9 +57,13 @@ module_param(counter_wrap_check, ulong, 0444); #define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) #define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) #define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) -static int convert_to_big = SRCU_SIZING_NONE; +static int convert_to_big = SRCU_SIZING_AUTO; module_param(convert_to_big, int, 0444); +/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */ +static int big_cpu_lim __read_mostly = 128; +module_param(big_cpu_lim, int, 0444); + /* Contention events per jiffy to initiate transition to big. */ static int small_contention_lim __read_mostly = 100; module_param(small_contention_lim, int, 0444); @@ -1619,6 +1623,17 @@ void __init srcu_init(void) { struct srcu_struct *ssp; + /* Decide on srcu_struct-size strategy. */ + if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { + if (nr_cpu_ids >= big_cpu_lim) { + convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. + pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); + } else { + convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; + pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); + } + } + /* * Once that is set, call_srcu() can follow the normal path and * queue delayed work. This must follow RCU workqueues creation @@ -1629,6 +1644,8 @@ void __init srcu_init(void) ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); list_del_init(&ssp->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) + ssp->srcu_size_state = SRCU_SIZE_ALLOC; queue_work(rcu_gp_wq, &ssp->work.work); } } -- cgit v1.2.3 From c2445d38785086422e56dcbe049b73a53b2ba81f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Jan 2022 13:27:15 -0800 Subject: srcu: Add contention check to call_srcu() srcu_data ->lock acquisition This commit increases the sensitivity of contention detection by adding checks to the acquisition of the srcu_data structure's lock on the call_srcu() code path. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b9dec26245e0..862008c147b0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -330,18 +330,13 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) } /* - * Acquire the specified srcu_struct structure's ->lock, but check for - * excessive contention, which results in initiation of a transition - * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module - * parameter permits this. + * Check to see if the just-encountered contention event justifies + * a transition to SRCU_SIZE_BIG. */ -static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; - if (spin_trylock_irqsave_rcu_node(ssp, *flags)) - return; - spin_lock_irqsave_rcu_node(ssp, *flags); if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) return; j = jiffies; @@ -354,6 +349,38 @@ static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned l __srcu_transition_to_big(ssp); } +/* + * Acquire the specified srcu_data structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +{ + struct srcu_struct *ssp = sdp->ssp; + + if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); + spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(sdp, *flags); +} + +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); +} + /* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be @@ -1010,7 +1037,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, sdp = per_cpu_ptr(ssp->sda, 0); else sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, -- cgit v1.2.3 From 282d8998e9979c2186af7f7d22366f2fc3149838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Mar 2022 15:45:33 -0800 Subject: srcu: Prevent expedited GPs and blocking readers from consuming CPU If an SRCU reader blocks while a synchronize_srcu_expedited() waits for that same reader, then that grace period will spawn an endless series of workqueue handlers, consuming a full CPU. This quickly gets pointless because consuming more CPU isn't going to make that reader get done faster, especially if it is blocked waiting for an external event. This commit therefore spawns at most one pair of back-to-back workqueue handlers per expedited grace period phase, instead inserting increasing delays as that grace period phase grows older, but capped at 10 jiffies. In any case, if there have been at least 100 back-to-back workqueue handlers within a single jiffy, regardless of grace period or grace-period phase, then a one-jiffy delay is inserted. [ paulmck: Apply feedback from kernel test robot. ] Cc: Neeraj Upadhyay Reported-by: Song Liu Tested-by: kernel test robot Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 4 ++++ kernel/rcu/srcutree.c | 44 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 1b9ff4ed37e4..e3014319d1ad 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -71,9 +71,11 @@ struct srcu_struct { unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_gp_start; /* Last GP start timestamp (jiffies) */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ + unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ @@ -83,6 +85,8 @@ struct srcu_struct { atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ /* callback for the barrier */ /* operation. */ + unsigned long reschedule_jiffies; + unsigned long reschedule_count; struct delayed_work work; struct lockdep_map dep_map; }; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 862008c147b0..6dd44e759f12 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -511,7 +511,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. +#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. /* * Return grace-period delay, zero if there are expedited grace @@ -519,9 +522,18 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { + unsigned long jbase = SRCU_INTERVAL; + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) - return 0; - return SRCU_INTERVAL; + jbase = 0; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) + jbase += jiffies - READ_ONCE(ssp->srcu_gp_start); + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) + jbase = 1; + } + return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } /** @@ -623,6 +635,8 @@ static void srcu_gp_start(struct srcu_struct *ssp) (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ + WRITE_ONCE(ssp->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); state = rcu_seq_state(ssp->srcu_gp_seq); @@ -706,7 +720,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(ssp); + cbdelay = !!srcu_get_delay(ssp); WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -893,7 +907,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) queue_delayed_work(rcu_gp_wq, &ssp->work, - srcu_get_delay(ssp)); + !!srcu_get_delay(ssp)); else if (list_empty(&ssp->work.work.entry)) list_add(&ssp->work.work.entry, &srcu_boot_list); } @@ -1448,6 +1462,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) srcu_flip(ssp); spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_n_exp_nodelay = 0; spin_unlock_irq_rcu_node(ssp); } @@ -1462,6 +1477,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } + ssp->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1552,12 +1568,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { + unsigned long curdelay; + unsigned long j; struct srcu_struct *ssp; ssp = container_of(work, struct srcu_struct, work.work); srcu_advance_state(ssp); - srcu_reschedule(ssp, srcu_get_delay(ssp)); + curdelay = srcu_get_delay(ssp); + if (curdelay) { + WRITE_ONCE(ssp->reschedule_count, 0); + } else { + j = jiffies; + if (READ_ONCE(ssp->reschedule_jiffies) == j) { + WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); + if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY) + curdelay = 1; + } else { + WRITE_ONCE(ssp->reschedule_count, 1); + WRITE_ONCE(ssp->reschedule_jiffies, j); + } + } + srcu_reschedule(ssp, curdelay); } void srcutorture_get_gp_data(enum rcutorture_type test_type, -- cgit v1.2.3 From 586e31d59c436cda65a2e8ac04ff954bed247023 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 15 Mar 2022 09:55:49 +0100 Subject: srcu: Drop needless initialization of sdp in srcu_gp_start() Commit 9c7ef4c30f12 ("srcu: Make Tree SRCU able to operate without snp_node array") initializes the local variable sdp differently depending on the srcu's state in srcu_gp_start(). Either way, this initialization overwrites the value used when sdp is defined. This commit therefore drops this pointless definition-time initialization. Although there is no functional change, compiler code generation may be affected. Signed-off-by: Lukas Bulwahn Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6dd44e759f12..50ba70f019de 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -620,7 +620,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(ssp->sda); + struct srcu_data *sdp; int state; if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) -- cgit v1.2.3 From 28b3ae426598e722cf5d5ab9cc7038791b955a56 Mon Sep 17 00:00:00 2001 From: Uladzislau Rezki Date: Wed, 16 Feb 2022 14:52:09 +0100 Subject: rcu: Introduce CONFIG_RCU_EXP_CPU_STALL_TIMEOUT Currently both expedited and regular grace period stall warnings use a single timeout value that with units of seconds. However, recent Android use cases problem require a sub-100-millisecond expedited RCU CPU stall warning. Given that expedited RCU grace periods normally complete in far less than a single millisecond, especially for small systems, this is not unreasonable. Therefore introduce the CONFIG_RCU_EXP_CPU_STALL_TIMEOUT kernel configuration that defaults to 20 msec on Android and remains the same as that of the non-expedited stall warnings otherwise. It also can be changed in run-time via: /sys/.../parameters/rcu_exp_cpu_stall_timeout. [ paulmck: Default of zero to use CONFIG_RCU_STALL_TIMEOUT. ] Signed-off-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.rst | 20 ++++++++++++++++++ Documentation/admin-guide/kernel-parameters.txt | 12 +++++++++++ kernel/rcu/Kconfig.debug | 14 +++++++++++++ kernel/rcu/rcu.h | 2 ++ kernel/rcu/tree_exp.h | 4 ++-- kernel/rcu/tree_stall.h | 28 +++++++++++++++++++++++++ kernel/rcu/update.c | 2 ++ 7 files changed, 80 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index 78404625bad2..794837eb519b 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -162,6 +162,26 @@ CONFIG_RCU_CPU_STALL_TIMEOUT Stall-warning messages may be enabled and disabled completely via /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. +CONFIG_RCU_EXP_CPU_STALL_TIMEOUT +-------------------------------- + + Same as the CONFIG_RCU_CPU_STALL_TIMEOUT parameter but only for + the expedited grace period. This parameter defines the period + of time that RCU will wait from the beginning of an expedited + grace period until it issues an RCU CPU stall warning. This time + period is normally 20 milliseconds on Android devices. A zero + value causes the CONFIG_RCU_CPU_STALL_TIMEOUT value to be used, + after conversion to milliseconds. + + This configuration parameter may be changed at runtime via the + /sys/module/rcupdate/parameters/rcu_exp_cpu_stall_timeout, however + this parameter is checked only at the beginning of a cycle. If you + are in a current stall cycle, setting it to a new value will change + the timeout for the -next- stall. + + Stall-warning messages may be enabled and disabled completely via + /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. + RCU_STALL_DELAY_DELTA --------------------- diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..5e21a3fb57c4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4893,6 +4893,18 @@ rcupdate.rcu_cpu_stall_timeout= [KNL] Set timeout for RCU CPU stall warning messages. + The value is in seconds and the maximum allowed + value is 300 seconds. + + rcupdate.rcu_exp_cpu_stall_timeout= [KNL] + Set timeout for expedited RCU CPU stall warning + messages. The value is in milliseconds + and the maximum allowed value is 21000 + milliseconds. Please note that this value is + adjusted to an arch timer tick resolution. + Setting this to zero causes the value from + rcupdate.rcu_cpu_stall_timeout to be used (after + conversion from seconds to milliseconds). rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4fd64999300f..0b397b5bf846 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -91,6 +91,20 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. +config RCU_EXP_CPU_STALL_TIMEOUT + int "Expedited RCU CPU stall timeout in milliseconds" + depends on RCU_STALL_COMMON + range 0 21000 + default 20 if ANDROID + default 0 if !ANDROID + help + If a given expedited RCU grace period extends more than the + specified number of milliseconds, a CPU stall warning is printed. + If the RCU grace period persists, additional CPU stall warnings + are printed at more widely spaced intervals. A value of zero + says to use the RCU_CPU_STALL_TIMEOUT value converted from + seconds to milliseconds. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..20f0300f6cb1 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -210,7 +210,9 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; +extern int rcu_exp_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +int rcu_exp_jiffies_till_stall_check(void); static inline bool rcu_stall_is_suppressed(void) { diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 60197ea24ceb..b1f52b59fa4b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -496,7 +496,7 @@ static void synchronize_rcu_expedited_wait(void) struct rcu_node *rnp_root = rcu_get_root(); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); - jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_stall = rcu_exp_jiffies_till_stall_check(); jiffies_start = jiffies; if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) @@ -571,7 +571,7 @@ static void synchronize_rcu_expedited_wait(void) dump_cpu_task(cpu); } } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; } } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0c5d8516516a..009d3f9305cf 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -25,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly; #define RCU_STALL_MIGHT_DIV 8 #define RCU_STALL_MIGHT_MIN (2 * HZ) +int rcu_exp_jiffies_till_stall_check(void) +{ + int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout); + int exp_stall_delay_delta = 0; + int till_stall_check; + + // Zero says to use rcu_cpu_stall_timeout, but in milliseconds. + if (!cpu_stall_timeout) + cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check()); + + // Limit check must be consistent with the Kconfig limits for + // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. + // The minimum clamped value is "2UL", because at least one full + // tick has to be guaranteed. + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); + + if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) + WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); + +#ifdef CONFIG_PROVE_RCU + /* Add extra ~25% out of till_stall_check. */ + exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1; +#endif + + return till_stall_check + exp_stall_delay_delta; +} +EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check); + /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 180ff9c41fa8..fc7fef575606 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -506,6 +506,8 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); +int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; +module_param(rcu_exp_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall -- cgit v1.2.3 From 9621fbee44df940e2e1b94b0676460a538dffefa Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Fri, 8 Apr 2022 17:35:27 -0700 Subject: rcu: Move expedited grace period (GP) work to RT kthread_worker Enabling CONFIG_RCU_BOOST did not reduce RCU expedited grace-period latency because its workqueues run at SCHED_OTHER, and thus can be delayed by normal processes. This commit avoids these delays by moving the expedited GP work items to a real-time-priority kthread_worker. This option is controlled by CONFIG_RCU_EXP_KTHREAD and disabled by default on PREEMPT_RT=y kernels which disable expedited grace periods after boot by unconditionally setting rcupdate.rcu_normal_after_boot=1. The results were evaluated on arm64 Android devices (6GB ram) running 5.10 kernel, and capturing trace data in critical user-level code. The table below shows the resulting order-of-magnitude improvements in synchronize_rcu_expedited() latency: ------------------------------------------------------------------------ | | workqueues | kthread_worker | Diff | ------------------------------------------------------------------------ | Count | 725 | 688 | | ------------------------------------------------------------------------ | Min Duration (ns) | 326 | 447 | 37.12% | ------------------------------------------------------------------------ | Q1 (ns) | 39,428 | 38,971 | -1.16% | ------------------------------------------------------------------------ | Q2 - Median (ns) | 98,225 | 69,743 | -29.00% | ------------------------------------------------------------------------ | Q3 (ns) | 342,122 | 126,638 | -62.98% | ------------------------------------------------------------------------ | Max Duration (ns) | 372,766,967 | 2,329,671 | -99.38% | ------------------------------------------------------------------------ | Avg Duration (ns) | 2,746,353 | 151,242 | -94.49% | ------------------------------------------------------------------------ | Standard Deviation (ns) | 19,327,765 | 294,408 | | ------------------------------------------------------------------------ The below table show the range of maximums/minimums for synchronize_rcu_expedited() latency from all experiments: ------------------------------------------------------------------------ | | workqueues | kthread_worker | Diff | ------------------------------------------------------------------------ | Total No. of Experiments | 25 | 23 | | ------------------------------------------------------------------------ | Largest Maximum (ns) | 372,766,967 | 2,329,671 | -99.38% | ------------------------------------------------------------------------ | Smallest Maximum (ns) | 38,819 | 86,954 | 124.00% | ------------------------------------------------------------------------ | Range of Maximums (ns) | 372,728,148 | 2,242,717 | | ------------------------------------------------------------------------ | Largest Minimum (ns) | 88,623 | 27,588 | -68.87% | ------------------------------------------------------------------------ | Smallest Minimum (ns) | 326 | 447 | 37.12% | ------------------------------------------------------------------------ | Range of Minimums (ns) | 88,297 | 27,141 | | ------------------------------------------------------------------------ Cc: "Paul E. McKenney" Cc: Tejun Heo Reported-by: Tim Murray Reported-by: Wei Wang Tested-by: Kyle Lin Tested-by: Chunwei Lu Tested-by: Lulu Wang Signed-off-by: Kalesh Singh Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 14 +++++ kernel/rcu/rcu.h | 5 ++ kernel/rcu/tree.c | 51 ++++++++++++++++-- kernel/rcu/tree.h | 5 ++ kernel/rcu/tree_exp.h | 147 +++++++++++++++++++++++++++++++++++++++----------- 5 files changed, 188 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..fd64a75823cb 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -195,6 +195,20 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_EXP_KTHREAD + bool "Perform RCU expedited work in a real-time kthread" + depends on RCU_BOOST && RCU_EXPERT + default !PREEMPT_RT && NR_CPUS <= 32 + help + Use this option to further reduce the latencies of expedited + grace periods at the expense of being more disruptive. + + This option is disabled by default on PREEMPT_RT=y kernels which + disable expedited grace periods after boot by unconditionally + setting rcupdate.rcu_normal_after_boot=1. + + Accept the default if unsure. + config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 20f0300f6cb1..e27bf7d1e3a4 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -536,7 +536,12 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +#ifdef CONFIG_RCU_EXP_KTHREAD +extern struct kthread_worker *rcu_exp_gp_kworker; +extern struct kthread_worker *rcu_exp_par_gp_kworker; +#else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; +#endif /* CONFIG_RCU_EXP_KTHREAD */ #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..763e45fdf49b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4471,6 +4471,51 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + kthread_destroy_worker(rcu_exp_gp_kworker); + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + ¶m); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Spawn the kthreads that handle RCU's grace periods. */ @@ -4500,6 +4545,8 @@ static int __init rcu_spawn_gp_kthread(void) rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); rcu_spawn_core_kthreads(); + /* Create kthread worker for expedited GPs */ + rcu_start_exp_gp_kworkers(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4745,7 +4792,6 @@ static void __init rcu_dump_rcu_node_tree(void) } struct workqueue_struct *rcu_gp_wq; -struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { @@ -4811,8 +4857,7 @@ void __init rcu_init(void) /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..b577cdfdc851 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -23,7 +24,11 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { unsigned long rew_s; +#ifdef CONFIG_RCU_EXP_KTHREAD + struct kthread_work rew_work; +#else struct work_struct rew_work; +#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index b1f52b59fa4b..0f70f62039a9 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -334,15 +334,13 @@ fastpath: * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) { int cpu; unsigned long flags; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -417,13 +415,119 @@ retry_ipi: rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } +static void rcu_exp_sel_wait_wake(unsigned long s); + +#ifdef CONFIG_RCU_EXP_KTHREAD +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_par_gp_kworker); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* + * Use rcu_exp_par_gp_kworker, because flushing a work item from + * another work item on the same kthread worker can result in + * deadlock. + */ + kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + kthread_flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp); + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_par_gp_wq); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); + + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi - rnp->grplo)) + cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ + destroy_work_on_stack(&rew->rew_work); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ static void sync_rcu_exp_select_cpus(void) { - int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); @@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!READ_ONCE(rcu_par_gp_wq) || + if (!rcu_gp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { - /* No workqueues yet or last leaf, do direct call. */ + /* No worker started yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + sync_rcu_exp_select_cpus_queue_work(rnp); rnp->exp_need_flush = true; } - /* Wait for workqueue jobs (if any) to complete. */ + /* Wait for jobs (if any) to complete. */ rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) - flush_work(&rnp->rew.rew_work); + sync_rcu_exp_select_cpus_flush_work(rnp); } /* @@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - #ifdef CONFIG_PREEMPT_RCU /* @@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void) } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); + synchronize_rcu_expedited_queue_work(&rew); } /* Wait for expedited grace period to complete. */ rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ + smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); if (likely(!boottime)) - destroy_work_on_stack(&rew.rew_work); + synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- cgit v1.2.3