From 5257514d8885b2ebc11bf359ea1527282b47a5fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Apr 2018 11:03:39 -0700 Subject: rcu: Make expedited grace period use direct call on last leaf During expedited grace-period initialization, a work item is scheduled for each leaf rcu_node structure. However, that initialization code is itself (normally) executing from a workqueue, so one of the leaf rcu_node structures could just as well be handled by that pre-existing workqueue, and with less overhead. This commit therefore uses a shiny new rcu_is_leaf_node() macro to execute the last leaf rcu_node structure's initialization directly from the pre-existing workqueue. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 3 +++ kernel/rcu/tree_exp.h | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 40cea6735c2d..db0870acfdff 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -276,6 +276,9 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) /* Is this rcu_node a leaf? */ #define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) +/* Is this rcu_node the last leaf? */ +#define rcu_is_last_leaf_node(rsp, rnp) ((rnp) == &(rsp)->node[rcu_num_nodes - 1]) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d40708e8c5d6..c6385ee1af65 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -486,8 +486,9 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, rnp->rew.rew_func = func; rnp->rew.rew_rsp = rsp; if (!READ_ONCE(rcu_par_gp_wq) || - rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { - /* No workqueues yet. */ + rcu_scheduler_active != RCU_SCHEDULER_RUNNING || + rcu_is_last_leaf_node(rsp, rnp)) { + /* No workqueues yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } -- cgit v1.2.3 From 5ef98a6328a1506f544a64b28d6a8a7b99af475b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Apr 2018 21:30:13 -0700 Subject: srcu: Fix typos in __call_srcu() header comment This commit simply changes some copy-pasta call_rcu() instances to the correct call_srcu(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b4123d7a2cec..615e414b0c1e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -823,17 +823,17 @@ static void srcu_leak_callback(struct rcu_head *rhp) * more than one CPU, this means that when "func()" is invoked, each CPU * is guaranteed to have executed a full memory barrier since the end of * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing + * preceded the call to call_srcu(). It also means that each CPU executing * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() + * "func()" must have executed a memory barrier after the call_srcu() * but before the beginning of that SRCU read-side critical section. * Note that these guarantees include CPUs that are offline, idle, or * executing in user mode, as well as CPUs that are executing in the kernel. 
* - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the * resulting SRCU callback function "func()", then both CPU A and CPU * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". + * interval between the call to call_srcu() and the invocation of "func()". * This guarantee applies even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). * -- cgit v1.2.3 From 17294ce6a41d3fee6a9bfc52387c107a4607c1c9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Apr 2018 12:03:36 -0700 Subject: srcu: Document that srcu_funnel_gp_start() implies srcu_funnel_exp_start() This commit updates the header comment of srcu_funnel_gp_start() to document the fact that srcu_funnel_gp_start() does the work of srcu_funnel_exp_start(), in some cases by invoking it directly. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 615e414b0c1e..37bef2fe80e6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -641,6 +641,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, * period s. Losers must either ensure that their desired grace-period * number is recorded on at least their leaf srcu_node structure, or they * must take steps to invoke their own callbacks. + * + * Note that this function also does the work of srcu_funnel_exp_start(), + * in some cases by directly invoking it. */ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, unsigned long s, bool do_norm) -- cgit v1.2.3 From 5ab07a8df4d6c958ca63640d3f2ef896f0679c05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 12:28:04 -0700 Subject: srcu: Add address of first callback to rcutorture output This commit adds the address of the first callback to the per-CPU rcutorture output in order to allow lost wakeups to be more efficiently tracked down. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 37bef2fe80e6..5a1a9a07b407 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1271,11 +1271,11 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_data *counts; + struct srcu_data *sdp; - counts = per_cpu_ptr(sp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; + sdp = per_cpu_ptr(sp->sda, cpu); + u0 = sdp->srcu_unlock_count[!idx]; + u1 = sdp->srcu_unlock_count[idx]; /* * Make sure that a lock is always counted if the corresponding @@ -1283,12 +1283,13 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) */ smp_rmb(); - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; + l0 = sdp->srcu_lock_count[!idx]; + l1 = sdp->srcu_lock_count[idx]; c0 = l0 - u0; c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + pr_cont(" %d(%ld,%ld %1p)", + cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); s0 += c0; s1 += c1; } -- cgit v1.2.3 From 90127d605f403d814f4986436871210bf8ceb335 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 9 May 2018 10:29:18 -0700 Subject: torture: Make online/offline messages appear only for verbose=2 Some bugs reproduce quickly only at high CPU-hotplug rates, so the rcutorture TREE03 scenario now has only 200 milliseconds spacing between CPU-hotplug operations. At this rate, the torture-test pair of console messages per operation becomes a bit voluminous. This commit therefore converts the torture-test set of "verbose" kernel-boot arguments from bool to int, and prints the extra console messages only when verbose=2. The default is still verbose=1. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 2 +- kernel/rcu/rcuperf.c | 2 +- kernel/rcu/rcutorture.c | 2 +- kernel/torture.c | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 66272862070b..a55e80817dae 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -79,7 +79,7 @@ void stutter_wait(const char *title); int torture_stutter_init(int s); /* Initialization and cleanup. */ -bool torture_init_begin(char *ttype, bool v); +bool torture_init_begin(char *ttype, int v); void torture_init_end(void); bool torture_cleanup_begin(void); void torture_cleanup_end(void); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8402b3349dca..4a2e13870a9b 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -57,7 +57,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(bool, verbose, true, +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "spin_lock"; diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index e232846516b3..fb8094848906 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,7 +88,7 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); -torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 42fcb7f05fac..a5540bd831c4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -101,7 +101,7 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); -torture_param(bool, verbose, true, +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; diff --git a/kernel/torture.c b/kernel/torture.c index 3de1efbecd6a..840fd33c1cda 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -53,7 +53,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); static char *torture_type; -static bool verbose; +static int verbose; /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ #define FULLSTOP_DONTSTOP 0 /* Normal operation. 
*/ @@ -98,7 +98,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlining %d\n", torture_type, cpu); @@ -111,7 +111,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, "torture_onoff task: offline %d failed: errno %d\n", torture_type, cpu, ret); } else { - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlined %d\n", torture_type, cpu); @@ -147,7 +147,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: onlining %d\n", torture_type, cpu); @@ -160,7 +160,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, "torture_onoff task: online %d failed: errno %d\n", torture_type, cpu, ret); } else { - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: onlined %d\n", torture_type, cpu); @@ -647,7 +647,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool torture_init_begin(char *ttype, bool v) +bool torture_init_begin(char *ttype, int v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { -- cgit v1.2.3 From 60500037637397dcc8ea3d3c2f16e05ea6695a86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 12:25:05 -0700 Subject: torture: Keep old-school dmesg format This commit adds "#define pr_fmt(fmt) fmt" to the torture-test files in order to keep the current dmesg format. Once Joe's commits have hit mainline, these definitions will be changed in order to automatically generate the dmesg line prefix that the scripts expect. This will have the beneficial side-effect of allowing printk() formats to be used more widely and of shortening some pr_*() lines. Signed-off-by: Paul E. McKenney Cc: Joe Perches --- kernel/locking/locktorture.c | 3 +++ kernel/rcu/rcuperf.c | 3 +++ kernel/rcu/rcutorture.c | 3 +++ kernel/torture.c | 3 +++ 4 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 4a2e13870a9b..57bef4fbfb31 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -21,6 +21,9 @@ * Davidlohr Bueso * Based on kernel/rcu/torture.c. */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index fb8094848906..df29119b2013 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -19,6 +19,9 @@ * * Authors: Paul E. McKenney */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a5540bd831c4..5604bfac8df4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -22,6 +22,9 @@ * * See also: Documentation/RCU/torture.txt */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/torture.c b/kernel/torture.c index 840fd33c1cda..1ac24a826589 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -20,6 +20,9 @@ * Author: Paul E. McKenney * Based on kernel/rcu/torture.c. 
*/ + +#define pr_fmt(fmt) fmt + #include #include #include -- cgit v1.2.3 From 4bc8d55574dd316e43975651b9259c5c18d741fc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 15:13:56 -0800 Subject: rcu: Add debugging info to assertion The WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp()) in rcu_gp_cleanup() triggers (inexplicably, of course) every so often. This commit therefore extracts more information. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aa7cade1b9f3..79c7fe978b17 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2084,7 +2084,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) + dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); rdp = this_cpu_ptr(rsp->rda); @@ -2294,6 +2295,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + rnp->completedqs = rnp->gpnum; mask = rnp->grpmask; if (rnp->parent == NULL) { @@ -3930,6 +3932,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) &rcu_fqs_class[i], fqs[i]); rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; + rnp->completedqs = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 78e051dffc5b..7365ac53fdd9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -87,6 +87,7 @@ struct rcu_node { unsigned long completed; /* Last GP completed for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ /* In leaf rcu_node, each bit corresponds to */ @@ -453,6 +454,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7fd12039e512..17b67ecf7dff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -260,8 +260,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * ->exp_tasks pointers, respectively, to reference the newly * blocked tasks. 
*/ - if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { rnp->gp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(rnp->completedqs == rnp->gpnum); + } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != @@ -535,6 +537,8 @@ void rcu_read_unlock_special(struct task_struct *t) WARN_ON_ONCE(rnp != t->rcu_blocked_node); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); + WARN_ON_ONCE(rnp->completedqs == rnp->gpnum && + (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -697,7 +701,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) + dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; t = container_of(rnp->gp_tasks, struct task_struct, @@ -841,6 +846,27 @@ void exit_rcu(void) __rcu_read_unlock(); } +/* + * Dump the blocked-tasks state, but limit the list dump to the + * specified number of elements. + */ +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +{ + int i; + struct list_head *lhp; + + lockdep_assert_held(&rnp->lock); + pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); + pr_cont("\t->blkd_tasks"); + i = 0; + list_for_each(lhp, &rnp->blkd_tasks) { + pr_cont(" %p", lhp); + if (++i >= 10) + break; + } + pr_cont("\n"); +} + #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; @@ -949,6 +975,14 @@ void exit_rcu(void) { } +/* + * Dump the guaranteed-empty blocked-tasks state. Trust but verify. + */ +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +{ + WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); +} + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST -- cgit v1.2.3 From ce11fae8d43fe9a36823fbbfe7c44de775b7e346 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 9 Mar 2018 09:32:18 +0800 Subject: rcu: Use the proper lockdep annotation in dump_blkd_tasks() Sparse reported this: | kernel/rcu/tree_plugin.h:814:9: warning: incorrect type in argument 1 (different modifiers) | kernel/rcu/tree_plugin.h:814:9: expected struct lockdep_map const *lock | kernel/rcu/tree_plugin.h:814:9: got struct lockdep_map [noderef] * This is caused by using vanilla lockdep annotations on rcu_node::lock, and that requires accessing ->lock of rcu_node directly. However we need to keep rcu_node::lock __private to avoid breaking its extra ordering guarantee. And we have a dedicated lockdep annotation for rcu_node::lock, so use it. Signed-off-by: Boqun Feng Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 17b67ecf7dff..e387ea712758 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -855,7 +855,7 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) int i; struct list_head *lhp; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); pr_cont("\t->blkd_tasks"); i = 0; -- cgit v1.2.3 From 8c42b1f39fdf9fde7cfc4024397255f31a860db6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Apr 2018 11:04:46 -0700 Subject: rcu: Exclude near-simultaneous RCU CPU stall warnings There is a two-jiffy delay between the time that a CPU will self-report an RCU CPU stall warning and the time that some other CPU will report a warning on behalf of the first CPU. This has worked well in the past, but on busy systems, it is possible for the two warnings to overlap, which makes interpreting them extremely difficult. This commit therefore uses a cmpxchg-based timing decision that allows only one report in a given one-minute period (assuming default stall-warning Kconfig parameters). This approach will of course fail if you are seeing minute-long vCPU preemption, but in that case the overlapping RCU CPU stall warnings are the least of your worries. Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79c7fe978b17..b1fffa21b9e4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1368,7 +1368,6 @@ static inline void panic_on_rcu_stall(void) static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; - long delta; unsigned long flags; unsigned long gpa; unsigned long j; @@ -1381,18 +1380,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) if (rcu_cpu_stall_suppress) return; - /* Only let one CPU complain about others per time interval. */ - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - delta = jiffies - READ_ONCE(rsp->jiffies_stall); - if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - WRITE_ONCE(rsp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1441,6 +1428,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) sched_show_task(current); } } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) + WRITE_ONCE(rsp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); rcu_check_gp_kthread_starvation(rsp); @@ -1485,6 +1476,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. 
*/ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -1508,6 +1500,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long gpnum; unsigned long gps; unsigned long j; + unsigned long jn; unsigned long js; struct rcu_node *rnp; @@ -1546,14 +1539,17 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; if (rcu_gp_in_progress(rsp) && - (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { + (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); } else if (rcu_gp_in_progress(rsp) && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && + cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(rsp, gpnum); -- cgit v1.2.3 From fcc63543650150629c8a873cbef3578770acecd9 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 15 Jun 2018 12:06:31 -0700 Subject: rcu: Make expedited GPs handle CPU 0 being offline Currently, the parallelized initialization of expedited grace periods uses the workqueue associated with each rcu_node structure's ->grplo field. This works fine unless that CPU is offline. This commit therefore uses the CPU corresponding to the lowest-numbered online CPU, or just queues the work on WORK_CPU_UNBOUND if there are no online CPUs corresponding to this rcu_node structure. Note that this patch uses cpu_is_offline() instead of the usual approach of checking bits in the rcu_node structure's ->qsmaskinitnext field. This is safe because preemption is disabled across both the cpu_is_offline() check and the call to queue_work_on(). Signed-off-by: Boqun Feng [ paulmck: Disable preemption to close offline race window. ] Signed-off-by: Paul E. McKenney [ paulmck: Apply Peter Zijlstra feedback on CPU selection. ] Tested-by: Aneesh Kumar K.V --- kernel/rcu/tree_exp.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index c6385ee1af65..b3df3b770afb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -472,6 +472,7 @@ retry_ipi: static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, smp_call_func_t func) { + int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); @@ -493,7 +494,13 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, continue; } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + preempt_disable(); + cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi)) + cpu = WORK_CPU_UNBOUND; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + preempt_enable(); rnp->exp_need_flush = true; } -- cgit v1.2.3 From 26d950a9451336a6b5abc1c8ca6c21df58e8d89f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Apr 2018 20:44:11 -0700 Subject: rcu: Diagnostics for grace-period startup hangs This commit causes a splat if RCU is idle and a request for a new grace period is ignored for more than one second. 
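[ Editor's note: the stall-warning commit above and the diagnostic in this commit both need "many detectors, at most one report" behavior -- above via cmpxchg() on the ->jiffies_stall deadline, below via an atomic_xchg()'d warned flag. A minimal stand-alone sketch of the deadline variant follows, using C11 atomics in place of the kernel's cmpxchg() and entirely invented names; the real code additionally uses wrap-safe ULONG_CMP_GE() comparisons on jiffies. ]

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Plays the role of rsp->jiffies_stall: next time a report is allowed. */
static atomic_ulong report_deadline;

/* Return true for exactly one caller per interval, as in check_cpu_stall(). */
static bool claim_report_slot(unsigned long now, unsigned long interval)
{
	unsigned long old = atomic_load(&report_deadline);

	if (now < old)
		return false;	/* Interval still open; someone else reported. */
	/* The winner advances the deadline; all losers fail the CAS and bail. */
	return atomic_compare_exchange_strong(&report_deadline, &old,
					      now + interval);
}

int main(void)
{
	atomic_store(&report_deadline, 100);
	printf("caller A: %d\n", claim_report_slot(150, 60));	/* 1: reports. */
	printf("caller B: %d\n", claim_report_slot(150, 60));	/* 0: suppressed. */
	return 0;
}
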
This splat normally indicates that some code path asked for a new grace period, but failed to wake up the RCU grace-period kthread. Signed-off-by: Paul E. McKenney [ paulmck: Fix bug located by Dan Carpenter and his static checker. ] [ paulmck: Fix self-deadlock bug located 0day test robot. ] [ paulmck: Disable unless CONFIG_PROVE_RCU=y. ] --- kernel/rcu/tree.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree.h | 2 ++ 2 files changed, 66 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b1fffa21b9e4..6ce82c009195 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1681,6 +1681,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, } trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); goto unlock_out; @@ -2113,6 +2114,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Advance CBs to reduce false positives below. */ if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); + rsp->gp_req_activity = jiffies; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } @@ -2744,6 +2746,65 @@ static void force_quiescent_state(struct rcu_state *rsp) rcu_gp_kthread_wake(rsp); } +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void +rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(rsp); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || + rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp))) + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + time_before(j, rsp->gp_req_activity + HZ) || + time_before(j, rsp->gp_activity + HZ) || + atomic_xchg(&warned, 1)) { + raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + pr_alert("%s: g%lu %d%d%d%d gar:%lu ga:%lu f%#x %s->state:%#lx\n", + __func__, READ_ONCE(rsp->gpnum), + need_future_gp_element(rcu_get_root(rsp), 0), + need_future_gp_element(rcu_get_root(rsp), 1), + need_future_gp_element(rcu_get_root(rsp), 2), + need_future_gp_element(rcu_get_root(rsp), 3), + j - rsp->gp_req_activity, j - rsp->gp_activity, + rsp->gp_flags, rsp->name, + rsp->gp_kthread ? 
rsp->gp_kthread->state : 0x1ffffL); + WARN_ON(1); + if (rnp_root != rnp) + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + /* * This does the RCU core processing work for the specified rcu_state * and rcu_data structures. This may be called only from the CPU to @@ -2755,7 +2816,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; WARN_ON_ONCE(!rdp->beenonline); @@ -2769,7 +2830,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { local_irq_restore(flags); } else { - rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2778,6 +2838,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) } } + rcu_check_gp_start_stall(rsp, rnp, rdp); + /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_callbacks(rsp, rdp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7365ac53fdd9..3c1942174c56 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -374,6 +374,8 @@ struct rcu_state { /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ /* activity in jiffies. */ + unsigned long gp_req_activity; /* Time of last GP request */ + /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ -- cgit v1.2.3 From 18390aeae7010ee56b132dcfb663ba362a099d99 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Apr 2018 15:06:05 -0700 Subject: rcu: Make rcu_gp_cleanup() write only once to ->gp_flags At the end of rcu_gp_cleanup(), if another grace period is needed, but not via rcu_accelerate_cbs(), the ->gp_flags field is written twice, once when making the new grace-period request, and once when clearing all other types of requests. This commit therefore adds an else-clause to avoid this double write. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6ce82c009195..a9a4a260ea7d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2117,8 +2117,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_req_activity = jiffies; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + } else { + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); } - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } -- cgit v1.2.3 From de30ad512a668b56e7ad7a5a7c379d7c5d138a94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Apr 2018 11:52:09 -0700 Subject: rcu: Introduce grace-period sequence numbers This commit adds grace-period sequence numbers (->gp_seq) to the rcu_state, rcu_node, and rcu_data structures, and updates them. It also checks for consistency between rsp->gpnum and rsp->gp_seq. These ->gp_seq counters will eventually replace the existing ->gpnum and ->completed counters, allowing a single memory access to determine whether or not a grace period is in progress and if so, which one. This in turn will enable changes that will reduce ->lock contention on the leaf rcu_node structures. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 16 +++++++++++++++- kernel/rcu/tree.h | 3 +++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a9a4a260ea7d..467cd8e5c6ff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,6 +97,7 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ + .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -1849,6 +1850,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); } + if (rdp->gp_seq != rnp->gp_seq) + rdp->gp_seq = rnp->gp_seq; return ret; } @@ -1910,7 +1913,10 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); /* Record GP times before starting GP, hence smp_store_release(). */ + WARN_ON_ONCE(rsp->gpnum << RCU_SEQ_CTR_SHIFT != rsp->gp_seq); smp_store_release(&rsp->gpnum, rsp->gpnum + 1); + smp_mb(); /* Pairs with barriers in stall-warning code. */ + rcu_seq_start(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1984,6 +1990,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rnp->gpnum, rsp->gpnum); if (WARN_ON_ONCE(rnp->completed != rsp->completed)) WRITE_ONCE(rnp->completed, rsp->completed); + WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -2050,6 +2057,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; + unsigned long new_gp_seq; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2079,12 +2087,15 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * all of the rcu_node structures before the beginning of the next * grace period is recorded in any of the rcu_node structures. */ + new_gp_seq = rsp->gp_seq; + rcu_seq_end(&new_gp_seq); rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); + WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; @@ -2098,10 +2109,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); + rcu_seq_end(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ @@ -3612,6 +3624,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. 
*/ rdp->completed = rnp->completed; + rdp->gp_seq = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; @@ -3991,6 +4004,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) &rcu_fqs_class[i], fqs[i]); rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; + rnp->gp_seq = rsp->gp_seq; rnp->completedqs = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3c1942174c56..50a28d1cf5a1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -87,6 +87,7 @@ struct rcu_node { unsigned long completed; /* Last GP completed for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -211,6 +212,7 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ @@ -343,6 +345,7 @@ struct rcu_state { /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ + unsigned long gp_seq; /* Grace-period sequence #. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ -- cgit v1.2.3 From dee4f42298bba030e84035aca5c114f9fee8fa6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Apr 2018 15:30:28 -0700 Subject: rcu: Move rcu_gp_slow() to ->gp_seq This commit moves rcu_gp_slow() to ->gp_seq. This function only uses the grace-period number to modulate delay, so rcu_seq_ctr(rsp->gp_seq) gets the same effect, at least in cases where the delay is to happen more than four times per wrap of an unsigned long. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 467cd8e5c6ff..3c3af7e2758f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1879,7 +1879,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_gp_slow(struct rcu_state *rsp, int delay) { if (delay > 0 && - !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + !(rcu_seq_ctr(rsp->gp_seq) % + (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_uninterruptible(delay); } -- cgit v1.2.3 From 17ef2fe97c8c8e754e4a702c42f8e5b0ffadf4dd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 11:39:34 -0700 Subject: rcu: Make rcutorture's batches-completed API use ->gp_seq The rcutorture test invokes rcu_batches_started(), rcu_batches_completed(), rcu_batches_started_bh(), rcu_batches_completed_bh(), rcu_batches_started_sched(), and rcu_batches_completed_sched() to do grace-period consistency checks, and rcuperf uses the _completed variants for statistics. These functions use ->gpnum and ->completed. 
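[ Editor's note: a stand-alone model of the ->gp_seq encoding that this and the preceding commits rely on, assuming the two-bit state field implied by RCU_SEQ_CTR_SHIFT; a sketch for illustration only -- see the rcu_seq_*() helpers in kernel/rcu/rcu.h for the real operations and their memory ordering. ]

#include <assert.h>

#define SEQ_CTR_SHIFT	2			/* Assumed RCU_SEQ_CTR_SHIFT. */
#define SEQ_STATE_MASK	((1UL << SEQ_CTR_SHIFT) - 1)

static unsigned long seq_ctr(unsigned long s)   { return s >> SEQ_CTR_SHIFT; }
static unsigned long seq_state(unsigned long s) { return s & SEQ_STATE_MASK; }

static void seq_start(unsigned long *sp)	/* A grace period begins. */
{
	(*sp)++;			/* Idle (state 0) -> in flight (state 1). */
}

static void seq_end(unsigned long *sp)		/* The grace period completes. */
{
	*sp = (*sp | SEQ_STATE_MASK) + 1;	/* Round up: next count, state 0. */
}

int main(void)
{
	unsigned long gp_seq = 5UL << SEQ_CTR_SHIFT;	/* Five GPs done, idle. */

	seq_start(&gp_seq);
	assert(seq_ctr(gp_seq) == 5 && seq_state(gp_seq) == 1);
	seq_end(&gp_seq);
	assert(seq_ctr(gp_seq) == 6 && seq_state(gp_seq) == 0);
	return 0;
}

[ A single load thus answers both "is a grace period in progress?" (state bits) and "which one?" (counter bits), which is the motivation stated in the commit that introduced ->gp_seq. ]
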
This commit therefore replaces them with rcu_get_gp_seq(), rcu_bh_get_gp_seq(), and rcu_sched_get_gp_seq(), adjusting rcutorture and rcuperf to make use of them. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 18 ++++++----------- kernel/rcu/rcuperf.c | 26 +++++++++---------------- kernel/rcu/rcutorture.c | 50 +++++++++++++++++------------------------------- kernel/rcu/tree.c | 51 ++++++++++++------------------------------------- 4 files changed, 45 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index db0870acfdff..f0907f9f6cd0 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -463,12 +463,9 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU -static inline unsigned long rcu_batches_started(void) { return 0; } -static inline unsigned long rcu_batches_started_bh(void) { return 0; } -static inline unsigned long rcu_batches_started_sched(void) { return 0; } -static inline unsigned long rcu_batches_completed(void) { return 0; } -static inline unsigned long rcu_batches_completed_bh(void) { return 0; } -static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_get_gp_seq(void) { return 0; } +static inline unsigned long rcu_bh_get_gp_seq(void) { return 0; } +static inline unsigned long rcu_sched_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } static inline unsigned long @@ -480,12 +477,9 @@ static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -unsigned long rcu_batches_started(void); -unsigned long rcu_batches_started_bh(void); -unsigned long rcu_batches_started_sched(void); -unsigned long rcu_batches_completed(void); -unsigned long rcu_batches_completed_bh(void); -unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_get_gp_seq(void); +unsigned long rcu_bh_get_gp_seq(void); +unsigned long rcu_sched_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index df29119b2013..2b5a613afcf3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -138,8 +138,7 @@ struct rcu_perf_ops { void (*cleanup)(void); int (*readlock)(void); void (*readunlock)(int idx); - unsigned long (*started)(void); - unsigned long (*completed)(void); + unsigned long (*get_gp_seq)(void); unsigned long (*exp_completed)(void); void (*async)(struct rcu_head *head, rcu_callback_t func); void (*gp_barrier)(void); @@ -179,8 +178,7 @@ static struct rcu_perf_ops rcu_ops = { .init = rcu_sync_perf_init, .readlock = rcu_perf_read_lock, .readunlock = rcu_perf_read_unlock, - .started = rcu_batches_started, - .completed = rcu_batches_completed, + .get_gp_seq = rcu_get_gp_seq, .exp_completed = rcu_exp_batches_completed, .async = call_rcu, .gp_barrier = rcu_barrier, @@ -209,8 +207,7 @@ static struct rcu_perf_ops rcu_bh_ops = { .init = rcu_sync_perf_init, .readlock = rcu_bh_perf_read_lock, .readunlock = rcu_bh_perf_read_unlock, - .started = rcu_batches_started_bh, - .completed = rcu_batches_completed_bh, + .get_gp_seq = rcu_bh_get_gp_seq, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_bh, .gp_barrier = rcu_barrier_bh, 
@@ -266,8 +263,7 @@ static struct rcu_perf_ops srcu_ops = { .init = rcu_sync_perf_init, .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, - .started = NULL, - .completed = srcu_perf_completed, + .get_gp_seq = srcu_perf_completed, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -295,8 +291,7 @@ static struct rcu_perf_ops srcud_ops = { .cleanup = srcu_sync_perf_cleanup, .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, - .started = NULL, - .completed = srcu_perf_completed, + .get_gp_seq = srcu_perf_completed, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -325,8 +320,7 @@ static struct rcu_perf_ops sched_ops = { .init = rcu_sync_perf_init, .readlock = sched_perf_read_lock, .readunlock = sched_perf_read_unlock, - .started = rcu_batches_started_sched, - .completed = rcu_batches_completed_sched, + .get_gp_seq = rcu_sched_get_gp_seq, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_sched, .gp_barrier = rcu_barrier_sched, @@ -353,8 +347,7 @@ static struct rcu_perf_ops tasks_ops = { .init = rcu_sync_perf_init, .readlock = tasks_perf_read_lock, .readunlock = tasks_perf_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .async = call_rcu_tasks, .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, @@ -447,8 +440,7 @@ rcu_perf_writer(void *arg) b_rcu_perf_writer_started = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_started = - cur_ops->completed(); + b_rcu_perf_writer_started = cur_ops->get_gp_seq(); } } @@ -505,7 +497,7 @@ retry: cur_ops->exp_completed() / 2; } else { b_rcu_perf_writer_finished = - cur_ops->completed(); + cur_ops->get_gp_seq(); } if (shutdown) { smp_mb(); /* Assign before wake. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5604bfac8df4..1f66597c7783 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -264,8 +264,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - unsigned long (*started)(void); - unsigned long (*completed)(void); + unsigned long (*get_gp_seq)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -305,10 +304,10 @@ static void rcu_read_delay(struct torture_random_state *rrsp) * force_quiescent_state. */ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); mdelay(longdelay_ms); - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); do_trace_rcu_torture_read(cur_ops->name, NULL, ts, started, completed); } @@ -400,8 +399,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .started = rcu_batches_started, - .completed = rcu_batches_completed, + .get_gp_seq = rcu_get_gp_seq, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -442,8 +440,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = rcu_bh_torture_read_unlock, - .started = rcu_batches_started_bh, - .completed = rcu_batches_completed_bh, + .get_gp_seq = rcu_bh_get_gp_seq, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -486,8 +483,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, @@ -575,8 +571,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, - .started = NULL, - .completed = srcu_torture_completed, + .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -613,8 +608,7 @@ static struct rcu_torture_ops srcud_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, - .started = NULL, - .completed = srcu_torture_completed, + .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -651,8 +645,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .started = rcu_batches_started_sched, - .completed = rcu_batches_completed_sched, + .get_gp_seq = rcu_sched_get_gp_seq, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -690,8 +683,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = tasks_torture_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, @@ -1104,10 +1096,7 @@ static void rcu_torture_timer(struct timer_list *unused) unsigned long long ts; idx = cur_ops->readlock(); - if (cur_ops->started) - started = cur_ops->started(); - else - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1131,7 +1120,7 @@ static void rcu_torture_timer(struct timer_list *unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); @@ -1139,8 +1128,8 @@ static void rcu_torture_timer(struct timer_list *unused) } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; - if (cur_ops->started) - completed++; + if (completed > ULONG_MAX >> 1) + completed = 0; /* Not all gp_seq have full range. */ if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... 
*/ completed = RCU_TORTURE_PIPE_LEN; @@ -1187,10 +1176,7 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - if (cur_ops->started) - started = cur_ops->started(); - else - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1212,7 +1198,7 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); @@ -1220,8 +1206,8 @@ rcu_torture_reader(void *arg) } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; - if (cur_ops->started) - completed++; + if (completed > ULONG_MAX >> 1) + completed = 0; /* Not all gp_seq have full range. */ if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c3af7e2758f..547112bec26a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -530,58 +530,31 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU batches started thus far for debug & stats. + * Return the number of RCU GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started(void) +unsigned long rcu_get_gp_seq(void) { - return rcu_state_p->gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_state_p->gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started); +EXPORT_SYMBOL_GPL(rcu_get_gp_seq); /* - * Return the number of RCU-sched batches started thus far for debug & stats. + * Return the number of RCU-sched GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started_sched(void) +unsigned long rcu_sched_get_gp_seq(void) { - return rcu_sched_state.gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_sched_state.gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started_sched); +EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); /* - * Return the number of RCU BH batches started thus far for debug & stats. + * Return the number of RCU-bh GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started_bh(void) +unsigned long rcu_bh_get_gp_seq(void) { - return rcu_bh_state.gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_bh_state.gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started_bh); - -/* - * Return the number of RCU batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_state_p->completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU-sched batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed_sched(void) -{ - return rcu_sched_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); - -/* - * Return the number of RCU BH batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed_bh(void) -{ - return rcu_bh_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); /* * Return the number of RCU expedited batches completed thus far for -- cgit v1.2.3 From 78c5a67f1788f59d991c8e78d674ad3c8542f0ac Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 27 Apr 2018 13:32:28 -0700 Subject: rcu: Convert rcu_check_gp_kthread_starvation() to GP sequence number This commit switches rcu_check_gp_kthread_starvation() from printing ->gpnum and ->completed to printing ->gp_seq upon detecting a starving RCU grace-period kthread during an RCU CPU stall warning. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 547112bec26a..002f85357226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1276,9 +1276,9 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, - rsp->gpnum, rsp->completed, + (long)rcu_seq_current(&rsp->gp_seq), rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0, -- cgit v1.2.3 From c9a24e2d0c7d33b141167f5fa13f95cf6d35cb1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 14:54:46 -0700 Subject: rcu: Make quiescent-state reporting use ->gp_seq This commit switches the functions reporting quiescent states from use of ->gpnum to ->gp_seq. In either case, the point is to handle races where a given grace period ends before a quiescent state can be reported. Failing to catch these races would result in too-short grace periods, hence the checking. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 002f85357226..a54587dc13f0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2242,7 +2242,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * must be represented by the same rcu_node structure (which need not be a * leaf rcu_node structure, though it often will be). The gps parameter * is the grace-period snapshot, which means that the quiescent states - * are valid only if rnp->gpnum is equal to gps. That structure's lock + * are valid only if rnp->gp_seq is equal to gps. That structure's lock * must be held upon entry, and it is released before return. */ static void @@ -2257,7 +2257,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { + if (!(rnp->qsmask & mask) || rnp->gp_seq != gps) { /* * Our bit has already been cleared, or the @@ -2335,8 +2335,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; } - /* Report up the rest of the hierarchy, tracking current ->gpnum. */ - gps = rnp->gpnum; + /* Report up the rest of the hierarchy, tracking current ->gp_seq. */ + gps = rnp->gp_seq; mask = rnp->grpmask; raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. 
*/ @@ -2357,8 +2357,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2383,7 +2383,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(rsp); @@ -2688,8 +2688,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) } } if (mask != 0) { - /* Idle/offline CPUs, report (releases rnp->lock. */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + /* Idle/offline CPUs, report (releases rnp->lock). */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit v1.2.3 From e4be81a2ed3a7356a2c22c7571af622ceb57eb2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 15:16:50 -0700 Subject: rcu: Convert conditional grace-period primitives to ->gp_seq This commit converts get_state_synchronize_rcu(), cond_synchronize_rcu(), get_state_synchronize_sched(), and cond_synchronize_sched() from ->gpnum and ->completed to ->gp_seq. Note that this also introduces a full memory barrier in the already-done paths of cond_synchronize_rcu() and cond_synchronize_sched(), as work with LKMM indicates that the earlier smp_load_acquire() calls were insufficiently strong in some situations where these two functions were called just as the grace period ended. In such cases, these two functions would not gain the benefit of memory ordering at the end of the grace period. Please note that the performance impact is negligible, as you shouldn't be using either function anywhere near a fastpath in any case. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 42 ++++++++++--------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a54587dc13f0..fd2f582a6db0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3183,16 +3183,10 @@ unsigned long get_state_synchronize_rcu(void) { /* * Any prior manipulation of RCU-protected data must happen - * before the load from ->gpnum. + * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - - /* - * Make sure this load happens before the purportedly - * time-consuming work between get_state_synchronize_rcu() - * and cond_synchronize_rcu(). - */ - return smp_load_acquire(&rcu_state_p->gpnum); + return rcu_seq_snap(&rcu_state_p->gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3212,15 +3206,10 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void cond_synchronize_rcu(unsigned long oldstate) { - unsigned long newstate; - - /* - * Ensure that this load happens before any RCU-destructive - * actions the caller might carry out after we return. - */ - newstate = smp_load_acquire(&rcu_state_p->completed); - if (ULONG_CMP_GE(oldstate, newstate)) + if (!rcu_seq_done(&rcu_state_p->gp_seq, oldstate)) synchronize_rcu(); + else + smp_mb(); /* Ensure GP ends before subsequent accesses. 
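
A typical caller of the pair being converted looks roughly like this (an illustrative sketch; do_something_time_consuming() is a hypothetical placeholder):

	unsigned long oldstate;

	oldstate = get_state_synchronize_rcu();	/* Now rcu_seq_snap() inside. */
	do_something_time_consuming();		/* Hopefully spans a full GP. */
	cond_synchronize_rcu(oldstate);		/* No-op if one already elapsed. */

The smp_mb() added on the already-done path is what lets such a caller treat the no-op case exactly like a real synchronize_rcu() for ordering purposes.
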
*/ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); @@ -3235,16 +3224,10 @@ unsigned long get_state_synchronize_sched(void) { /* * Any prior manipulation of RCU-protected data must happen - * before the load from ->gpnum. + * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - - /* - * Make sure this load happens before the purportedly - * time-consuming work between get_state_synchronize_sched() - * and cond_synchronize_sched(). - */ - return smp_load_acquire(&rcu_sched_state.gpnum); + return rcu_seq_snap(&rcu_sched_state.gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_sched); @@ -3264,15 +3247,10 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_sched); */ void cond_synchronize_sched(unsigned long oldstate) { - unsigned long newstate; - - /* - * Ensure that this load happens before any RCU-destructive - * actions the caller might carry out after we return. - */ - newstate = smp_load_acquire(&rcu_sched_state.completed); - if (ULONG_CMP_GE(oldstate, newstate)) + if (!rcu_seq_done(&rcu_sched_state.gp_seq, oldstate)) synchronize_sched(); + else + smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_sched); -- cgit v1.2.3 From 67e14c1e39d2d956300b3d6ad00f7708e3285531 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 16:01:46 -0700 Subject: rcu: Move RCU's grace-period-change code to ->gp_seq This commit moves __note_gp_changes(), note_gp_changes(), and __rcu_pending() to ->gp_seq, creating new rcu_seq_completed_gp() and rcu_seq_new_gp() functions for this purpose. Signed-off-by: Paul E. McKenney [ paulmck: Reinstate "cpuend" trace as suggested by Joel Fernandes. ] --- kernel/rcu/rcu.h | 17 +++++++++++++++ kernel/rcu/tree.c | 39 ++++++++++++++++----------------------- 2 files changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f0907f9f6cd0..7568a3fd0815 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -116,6 +116,23 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } +/* + * Has a grace period completed since the time the old gp_seq was collected? + */ +static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new) +{ + return ULONG_CMP_LT(old, new & ~RCU_SEQ_STATE_MASK); +} + +/* + * Has a grace period started since the time the old gp_seq was collected? + */ +static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) +{ + return ULONG_CMP_LT((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK, + new); +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fd2f582a6db0..bcd659e65dfd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1790,24 +1790,23 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, raw_lockdep_assert_held_rcu_node(rnp); - /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed && - !unlikely(READ_ONCE(rdp->gpwrap))) { - - /* No grace period end, so just accelerate recent callbacks. */ - ret = rcu_accelerate_cbs(rsp, rnp, rdp); - - } else { - - /* Advance callbacks. */ - ret = rcu_advance_cbs(rsp, rnp, rdp); + if (rdp->gp_seq == rnp->gp_seq) + return false; /* Nothing to do. */ + /* Handle the ends of any preceding grace periods first. 
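
Worked values for the two helpers just added, assuming the two-state-bit layout and taking ULONG_CMP_LT() as wrap-tolerant signed comparison (a user-space sanity check, not kernel code):

	/* old == 0x9: snapshot taken while GP ctr 2 ran; that GP ends at 0xc. */
	assert(!rcu_seq_completed_gp(0x9, 0x9));	/* Still running. */
	assert(rcu_seq_completed_gp(0x9, 0xc));		/* That GP has ended... */
	assert(!rcu_seq_new_gp(0x9, 0xc));		/* ...but no newer GP yet. */
	assert(rcu_seq_new_gp(0x9, 0xd));		/* GP ctr 3 has started. */
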
*/ + if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || + unlikely(READ_ONCE(rdp->gpwrap))) { + ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); + } else { + ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ } - if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) { + /* Now handle the beginnings of any new-to-this-CPU grace periods. */ + if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || + unlikely(READ_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1823,8 +1822,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); } - if (rdp->gp_seq != rnp->gp_seq) - rdp->gp_seq = rnp->gp_seq; + rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ return ret; } @@ -1836,8 +1834,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; - if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && - rdp->completed == READ_ONCE(rnp->completed) && + if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); @@ -3286,12 +3283,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; - /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ - return 1; - - /* Has a new RCU grace period started? */ - if (READ_ONCE(rnp->gpnum) != rdp->gpnum || + /* Have RCU grace period completed or started? */ + if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq || unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; -- cgit v1.2.3 From a66ae8ae35de60a85d2d89689d8c2d6a4dc15d85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 18:06:08 -0700 Subject: rcu: Convert rcu_gpnum_ovf() to ->gp_seq This commit converts rcu_gpnum_ovf() to use ->gp_seq instead of ->gpnum. Same size unsigned long, so same approach. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcd659e65dfd..de2e2c5d64bb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1089,14 +1089,15 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* * We are reporting a quiescent state on behalf of some other CPU, so * it is our responsibility to check for and handle potential overflow - * of the rcu_node ->gpnum counter with respect to the rcu_data counters. + * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. * After all, the CPU might be in deep idle state, and thus executing no * code whatsoever. 
*/ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); - if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) + if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, + rnp->gp_seq)) WRITE_ONCE(rdp->gpwrap, true); if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; -- cgit v1.2.3 From e05720b0977bd50707ea6cf296f99e709de3f760 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 18:58:58 -0700 Subject: rcu: Move rcu_implicit_dynticks_qs() to ->gp_seq This commit makes rcu_implicit_dynticks_qs() use ->gp_seq, with the exception of tracing, which will be converted later. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index de2e2c5d64bb..b3af3d24286c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1178,7 +1178,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && - READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { + rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); rcu_gpnum_ovf(rnp, rdp); return 1; -- cgit v1.2.3 From 03c8cb765a747c02fd8d3fade1efe9d529ad54bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 19:56:16 -0700 Subject: rcu: Move rcu_try_advance_all_cbs() to ->gp_seq This commit makes rcu_try_advance_all_cbs() use ->gp_seq, with the exception of tracing, which will be converted later. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e387ea712758..d893899b72f4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1467,7 +1467,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. */ - if ((rdp->completed != rnp->completed || + if ((rcu_seq_completed_gp(rdp->gp_seq, + rcu_seq_current(&rnp->gp_seq)) || unlikely(READ_ONCE(rdp->gpwrap))) && rcu_segcblist_pend_cbs(&rdp->cblist)) note_gp_changes(rsp, rdp); -- cgit v1.2.3 From e0da2374c3881cb9a512e2718f9ca655a48de9db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 20:51:36 -0700 Subject: rcu: Move rcu_nocb_gp_get() to ->gp_seq This commit makes rcu_nocb_gp_get() use ->gp_seq. It uses rcu_seq_ctr() in order to shift away the state bits, so that the low-order bits of the result may safely be used to index ->nocb_gp_wq[]. Signed-off-by: Paul E. 
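
To make the indexing trick concrete (an illustration reusing the layout sketched earlier, not part of the patch):

	/*
	 * Consecutive idle ->gp_seq values run 0x8, 0xc, 0x10, ..., so the
	 * raw low-order bit is always zero and cannot alternate.  Shifting
	 * the state bits away first yields 2, 3, 4, ..., whose low-order
	 * bit does ping-pong from one grace period to the next, making it
	 * a safe index into the two-element ->nocb_gp_wq[] array.
	 */
	int idx = rcu_seq_ctr(rnp->gp_seq) & 0x1;
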
McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d893899b72f4..5b10904669c5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1852,7 +1852,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { - return &rnp->nocb_gp_wq[rnp->completed & 0x1]; + return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; } static void rcu_init_one_nocb(struct rcu_node *rnp) -- cgit v1.2.3 From ba04107fc901ddce49686944f7e038b4f0b0a359 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 21:25:01 -0700 Subject: rcu: Move rcu_gp_in_progress() to ->gp_seq This commit makes rcu_gp_in_progress() use ->gp_seq instead of ->completed and ->gpnum. The READ_ONCE() invocations are buried in rcu_seq_current(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3af3d24286c..56445f4c09a8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,7 +219,7 @@ unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) */ static int rcu_gp_in_progress(struct rcu_state *rsp) { - return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum); + return rcu_seq_state(rcu_seq_current(&rsp->gp_seq)); } /* -- cgit v1.2.3 From 8aa670cdacc1820cb0597e4b4b413ef91ede2dd9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 28 Apr 2018 14:15:40 -0700 Subject: rcu: Convert ->rcu_iw_gpnum to ->gp_seq This commit switches the interrupt-disabled detection mechanism to ->gp_seq. This mechanism is used as part of RCU CPU stall warnings, and detects cases where the stall is due to a CPU having interrupts disabled. Signed-off-by: Paul E. 
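
In rough outline, the detection mechanism being converted works as follows (a heavily simplified sketch of the handshake visible in the diff below):

	/* Stall-warning side: ask the suspect CPU to prove IRQs are enabled. */
	if (!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq) {
		rdp->rcu_iw_pending = true;
		rdp->rcu_iw_gp_seq = rnp->gp_seq;  /* Tag request with this GP. */
		irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
	}

	/*
	 * Suspect-CPU side: an irq_work handler can run only once interrupts
	 * are enabled there, so ->rcu_iw_pending remaining set for the whole
	 * stall is strong evidence that the CPU has interrupts disabled.
	 */
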
McKenney --- kernel/rcu/tree.c | 12 ++++++------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 56445f4c09a8..2ddbd1cfb31a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1099,8 +1099,8 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, rnp->gp_seq)) WRITE_ONCE(rdp->gpwrap, true); - if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) - rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; + if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) + rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; } /* @@ -1134,7 +1134,7 @@ static void rcu_iw_handler(struct irq_work *iwp) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { - rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_gp_seq = rnp->gp_seq; rdp->rcu_iw_pending = false; } raw_spin_unlock_rcu_node(rnp); @@ -1231,11 +1231,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { resched_cpu(rdp->cpu); if (IS_ENABLED(CONFIG_IRQ_WORK) && - !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum && + !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { init_irq_work(&rdp->rcu_iw, rcu_iw_handler); rdp->rcu_iw_pending = true; - rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); } } @@ -3575,7 +3575,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gpnum = rnp->gpnum - 1; + rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 50a28d1cf5a1..6d6cbc8b3a9c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -286,7 +286,7 @@ struct rcu_data { /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ bool rcu_iw_pending; /* Is ->rcu_iw pending? */ - unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */ + unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5b10904669c5..bc32e1f434a6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1763,7 +1763,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], -- cgit v1.2.3 From d43a5d32e125db1e34641922e52baaa4fee3510a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 28 Apr 2018 18:50:06 -0700 Subject: rcu: Convert ->completedqs to ->gp_seq This commit switches the quiescent-state no-backtracking checks from ->gpnum and ->completed to ->gp_seq. Signed-off-by: Paul E. 
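
Note the subtract-then-shift pattern in the delta computation above: subtracting the raw ->gp_seq values first keeps counter wrap harmless, and only then are the state bits shifted away. A worked value (illustrative):

	unsigned long newer = 0x10;	/* Idle, ctr 4. */
	unsigned long older = 0x8;	/* Idle, ctr 2. */
	unsigned long delta = rcu_seq_ctr(newer - older);	/* == 2 GPs. */
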
McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2ddbd1cfb31a..da2ca9cc078b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2278,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - rnp->completedqs = rnp->gpnum; + rnp->completedqs = rnp->gp_seq; mask = rnp->grpmask; if (rnp->parent == NULL) { @@ -3951,7 +3951,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; - rnp->completedqs = rsp->completed; + rnp->completedqs = rsp->gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bc32e1f434a6..2a81d243521f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -262,7 +262,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { rnp->gp_tasks = &t->rcu_node_entry; - WARN_ON_ONCE(rnp->completedqs == rnp->gpnum); + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; @@ -537,7 +537,7 @@ void rcu_read_unlock_special(struct task_struct *t) WARN_ON_ONCE(rnp != t->rcu_blocked_node); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); - WARN_ON_ONCE(rnp->completedqs == rnp->gpnum && + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ -- cgit v1.2.3 From 29365e563b1e4e5bfde211280d37dc6127c019ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Apr 2018 10:57:36 -0700 Subject: rcu: Convert grace-period requests to ->gp_seq This commit converts the grace-period request code paths from ->completed and ->gpnum to ->gp_seq. The need_future_gp_element() macro encapsulates the shift operation required to use ->gp_seq as an index to the ->need_future_gp[] array. The rcu_cbs_completed() function is removed in favor of the rcu_seq_snap() function. The rcu_start_this_gp() function gets some temporary consistency checks and uses rcu_seq_done(), rcu_seq_current(), rcu_seq_state(), and rcu_gp_in_progress() in place of the earlier open-coded comparisons of ->gpnum and ->completed. The rcu_future_gp_cleanup() function replaces use of ->completed with ->gp_seq. The rcu_accelerate_cbs() function replaces a call to rcu_cbs_completed() with one to rcu_seq_snap(). The rcu_advance_cbs() function replaces an access to ->completed with one to ->gp_seq and adds some temporary warnings. The rcu_nocb_wait_gp() function replaces a call to rcu_cbs_completed() with one to rcu_seq_snap() and an open-coded comparison with rcu_seq_done(). The temporary warnings will be removed when the various ->gpnum and ->completed fields are removed. Their purpose is to locate code that might still be using ->gpnum and ->completed. (Much easier that way than trying to trace down the causes of too-short grace periods and grace-period hangs!) Signed-off-by: Paul E. 
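
The rcu_seq_snap() function that takes over from rcu_cbs_completed() returns the ->gp_seq value at which a full grace period starting after the call is guaranteed to have completed. A sketch with worked values (assuming the two-state-bit layout; the real helper lives in kernel/rcu/rcu.h):

	static inline unsigned long rcu_seq_snap(unsigned long *sp)
	{
		unsigned long s;

		/* Round up past any GP in flight, plus one full GP. */
		s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) &
		    ~RCU_SEQ_STATE_MASK;
		smp_mb();	/* Order snapshot against later accesses. */
		return s;
	}

	/*
	 * *sp == 0x8 (idle):    returns 0xc, done once the next GP ends.
	 * *sp == 0x9 (running): returns 0x10, because the GP already in
	 *                       flight may have begun too early to help,
	 *                       so one more full GP must follow it.
	 */
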
McKenney --- kernel/rcu/tree.c | 84 ++++++++++++------------------------------------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 ++-- 3 files changed, 24 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da2ca9cc078b..ffa45b6175d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1548,52 +1548,6 @@ void rcu_cpu_stall_reset(void) WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); } -/* - * Determine the value that ->completed will have at the end of the - * next subsequent grace period. This is used to tag callbacks so that - * a CPU can invoke callbacks in a timely fashion even if that CPU has - * been dyntick-idle for an extended period with callbacks under the - * influence of RCU_FAST_NO_HZ. - * - * The caller must hold rnp->lock with interrupts disabled. - */ -static unsigned long rcu_cbs_completed(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - - /* - * If RCU is idle, we just wait for the next grace period. - * But we can only be sure that RCU is idle if we are looking - * at the root rcu_node structure -- otherwise, a new grace - * period might have started, but just not yet gotten around - * to initializing the current non-root rcu_node structure. - */ - if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) - return rnp->completed + 1; - - /* - * If the current rcu_node structure believes that RCU is - * idle, and if the rcu_state structure does not yet reflect - * the start of a new grace period, then the next grace period - * will suffice. The memory barrier is needed to accurately - * sample the rsp->gpnum, and pairs with the second lock - * acquisition in rcu_gp_init(), which is augmented with - * smp_mb__after_unlock_lock() for this purpose. - */ - if (rnp->gpnum == rnp->completed) { - smp_mb(); /* See above block comment. */ - if (READ_ONCE(rsp->gpnum) == rnp->completed) - return rnp->completed + 1; - } - - /* - * Otherwise, wait for a possible partial grace period and - * then the subsequent full grace period. - */ - return rnp->completed + 2; -} - /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long c, const char *s) @@ -1629,16 +1583,16 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. */ raw_lockdep_assert_held_rcu_node(rnp); + WARN_ON_ONCE(c & 0x2); /* Catch any lingering use of ->gpnum. */ + WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq)); /* Catch any ->completed/->gp_seq mismatches. */ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + - need_future_gp_mask(), c)); if (need_future_gp_element(rnp_root, c) || - ULONG_CMP_GE(rnp_root->gpnum, c) || + rcu_seq_done(&rnp_root->gp_seq, c) || (rnp != rnp_root && - rnp_root->gpnum != rnp_root->completed)) { + rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); goto unlock_out; } @@ -1650,7 +1604,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* If GP already in progress, just leave, otherwise start one. 
*/ - if (rnp_root->gpnum != rnp_root->completed) { + if (rcu_gp_in_progress(rsp)) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } @@ -1675,7 +1629,7 @@ unlock_out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - unsigned long c = rnp->completed; + unsigned long c = rnp->gp_seq; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); @@ -1703,14 +1657,14 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) } /* - * If there is room, assign a ->completed number to any callbacks on - * this CPU that have not already been assigned. Also accelerate any - * callbacks that were previously assigned a ->completed number that has - * since proven to be too conservative, which can happen if callbacks get - * assigned a ->completed number while RCU is idle, but with reference to - * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. Returns an flag saying that we should - * awaken the RCU grace-period kthread. + * If there is room, assign a ->gp_seq number to any callbacks on this + * CPU that have not already been assigned. Also accelerate any callbacks + * that were previously assigned a ->gp_seq number that has since proven + * to be too conservative, which can happen if callbacks get assigned a + * ->gp_seq number while RCU is idle, but with reference to a non-root + * rcu_node structure. This function is idempotent, so it does not hurt + * to call it repeatedly. Returns an flag saying that we should awaken + * the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ @@ -1736,7 +1690,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - c = rcu_cbs_completed(rsp, rnp); + c = rcu_seq_snap(&rsp->gp_seq); if (rcu_segcblist_accelerate(&rdp->cblist, c)) ret = rcu_start_this_gp(rnp, rdp, c); @@ -1751,7 +1705,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* * Move any callbacks whose grace period has completed to the * RCU_DONE_TAIL sublist, then compact the remaining sublists and - * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... * Returns true if the RCU grace-period kthread needs to be awakened. @@ -1768,10 +1722,10 @@ static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, return false; /* - * Find all callbacks whose ->completed numbers indicate that they + * Find all callbacks whose ->gp_seq numbers indicate that they * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. */ - rcu_segcblist_advance(&rdp->cblist, rnp->completed); + rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq); /* Classify any remaining callbacks. */ return rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1889,6 +1843,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) smp_store_release(&rsp->gpnum, rsp->gpnum + 1); smp_mb(); /* Pairs with barriers in stall-warning code. */ rcu_seq_start(&rsp->gp_seq); + if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. 
*/ + pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6d6cbc8b3a9c..a21d403a6010 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -174,7 +174,7 @@ struct rcu_node { #define need_future_gp_mask() \ (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) #define need_future_gp_element(rnp, c) \ - ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) + ((rnp)->need_future_gp[(c >> RCU_SEQ_CTR_SHIFT) & need_future_gp_mask()]) #define need_any_future_gp(rnp) \ ({ \ int __i; \ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2a81d243521f..2036dc7426ac 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2105,7 +2105,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - c = rcu_cbs_completed(rdp->rsp, rnp); + c = rcu_seq_snap(&rdp->rsp->gp_seq); needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) @@ -2118,8 +2118,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( - rnp->nocb_gp_wq[c & 0x1], - (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); + rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], + (d = rcu_seq_done(&rnp->gp_seq, c))); if (likely(d)) break; WARN_ON(signal_pending(current)); -- cgit v1.2.3 From 471f87c3d91bd0884451c0e3071473476c165630 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Apr 2018 13:09:17 -0700 Subject: rcu: Make RCU CPU stall warnings use ->gp_seq This commit makes the RCU CPU stall-warning code in print_other_cpu_stall(), print_cpu_stall(), and check_cpu_stall() use ->gp_seq instead of ->gpnum and ->completed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 ++++++++++++++++++++++++------------------------ kernel/rcu/tree_plugin.h | 8 ++++---- 2 files changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ffa45b6175d9..9e619c4878d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1340,7 +1340,7 @@ static inline void panic_on_rcu_stall(void) panic("RCU Stall\n"); } -static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) { int cpu; unsigned long flags; @@ -1350,6 +1350,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + WARN_ON_ONCE(gp_seq & 0x2); /* Remove when ->gpnum removed. */ + /* Kick and suppress, if so configured. 
*/ rcu_stall_kick_kthreads(rsp); if (rcu_cpu_stall_suppress) @@ -1380,17 +1382,16 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, cpu)->cblist); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - (long)rsp->gpnum, (long)rsp->completed, totqlen); + (long)rcu_seq_current(&rsp->gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); } else { - if (READ_ONCE(rsp->gpnum) != gpnum || - READ_ONCE(rsp->completed) == gpnum) { + if (rcu_seq_current(&rsp->gp_seq) != gp_seq) { pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; @@ -1442,9 +1443,9 @@ static void print_cpu_stall(struct rcu_state *rsp) for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, cpu)->cblist); - pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", + pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", jiffies - rsp->gp_start, - (long)rsp->gpnum, (long)rsp->completed, totqlen); + (long)rcu_seq_current(&rsp->gp_seq), totqlen); rcu_check_gp_kthread_starvation(rsp); @@ -1471,8 +1472,8 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { - unsigned long completed; - unsigned long gpnum; + unsigned long gs1; + unsigned long gs2; unsigned long gps; unsigned long j; unsigned long jn; @@ -1488,28 +1489,28 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* * Lots of memory barriers to reject false positives. * - * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, - * then rsp->gp_start, and finally rsp->completed. These values - * are updated in the opposite order with memory barriers (or - * equivalent) during grace-period initialization and cleanup. - * Now, a false positive can occur if we get an new value of - * rsp->gp_start and a old value of rsp->jiffies_stall. But given - * the memory barriers, the only way that this can happen is if one - * grace period ends and another starts between these two fetches. - * Detect this by comparing rsp->completed with the previous fetch - * from rsp->gpnum. + * The idea is to pick up rsp->gp_seq, then rsp->jiffies_stall, + * then rsp->gp_start, and finally another copy of rsp->gp_seq. + * These values are updated in the opposite order with memory + * barriers (or equivalent) during grace-period initialization + * and cleanup. Now, a false positive can occur if we get an new + * value of rsp->gp_start and a old value of rsp->jiffies_stall. + * But given the memory barriers, the only way that this can happen + * is if one grace period ends and another starts between these + * two fetches. This is detected by comparing the second fetch + * of rsp->gp_seq with the previous fetch from rsp->gp_seq. * * Given this check, comparisons of jiffies, rsp->jiffies_stall, * and rsp->gp_start suffice to forestall false positives. */ - gpnum = READ_ONCE(rsp->gpnum); - smp_rmb(); /* Pick up ->gpnum first... */ + gs1 = READ_ONCE(rsp->gp_seq); + smp_rmb(); /* Pick up ->gp_seq first... */ js = READ_ONCE(rsp->jiffies_stall); smp_rmb(); /* ...then ->jiffies_stall before the rest... */ gps = READ_ONCE(rsp->gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->completed. 
*/ - completed = READ_ONCE(rsp->completed); - if (ULONG_CMP_GE(completed, gpnum) || + smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ + gs2 = READ_ONCE(rsp->gp_seq); + if (gs1 != gs2 || ULONG_CMP_LT(j, js) || ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ @@ -1527,7 +1528,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp, gpnum); + print_other_cpu_stall(rsp, gs2); } } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2036dc7426ac..f4a88e3c388d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1755,12 +1755,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) */ touch_nmi_watchdog(); - if (rsp->gpnum == rdp->gpnum) { + ticks_value = rcu_seq_ctr(rsp->gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; - } else { - ticks_title = "GPs behind"; - ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); -- cgit v1.2.3 From aebc82644b2c8eafa15e8c481fbafc1b41f4fbf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 06:42:51 -0700 Subject: rcutorture: Convert rcutorture_get_gp_data() to ->gp_seq SRCU has long used ->srcu_gp_seq, and now RCU uses ->gp_seq. This commit therefore moves the rcutorture_get_gp_data() function from a ->gpnum / ->completed pair to ->gp_seq. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 17 ++++++----------- kernel/rcu/rcutorture.c | 24 ++++++++++-------------- kernel/rcu/srcutree.c | 5 ++--- kernel/rcu/tree.c | 5 ++--- 4 files changed, 20 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7568a3fd0815..003671825d62 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -425,7 +425,7 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed); + unsigned long *gp_seq); void rcutorture_record_test_transition(void); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, @@ -435,13 +435,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, - unsigned long *gpnum, - unsigned long *completed) + int *flags, unsigned long *gp_seq) { *flags = 0; - *gpnum = 0; - *completed = 0; + *gp_seq = 0; } static inline void rcutorture_record_test_transition(void) { } static inline void rcutorture_record_progress(unsigned long vernum) { } @@ -461,21 +458,19 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) + unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = sp->srcu_idx; - *gpnum = *completed; + *gp_seq = sp->srcu_idx; } #elif defined(CONFIG_TREE_SRCU) void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed); + unsigned 
long *gp_seq); #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1f66597c7783..81fb43530d64 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1313,18 +1313,16 @@ rcu_torture_stats_print(void) if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { int __maybe_unused flags = 0; - unsigned long __maybe_unused gpnum = 0; - unsigned long __maybe_unused completed = 0; + unsigned long __maybe_unused gp_seq = 0; rcutorture_get_gp_data(cur_ops->ttype, - &flags, &gpnum, &completed); + &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gpnum, &completed); + &flags, &gp_seq); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", + pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - gpnum, completed, flags, + rcu_torture_writer_state, gp_seq, flags, wtp == NULL ? ~0UL : wtp->state, wtp == NULL ? -1 : (int)task_cpu(wtp)); if (!splatted && wtp) { @@ -1605,8 +1603,7 @@ static void rcu_torture_cleanup(void) { int flags = 0; - unsigned long gpnum = 0; - unsigned long completed = 0; + unsigned long gp_seq = 0; int i; rcutorture_record_test_transition(); @@ -1637,11 +1634,10 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gpnum, &completed); - pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", - cur_ops->name, gpnum, completed, flags); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + pr_alert("%s: End-test grace-period state: g%lu f%#x\n", + cur_ops->name, gp_seq, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5a1a9a07b407..d6d6ea9738c0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1249,13 +1249,12 @@ static void process_srcu(struct work_struct *work) void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed) + unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = rcu_seq_ctr(sp->srcu_gp_seq); - *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); + *gp_seq = rcu_seq_current(&sp->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9e619c4878d3..4a528a062cd4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -638,7 +638,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); * Send along grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed) + unsigned long *gp_seq) { struct rcu_state *rsp = NULL; @@ -658,8 +658,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, if (rsp == NULL) return; *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + *gp_seq = rcu_seq_current(&rsp->gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -- cgit v1.2.3 From 7a1d0f23ad70cd4813bf4b72735ea2c26a4f53fe Mon Sep 17 00:00:00 2001 From: "Paul E. 
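
With the old ->gpnum/->completed pair collapsed into one value, a consumer of this interface recovers both pieces of information from the single out-parameter; a hypothetical caller:

	int flags = 0;
	unsigned long gp_seq = 0;

	rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
	pr_alert("g%lu (%s) f%#x\n", rcu_seq_ctr(gp_seq),
		 rcu_seq_state(gp_seq) ? "in progress" : "idle", flags);
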
McKenney" Date: Tue, 1 May 2018 10:26:57 -0700 Subject: rcu: Move from ->need_future_gp[] to ->gp_seq_needed One problem with the ->need_future_gp[] array is that the grace-period assignment of each element changes as the grace periods complete. This means that it is necessary to hold a lock when checking this array to learn if a given grace period has already been requested. This increase lock contention, which is the opposite of helpful. This commit therefore replaces the ->need_future_gp[] with a single ->gp_seq_needed value and keeps it updated in the rcu_data structure. This will enable reliable lockless checking of whether or not a given grace period has already been requested. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 34 ++++++++++++++++++---------------- kernel/rcu/tree.h | 19 ++----------------- 2 files changed, 20 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4a528a062cd4..1ede51690e4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1560,7 +1560,7 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp[] field. Returns true if there + * rcu_node structure's ->gp_seq_needed field. Returns true if there * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock, which @@ -1589,14 +1589,14 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - if (need_future_gp_element(rnp_root, c) || + if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || rcu_seq_done(&rnp_root->gp_seq, c) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); goto unlock_out; } - need_future_gp_element(rnp_root, c) = true; + rnp_root->gp_seq_needed = c; if (rnp_root != rnp && rnp_root->parent != NULL) raw_spin_unlock_rcu_node(rnp_root); if (!rnp_root->parent) @@ -1633,8 +1633,9 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - need_future_gp_element(rnp, c) = false; - needmore = need_any_future_gp(rnp); + needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); + if (!needmore) + rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ trace_rcu_this_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; @@ -2046,7 +2047,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); - if (need_any_future_gp(rnp)) { + if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, TPS("CleanupMore")); needgp = true; @@ -2700,8 +2701,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_node *rnp_root = rcu_get_root(rsp); static atomic_t warned = ATOMIC_INIT(0); - if (!IS_ENABLED(CONFIG_PROVE_RCU) || - rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp))) + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) return; j = jiffies; /* Expensive access, and in common case don't get here. 
*/ if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || @@ -2711,7 +2712,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; - if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + if (rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || atomic_read(&warned)) { @@ -2723,7 +2725,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ j = jiffies; - if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + if (rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || time_before(j, rsp->gp_req_activity + HZ) || time_before(j, rsp->gp_activity + HZ) || atomic_xchg(&warned, 1)) { @@ -2731,12 +2734,9 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - pr_alert("%s: g%lu %d%d%d%d gar:%lu ga:%lu f%#x %s->state:%#lx\n", - __func__, READ_ONCE(rsp->gpnum), - need_future_gp_element(rcu_get_root(rsp), 0), - need_future_gp_element(rcu_get_root(rsp), 1), - need_future_gp_element(rcu_get_root(rsp), 2), - need_future_gp_element(rcu_get_root(rsp), 3), + pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x %s->state:%#lx\n", + __func__, (long)READ_ONCE(rsp->gp_seq), + (long)READ_ONCE(rnp_root->gp_seq_needed), j - rsp->gp_req_activity, j - rsp->gp_activity, rsp->gp_flags, rsp->name, rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); @@ -3527,6 +3527,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->gp_seq = rnp->gp_seq; + rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; @@ -3907,6 +3908,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; + rnp->gp_seq_needed = rsp->gp_seq; rnp->completedqs = rsp->gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a21d403a6010..9329c1ff695f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -88,6 +88,7 @@ struct rcu_node { /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -160,7 +161,6 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; @@ -170,22 +170,6 @@ struct rcu_node { bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; -/* Accessors for ->need_future_gp[] array. 
*/ -#define need_future_gp_mask() \ - (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) -#define need_future_gp_element(rnp, c) \ - ((rnp)->need_future_gp[(c >> RCU_SEQ_CTR_SHIFT) & need_future_gp_mask()]) -#define need_any_future_gp(rnp) \ -({ \ - int __i; \ - bool __nonzero = false; \ - \ - for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ - __nonzero = __nonzero || \ - READ_ONCE((rnp)->need_future_gp[__i]); \ - __nonzero; \ -}) - /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. @@ -213,6 +197,7 @@ struct rcu_data { unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ -- cgit v1.2.3 From ab5e869c1f7aa30a1210f5e8a277758b0599609f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 11:07:23 -0700 Subject: rcu: Make rcu_nocb_wait_gp() check if GP already requested This commit makes rcu_nocb_wait_gp() check rdp->gp_seq_needed to see if the current CPU already knows about the needed grace period having already been requested. If so, it avoids acquiring the corresponding leaf rcu_node structure's ->lock, thus decreasing contention. This optimization is intended for cases where either multiple leader rcuo kthreads are running on the same CPU or these kthreads are running on a non-offloaded (e.g., housekeeping) CPU. Signed-off-by: Paul E. McKenney [ paulmck: Move lock release past "if" as suggested by Joel Fernandes. ] [ paulmck: Fix caching of furthest-future requested grace period. ] --- kernel/rcu/tree.c | 5 +++++ kernel/rcu/tree_plugin.h | 15 ++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1ede51690e4a..4826598867c3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1618,6 +1618,11 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: + /* Push furthest requested GP to leaf node and rcu_data structure. */ + if (ULONG_CMP_LT(c, rnp_root->gp_seq_needed)) { + rnp->gp_seq_needed = rnp_root->gp_seq_needed; + rdp->gp_seq_needed = rnp_root->gp_seq_needed; + } if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); return ret; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f4a88e3c388d..ca73931f7b30 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2104,12 +2104,17 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + local_irq_save(flags); c = rcu_seq_snap(&rdp->rsp->gp_seq); - needwake = rcu_start_this_gp(rnp, rdp, c); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(rdp->rsp); + if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + local_irq_restore(flags); + } else { + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. 
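
	/*
	 * Why the unlocked ->gp_seq_needed check above is safe (a reading of
	 * this commit, not text from the patch): ->gp_seq_needed records only
	 * grace periods that really were requested, and ->gpwrap guards
	 * against counter wrap.  A stale value can therefore only fail the
	 * comparison and fall through to the locked slow path, which
	 * re-checks everything; it can never cause a needed grace-period
	 * request to be skipped.
	 */
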
*/ + needwake = rcu_start_this_gp(rnp, rdp, c); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rdp->rsp); + } /* * Wait for the grace period. Do so interruptibly to avoid messing -- cgit v1.2.3 From 477351f7829d2268769c5d545511081555066529 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 12:54:11 -0700 Subject: rcu: Convert rcu_grace_period tracepoint to gp_seq This commit makes the rcu_grace_period tracepoint use gp_seq instead of ->gpnum or ->completed. It also introduces a "cpuofl-bgp" string to less obscurely indicate when a CPU has gone offline while a grace period is waiting on it. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 13 +++++++------ kernel/rcu/tree.c | 43 +++++++++++++++++++++---------------------- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5936aac357ab..cd229e82ec8b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -52,6 +52,7 @@ TRACE_EVENT(rcu_utilization, * "cpuqs": CPU passes through a quiescent state. * "cpuonl": CPU comes online. * "cpuofl": CPU goes offline. + * "cpuofl-bgp": CPU goes offline while blocking a grace period. * "reqwait": GP kthread sleeps waiting for grace-period request. * "reqwaitsig": GP kthread awakened by signal from reqwait state. * "fqswait": GP kthread waiting until time to force quiescent states. @@ -63,24 +64,24 @@ TRACE_EVENT(rcu_utilization, */ TRACE_EVENT(rcu_grace_period, - TP_PROTO(const char *rcuname, unsigned long gpnum, const char *gpevent), + TP_PROTO(const char *rcuname, unsigned long gp_seq, const char *gpevent), - TP_ARGS(rcuname, gpnum, gpevent), + TP_ARGS(rcuname, gp_seq, gpevent), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(const char *, gpevent) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->gpevent = gpevent; ), TP_printk("%s %lu %s", - __entry->rcuname, __entry->gpnum, __entry->gpevent) + __entry->rcuname, __entry->gp_seq, __entry->gpevent) ); /* @@ -753,7 +754,7 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ -#define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) +#define trace_rcu_grace_period(rcuname, gp_seq, gpevent) do { } while (0) #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ level, grplo, grphi, event) \ do { } while (0) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4826598867c3..7ce85ad39af6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -234,7 +234,7 @@ void rcu_sched_qs(void) if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), + __this_cpu_read(rcu_sched_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) @@ -249,7 +249,7 @@ void rcu_bh_qs(void) RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), - __this_cpu_read(rcu_bh_data.gpnum), + __this_cpu_read(rcu_bh_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } @@ -1615,7 +1615,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, trace_rcu_this_gp(rnp_root, 
rdp, c, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1702,9 +1702,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccReadyCB")); return ret; } @@ -1774,7 +1774,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * go looking for one. */ rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); @@ -1851,7 +1851,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_seq_start(&rsp->gp_seq); if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. */ pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); - trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); + trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -1928,7 +1928,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq_rcu_node(rnp); @@ -2048,7 +2048,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); rcu_seq_end(&rsp->gp_seq); - trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); + trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); @@ -2061,7 +2061,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); } else { WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); @@ -2087,7 +2087,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. 
*/ for (;;) { trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & @@ -2100,7 +2100,7 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("reqwaitsig")); } @@ -2119,7 +2119,7 @@ static int __noreturn rcu_gp_kthread(void *arg) jiffies + 3 * j); } trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout(rsp->gp_wq, @@ -2134,12 +2134,12 @@ static int __noreturn rcu_gp_kthread(void *arg) if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqsstart")); rcu_gp_fqs(rsp, first_gp_fqs); first_gp_fqs = false; trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqsend")); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); @@ -2158,7 +2158,7 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqswaitsig")); ret = 1; /* Keep old FQS timing. */ j = jiffies; @@ -2388,17 +2388,16 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask;) + RCU_TRACE(bool blkd;) RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask;) - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - !!(rnp->qsmask & mask), - TPS("cpuofl")); + RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + trace_rcu_grace_period(rsp->name, rnp->gp_seq, + blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); } /* @@ -3538,7 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ca73931f7b30..aca9d187c509 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -301,7 +301,7 @@ static void rcu_preempt_qs(void) RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), - __this_cpu_read(rcu_data_p->gpnum), + __this_cpu_read(rcu_data_p->gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ -- cgit v1.2.3 From abd13fdd9516e5baae2257721b921684ecb090d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:08:46 -0700 Subject: rcu: Convert rcu_future_grace_period tracepoint to gp_seq This commit makes the rcu_future_grace_period tracepoint use gp_seq instead of ->gpnum and ->completed. Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 22 +++++++++------------- kernel/rcu/tree.c | 7 +++---- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index cd229e82ec8b..286047d22314 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -103,16 +103,14 @@ TRACE_EVENT(rcu_grace_period, */ TRACE_EVENT(rcu_future_grace_period, - TP_PROTO(const char *rcuname, unsigned long gpnum, unsigned long completed, - unsigned long c, u8 level, int grplo, int grphi, - const char *gpevent), + TP_PROTO(const char *rcuname, unsigned long gp_seq, unsigned long c, + u8 level, int grplo, int grphi, const char *gpevent), - TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent), + TP_ARGS(rcuname, gp_seq, c, level, grplo, grphi, gpevent), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) - __field(unsigned long, completed) + __field(unsigned long, gp_seq) __field(unsigned long, c) __field(u8, level) __field(int, grplo) @@ -122,8 +120,7 @@ TRACE_EVENT(rcu_future_grace_period, TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; - __entry->completed = completed; + __entry->gp_seq = gp_seq; __entry->c = c; __entry->level = level; __entry->grplo = grplo; @@ -131,10 +128,9 @@ TRACE_EVENT(rcu_future_grace_period, __entry->gpevent = gpevent; ), - TP_printk("%s %lu %lu %lu %u %d %d %s", - __entry->rcuname, __entry->gpnum, __entry->completed, - __entry->c, __entry->level, __entry->grplo, __entry->grphi, - __entry->gpevent) + TP_printk("%s %lu %lu %u %d %d %s", + __entry->rcuname, __entry->gp_seq, __entry->c, __entry->level, + __entry->grplo, __entry->grphi, __entry->gpevent) ); /* @@ -755,7 +751,7 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gp_seq, gpevent) do { } while (0) -#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ +#define trace_rcu_future_grace_period(rcuname, gp_seq, c, \ level, grplo, grphi, event) \ do { } while (0) #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7ce85ad39af6..066dbaacec30 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1552,9 +1552,8 @@ void rcu_cpu_stall_reset(void) static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long c, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, c, + rnp->level, rnp->grplo, rnp->grphi, s); } /* @@ -2053,7 +2052,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { - trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; } -- cgit v1.2.3 From 598ce09480efb6b48799df60c66bac70bea5ef54 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_preempt_task tracepoint to ->gp_seq This commit makes the rcu_preempt_task tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 12 ++++++------ kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 892016ad0647..38dbd97d65a3 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -298,24 +298,24 @@ TRACE_EVENT(rcu_nocb_wake, */ TRACE_EVENT(rcu_preempt_task, - TP_PROTO(const char *rcuname, int pid, unsigned long gpnum), + TP_PROTO(const char *rcuname, int pid, unsigned long gp_seq), - TP_ARGS(rcuname, pid, gpnum), + TP_ARGS(rcuname, pid, gp_seq), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(int, pid) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->pid = pid; ), TP_printk("%s %lu %d", - __entry->rcuname, __entry->gpnum, __entry->pid) + __entry->rcuname, __entry->gp_seq, __entry->pid) ); /* @@ -761,7 +761,7 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_exp_funnel_lock(rcuname, level, grplo, grphi, gpevent) \ do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) -#define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) +#define trace_rcu_preempt_task(rcuname, pid, gp_seq) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ grplo, grphi, gp_tasks) do { } \ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index aca9d187c509..02ca3b4e6a8f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -350,8 +350,8 @@ static void rcu_preempt_note_context_switch(bool preempt) trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) - ? rnp->gpnum - : rnp->gpnum + 1); + ? rnp->gp_seq + : rcu_seq_snap(&rnp->gp_seq)); rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { -- cgit v1.2.3 From 865aa1e08d8aefdfd1f5d30ecfce1b8ef8cd520a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_unlock_preempted_task tracepoint to ->gp_seq This commit makes the rcu_unlock_preempted_task tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 12 ++++++------ kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 38dbd97d65a3..95b7491196aa 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -325,23 +325,23 @@ TRACE_EVENT(rcu_preempt_task, */ TRACE_EVENT(rcu_unlock_preempted_task, - TP_PROTO(const char *rcuname, unsigned long gpnum, int pid), + TP_PROTO(const char *rcuname, unsigned long gp_seq, int pid), - TP_ARGS(rcuname, gpnum, pid), + TP_ARGS(rcuname, gp_seq, pid), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(int, pid) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->pid = pid; ), - TP_printk("%s %lu %d", __entry->rcuname, __entry->gpnum, __entry->pid) + TP_printk("%s %lu %d", __entry->rcuname, __entry->gp_seq, __entry->pid) ); /* @@ -762,7 +762,7 @@ TRACE_EVENT(rcu_barrier, do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gp_seq) do { } while (0) -#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) +#define trace_rcu_unlock_preempted_task(rcuname, gp_seq, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ grplo, grphi, gp_tasks) do { } \ while (0) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ca3b4e6a8f..a10b0e26ce19 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -545,7 +545,7 @@ void rcu_read_unlock_special(struct task_struct *t) list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), - rnp->gpnum, t->pid); + rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) @@ -708,7 +708,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), - rnp->gpnum, t->pid); + rnp->gp_seq, t->pid); } WARN_ON_ONCE(rnp->qsmask); } -- cgit v1.2.3 From db023296f0115d2fe01fdabad54678f2b806da23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_quiescent_state_report tracepoint to ->gp_seq This commit makes the rcu_quiescent_state_report tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 12 ++++++------ kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 95b7491196aa..ac4d9d4a1ebf 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -354,15 +354,15 @@ TRACE_EVENT(rcu_unlock_preempted_task, */ TRACE_EVENT(rcu_quiescent_state_report, - TP_PROTO(const char *rcuname, unsigned long gpnum, + TP_PROTO(const char *rcuname, unsigned long gp_seq, unsigned long mask, unsigned long qsmask, u8 level, int grplo, int grphi, int gp_tasks), - TP_ARGS(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks), + TP_ARGS(rcuname, gp_seq, mask, qsmask, level, grplo, grphi, gp_tasks), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(unsigned long, mask) __field(unsigned long, qsmask) __field(u8, level) @@ -373,7 +373,7 @@ TRACE_EVENT(rcu_quiescent_state_report, TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->mask = mask; __entry->qsmask = qsmask; __entry->level = level; @@ -383,7 +383,7 @@ TRACE_EVENT(rcu_quiescent_state_report, ), TP_printk("%s %lu %lx>%lx %u %d %d %u", - __entry->rcuname, __entry->gpnum, + __entry->rcuname, __entry->gp_seq, __entry->mask, __entry->qsmask, __entry->level, __entry->grplo, __entry->grphi, __entry->gp_tasks) ); @@ -763,7 +763,7 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gp_seq) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gp_seq, pid) do { } while (0) -#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ +#define trace_rcu_quiescent_state_report(rcuname, gp_seq, mask, qsmask, level, \ grplo, grphi, gp_tasks) do { } \ while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 066dbaacec30..7c6c11d5479c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2229,7 +2229,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; - trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, + trace_rcu_quiescent_state_report(rsp->name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, !!rnp->gp_tasks); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a10b0e26ce19..c8a2c7760121 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -566,7 +566,7 @@ void rcu_read_unlock_special(struct task_struct *t) empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), - rnp->gpnum, + rnp->gp_seq, 0, rnp->qsmask, rnp->level, rnp->grplo, -- cgit v1.2.3 From fee5997c17562e95fb1fecc142efb2da0934baa4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_fqs tracepoint to ->gp_seq This commit makes the rcu_fqs tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 12 ++++++------ kernel/rcu/tree.c | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ac4d9d4a1ebf..7d3650cc9d30 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -398,26 +398,26 @@ TRACE_EVENT(rcu_quiescent_state_report, */ TRACE_EVENT(rcu_fqs, - TP_PROTO(const char *rcuname, unsigned long gpnum, int cpu, const char *qsevent), + TP_PROTO(const char *rcuname, unsigned long gp_seq, int cpu, const char *qsevent), - TP_ARGS(rcuname, gpnum, cpu, qsevent), + TP_ARGS(rcuname, gp_seq, cpu, qsevent), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(int, cpu) __field(const char *, qsevent) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->cpu = cpu; __entry->qsevent = qsevent; ), TP_printk("%s %lu %d %s", - __entry->rcuname, __entry->gpnum, + __entry->rcuname, __entry->gp_seq, __entry->cpu, __entry->qsevent) ); @@ -766,7 +766,7 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_quiescent_state_report(rcuname, gp_seq, mask, qsmask, level, \ grplo, grphi, gp_tasks) do { } \ while (0) -#define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) +#define trace_rcu_fqs(rcuname, gp_seq, cpu, qsevent) do { } while (0) #define trace_rcu_dyntick(polarity, oldnesting, newnesting, dyntick) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen_lazy, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen_lazy, qlen) \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c6c11d5479c..42e89d4f1e33 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1111,7 +1111,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } @@ -1161,7 +1161,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * of the current RCU grace period. */ if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; rcu_gpnum_ovf(rnp, rdp); return 1; @@ -1178,7 +1178,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("rqc")); rcu_gpnum_ovf(rnp, rdp); return 1; } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { @@ -1188,7 +1188,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) /* Check for the CPU being offline. */ if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; rcu_gpnum_ovf(rnp, rdp); return 1; -- cgit v1.2.3 From ff3bb6f4d06247508489345ee90a8a9b6f3ffd3b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 1 May 2018 14:34:08 -0700 Subject: rcu: Remove ->gpnum and ->completed Now that everything has been converted to use ->gp_seq instead of ->gpnum and ->completed, this commit removes ->gpnum and ->completed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 45 +++++++++++---------------------------------- kernel/rcu/tree.h | 14 +------------- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 13 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 42e89d4f1e33..0aeddc908181 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -95,8 +95,6 @@ struct rcu_state sname##_state = { \ .rda = &sname##_data, \ .call = cr, \ .gp_state = RCU_GP_IDLE, \ - .gpnum = 0UL - 300UL, \ - .completed = 0UL - 300UL, \ .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ @@ -1349,8 +1347,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; - WARN_ON_ONCE(gp_seq & 0x2); /* Remove when ->gpnum removed. */ - /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(rsp); if (rcu_cpu_stall_suppress) @@ -1582,8 +1578,6 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. */ raw_lockdep_assert_held_rcu_node(rnp); - WARN_ON_ONCE(c & 0x2); /* Catch any lingering use of ->gpnum. */ - WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq)); /* Catch any ->completed/->gp_seq mismatches. */ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) @@ -1757,8 +1751,6 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || unlikely(READ_ONCE(rdp->gpwrap))) { ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ - /* Remember that we saw this grace-period completion. */ - rdp->completed = rnp->completed; trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); } else { ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ @@ -1772,7 +1764,6 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; @@ -1843,13 +1834,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - /* Record GP times before starting GP, hence smp_store_release(). */ - WARN_ON_ONCE(rsp->gpnum << RCU_SEQ_CTR_SHIFT != rsp->gp_seq); - smp_store_release(&rsp->gpnum, rsp->gpnum + 1); - smp_mb(); /* Pairs with barriers in stall-warning code. */ + /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rsp->gp_seq); - if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. 
*/ - pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1920,9 +1906,6 @@ static bool rcu_gp_init(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - WRITE_ONCE(rnp->gpnum, rsp->gpnum); - if (WARN_ON_ONCE(rnp->completed != rsp->completed)) - WRITE_ONCE(rnp->completed, rsp->completed); WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); @@ -2012,13 +1995,13 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) raw_spin_unlock_irq_rcu_node(rnp); /* - * Propagate new ->completed value to rcu_node structures so - * that other CPUs don't have to wait until the start of the next - * grace period to process their callbacks. This also avoids - * some nasty RCU grace-period initialization races by forcing - * the end of the current grace period to be completely recorded in - * all of the rcu_node structures before the beginning of the next - * grace period is recorded in any of the rcu_node structures. + * Propagate new ->gp_seq value to rcu_node structures so that + * other CPUs don't have to wait until the start of the next grace + * period to process their callbacks. This also avoids some nasty + * RCU grace-period initialization races by forcing the end of + * the current grace period to be completely recorded in all of + * the rcu_node structures before the beginning of the next grace + * period is recorded in any of the rcu_node structures. */ new_gp_seq = rsp->gp_seq; rcu_seq_end(&new_gp_seq); @@ -2027,7 +2010,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); - WRITE_ONCE(rnp->completed, rsp->gpnum); WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) @@ -2045,7 +2027,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ /* Declare grace period done. */ - WRITE_ONCE(rsp->completed, rsp->gpnum); rcu_seq_end(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); rsp->gp_state = RCU_GP_IDLE; @@ -3496,9 +3477,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* * Initialize a CPU's per-CPU RCU data. Note that only one online or - * offline event can be happening at a given time. Note also that we - * can accept some slop in the rsp->completed access due to the fact - * that this CPU cannot possibly have any RCU callbacks in flight yet. + * offline event can be happening at a given time. Note also that we can + * accept some slop in the rsp->gp_seq access due to the fact that this + * CPU cannot possibly have any RCU callbacks in flight yet. */ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) @@ -3527,8 +3508,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rdp->beenonline = true; /* We have now been online. */ - rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. 
*/ - rdp->completed = rnp->completed; rdp->gp_seq = rnp->gp_seq; rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; @@ -3908,8 +3887,6 @@ static void __init rcu_init_one(struct rcu_state *rsp) raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, &rcu_fqs_class[i], fqs[i]); - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; rnp->gp_seq_needed = rsp->gp_seq; rnp->completedqs = rsp->gp_seq; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9329c1ff695f..3def94fc9c74 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -81,12 +81,6 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gpnum; /* Current grace period for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ - unsigned long completed; /* Last GP completed for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ unsigned long completedqs; /* All QSes done for this node. */ @@ -192,10 +186,6 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long completed; /* Track rsp->completed gp number */ - /* in order to detect GP end. */ - unsigned long gpnum; /* Highest gp number that this CPU */ - /* is aware of having started. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ @@ -203,7 +193,7 @@ struct rcu_data { union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool gpwrap; /* Possible gpnum/completed wrap. */ + bool gpwrap; /* Possible ->gp_seq wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -328,8 +318,6 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ - unsigned long gpnum; /* Current gp number. */ - unsigned long completed; /* # of last completed gp. */ unsigned long gp_seq; /* Grace-period sequence #. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8a2c7760121..3a6e04103de1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -690,7 +690,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock * must be held by the caller. * * Also, if there are blocked tasks on the list, they automatically -- cgit v1.2.3 From e44e73ca47b47510dac491329d453d82aea1d8d8 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 1 May 2018 16:29:47 -0700 Subject: rcu: Make simple callback acceleration refer to rdp->gp_seq_needed Now that the rcu_data structure contains ->gp_seq_needed, create an rcu_accelerate_cbs_unlocked() helper function that locklessly checks to see if new callbacks' required grace period has already been requested. If so, update the callback list locally and again locklessly. (Though interrupts must be and are disabled to avoid racing with conflicting updates in interrupt handlers.) Otherwise, call rcu_accelerate_cbs() as before. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0aeddc908181..5643c135fb06 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1701,6 +1701,34 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, return ret; } +/* + * Similar to rcu_accelerate_cbs(), but does not require that the leaf + * rcu_node structure's ->lock be held. It consults the cached value + * of ->gp_seq_needed in the rcu_data structure, and if that indicates + * that a new grace-period request be made, invokes rcu_accelerate_cbs() + * while holding the leaf rcu_node structure's ->lock. + */ +static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + bool needwake; + + lockdep_assert_irqs_disabled(); + c = rcu_seq_snap(&rsp->gp_seq); + if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + /* Old request still live, so mark recent callbacks. */ + (void)rcu_segcblist_accelerate(&rdp->cblist, c); + return; + } + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + if (needwake) + rcu_gp_kthread_wake(rsp); +} + /* * Move any callbacks whose grace period has completed to the * RCU_DONE_TAIL sublist, then compact the remaining sublists and @@ -2739,7 +2767,6 @@ static void __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; - bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); struct rcu_node *rnp = rdp->mynode; @@ -2752,15 +2779,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (!rcu_gp_in_progress(rsp) && rcu_segcblist_is_enabled(&rdp->cblist)) { local_irq_save(flags); - if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { - local_irq_restore(flags); - } else { - raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } + if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) + rcu_accelerate_cbs_unlocked(rsp, rnp, rdp); + local_irq_restore(flags); } rcu_check_gp_start_stall(rsp, rnp, rdp); @@ -2818,8 +2839,6 @@ static void invoke_rcu_core(void) static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_head *head, unsigned long flags) { - bool needwake; - /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2846,13 +2865,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. 
*/ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp = rdp->mynode; - - raw_spin_lock_rcu_node(rnp); - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock_rcu_node(rnp); - if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_accelerate_cbs_unlocked(rsp, rdp->mynode, rdp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; -- cgit v1.2.3 From a2165e416878b325747f871df4b236b49bf61486 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 12 May 2018 07:42:20 -0700 Subject: rcu: Don't funnel-lock above leaf node if GP in progress The old grace-period start code would acquire only the leaf's rcu_node structure's ->lock if that structure believed that a grace period was in progress. The new code advances to the leaf's parent in this case, needlessly acquiring the leaf's parent's ->lock. This commit therefore checks the grace-period state after marking the leaf with the need for the specified grace period, and if the leaf believes that a grace period is in progress, takes an early exit. Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney [ paulmck: Add "Startedleaf" tracing as suggested by Joel Fernandes. ] --- include/trace/events/rcu.h | 4 ++-- kernel/rcu/tree.c | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 7d3650cc9d30..5ab1df0a2801 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -91,8 +91,8 @@ TRACE_EVENT(rcu_grace_period, * * "Startleaf": Request a grace period based on leaf-node data. * "Prestarted": Someone beat us to the request - * "Startedleaf": Leaf-node start proved sufficient. - * "Startedleafroot": Leaf-node start proved sufficient after checking root. + * "Startedleaf": Leaf node marked for future GP. + * "Startedleafroot": All nodes from leaf to root marked for future GP. * "Startedroot": Requested a nocb grace period based on root-node data. * "NoGPkthread": The RCU grace-period kthread has not yet started. * "StartWait": Start waiting for the requested grace period. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5643c135fb06..24a79e85b81f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1590,6 +1590,15 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, goto unlock_out; } rnp_root->gp_seq_needed = c; + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { + /* + * We just marked the leaf, and a grace period + * is in progress, which means that rcu_gp_cleanup() + * will see the marking. Bail to reduce contention. + */ + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); + goto unlock_out; + } if (rnp_root != rnp && rnp_root->parent != NULL) raw_spin_unlock_rcu_node(rnp_root); if (!rnp_root->parent) -- cgit v1.2.3 From 5b55072f22ba2ed136b7a1b6c5beea9ace8415a7 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 13 May 2018 20:15:40 -0700 Subject: rcu: Produce last "CleanupMore" trace only if late-breaking request Currently Tree RCU's clean-up code emits a "CleanupMore" trace event in response to late-arriving grace-period requests even if the grace period was already requested. This makes "CleanupMore" show up an extra time (in addition to once for each rcu_node structure that was previously marked with the request), and for no good reason. This commit therefore avoids emitting this trace message unless the only request for this next grace period arrived during or after the cleanup scan of the rcu_node structures.
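For illustration only, since this and several nearby patches lean on it: ULONG_CMP_LT() is a wrap-safe comparison of unsigned long sequence numbers. A minimal stand-alone sketch follows, assuming the macro definitions match those in include/linux/rcupdate.h; the test harness around them is invented.

	#include <assert.h>
	#include <limits.h>

	/* Wrap-safe orderings for unsigned long sequence numbers. */
	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

	int main(void)
	{
		unsigned long old = ULONG_MAX - 1;	/* About to wrap. */
		unsigned long new = old + 3;		/* Wraps around to 1. */

		assert(!(old < new));		/* Plain "<" is fooled by the wrap... */
		assert(ULONG_CMP_LT(old, new));	/* ...the modular comparison is not. */
		assert(ULONG_CMP_GE(new, old));
		return 0;
	}

The trick is that the unsigned subtraction (a) - (b) lands in the lower half of the value range exactly when "a" is at or ahead of "b" modulo wraparound, provided the two values are less than half the range apart.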
Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 24a79e85b81f..73a33b82cfcd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2069,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); - if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { + if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; -- cgit v1.2.3 From 5ca0905f6787e930bc2a626cf1d8f69fab52acef Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 13 May 2018 20:15:41 -0700 Subject: rcu: Fix cpustart tracepoint gp_seq number The "cpustart" trace event shows a stale gp_seq. This is because it uses rdp->gp_seq, which is updated only at the end of the __note_gp_changes() function. This commit therefore instead uses rnp->gp_seq. An alternative fix would be to update rdp->gp_seq earlier, but this would break RCU's detection of the beginning of a new-to-this-CPU grace period. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 73a33b82cfcd..973250503d98 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1801,7 +1801,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); + trace_rcu_grace_period(rsp->name, rnp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); -- cgit v1.2.3 From 2e3e5e55010105f9d4351f68e15dbc43402a7794 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 11:53:41 -0700 Subject: rcu: Make rcu_start_this_gp() check for grace period already started In the old days of ->gpnum and ->completed, the code requesting a new grace period checked to see if that grace period had already started, bailing early if so. The new-age ->gp_seq approach instead checks whether the grace period has already finished. A compensating change pushed the requested grace period down to the bottom of the tree, thus reducing lock contention and even eliminating it in some cases. But why not further reduce contention, particularly on large systems, by doing both, given that the cost of doing both is extremely small? This commit therefore adds a new rcu_seq_started() function that checks whether a specified grace period has already started. It then uses this new function in place of rcu_seq_done() in the rcu_start_this_gp() function's funnel locking code. Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney
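For illustration only, not part of the patch below: the started/done distinction falls out of the ->gp_seq encoding, in which the low-order bits hold the grace-period phase and the upper bits count grace periods. Here is a stand-alone user-space sketch of that arithmetic; the helpers are meant to mirror those in kernel/rcu/rcu.h as I read them, with pointers, READ_ONCE(), and memory barriers dropped, and with the main() harness invented.

	#include <assert.h>
	#include <limits.h>

	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
	#define RCU_SEQ_CTR_SHIFT	2
	#define RCU_SEQ_STATE_MASK	((1 << RCU_SEQ_CTR_SHIFT) - 1)

	/* Snapshot: the ->gp_seq value at which a grace period starting
	 * after this call will have fully completed. */
	static unsigned long rcu_seq_snap(unsigned long s)
	{
		return (s + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
	}

	/* Has the grace period corresponding to snapshot "snap" started? */
	static int rcu_seq_started(unsigned long s, unsigned long snap)
	{
		return ULONG_CMP_LT((snap - 1) & ~RCU_SEQ_STATE_MASK, s);
	}

	/* Has it completed? */
	static int rcu_seq_done(unsigned long s, unsigned long snap)
	{
		return ULONG_CMP_GE(s, snap);
	}

	int main(void)
	{
		unsigned long gp_seq = 0;			/* Idle, no GP in flight. */
		unsigned long snap = rcu_seq_snap(gp_seq);	/* Request: snap == 4. */

		assert(!rcu_seq_started(gp_seq, snap) && !rcu_seq_done(gp_seq, snap));
		gp_seq++;				/* rcu_seq_start(): GP in progress. */
		assert(rcu_seq_started(gp_seq, snap) && !rcu_seq_done(gp_seq, snap));
		gp_seq = (gp_seq | RCU_SEQ_STATE_MASK) + 1; /* rcu_seq_end(): gp_seq == 4. */
		assert(rcu_seq_started(gp_seq, snap) && rcu_seq_done(gp_seq, snap));
		return 0;
	}

Note how the requested grace period is "started" as soon as the counter advances past the snapshot's base, but "done" only once the phase bits clear again, which is exactly the window this patch exploits to bail out earlier.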
--- kernel/rcu/rcu.h | 9 +++++++++ kernel/rcu/tree.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 003671825d62..1c5cbd9d7c97 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -107,6 +107,15 @@ static inline unsigned long rcu_seq_current(unsigned long *sp) return READ_ONCE(*sp); } +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not the + * corresponding update-side operation has started. + */ +static inline bool rcu_seq_started(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_LT((s - 1) & ~RCU_SEQ_STATE_MASK, READ_ONCE(*sp)); +} + /* * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 973250503d98..cbf2bcde5e60 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1583,7 +1583,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || - rcu_seq_done(&rnp_root->gp_seq, c) || + rcu_seq_started(&rnp_root->gp_seq, c) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); -- cgit v1.2.3 From d72193123c81ae6123d108b3be2096f3f13b25a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 15:24:41 -0700 Subject: rcutorture: Correctly handle grace-period sequence wrap The new ->gp_seq grace-period sequence numbers must be shifted down, which produces artifacts when these numbers wrap. This commit therefore enables rcutorture and rcuperf to handle grace-period sequence numbers even if they do wrap. It does this by allowing a special subtraction function to be specified, and this function subtracts before shifting. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 9 +++++++++ kernel/rcu/rcuperf.c | 18 ++++++++++++++++-- kernel/rcu/rcutorture.c | 19 +++++++++++++------ kernel/rcu/tree.c | 6 +++--- 4 files changed, 41 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 1c5cbd9d7c97..aa215d6355f8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -142,6 +142,15 @@ static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) new); } +/* + * Roughly how many full grace periods have elapsed between the collection + * of the two specified grace periods?
+ */ +static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) +{ + return (new - old) >> RCU_SEQ_CTR_SHIFT; +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 2b5a613afcf3..b080bc4a4f45 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -139,6 +139,7 @@ struct rcu_perf_ops { int (*readlock)(void); void (*readunlock)(int idx); unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); unsigned long (*exp_completed)(void); void (*async)(struct rcu_head *head, rcu_callback_t func); void (*gp_barrier)(void); @@ -179,6 +180,7 @@ static struct rcu_perf_ops rcu_ops = { .readlock = rcu_perf_read_lock, .readunlock = rcu_perf_read_unlock, .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, .async = call_rcu, .gp_barrier = rcu_barrier, @@ -208,6 +210,7 @@ static struct rcu_perf_ops rcu_bh_ops = { .readlock = rcu_bh_perf_read_lock, .readunlock = rcu_bh_perf_read_unlock, .get_gp_seq = rcu_bh_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_bh, .gp_barrier = rcu_barrier_bh, @@ -264,6 +267,7 @@ static struct rcu_perf_ops srcu_ops = { .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, .get_gp_seq = srcu_perf_completed, + .gp_diff = rcu_seq_diff, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -292,6 +296,7 @@ static struct rcu_perf_ops srcud_ops = { .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, .get_gp_seq = srcu_perf_completed, + .gp_diff = rcu_seq_diff, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -321,6 +326,7 @@ static struct rcu_perf_ops sched_ops = { .readlock = sched_perf_read_lock, .readunlock = sched_perf_read_unlock, .get_gp_seq = rcu_sched_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_sched, .gp_barrier = rcu_barrier_sched, @@ -348,6 +354,7 @@ static struct rcu_perf_ops tasks_ops = { .readlock = tasks_perf_read_lock, .readunlock = tasks_perf_read_unlock, .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, .async = call_rcu_tasks, .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, @@ -355,6 +362,13 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; +static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; @@ -577,8 +591,8 @@ rcu_perf_cleanup(void) t_rcu_perf_writer_finished - t_rcu_perf_writer_started, ngps, - b_rcu_perf_writer_finished - - b_rcu_perf_writer_started); + rcuperf_seq_diff(b_rcu_perf_writer_finished, + b_rcu_perf_writer_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 81fb43530d64..0481c7286875 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -265,6 +265,7 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct 
rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -400,6 +401,7 @@ static struct rcu_torture_ops rcu_ops = { .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -441,6 +443,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .get_gp_seq = rcu_bh_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -646,6 +649,7 @@ static struct rcu_torture_ops sched_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, .get_gp_seq = rcu_sched_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -695,6 +699,13 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; @@ -1127,9 +1138,7 @@ static void rcu_torture_timer(struct timer_list *unused) rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed - started; - if (completed > ULONG_MAX >> 1) - completed = 0; /* Not all gp_seq have full range. */ + completed = rcutorture_seq_diff(completed, started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1205,9 +1214,7 @@ rcu_torture_reader(void *arg) rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed - started; - if (completed > ULONG_MAX >> 1) - completed = 0; /* Not all gp_seq have full range. */ + completed = rcutorture_seq_diff(completed, started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cbf2bcde5e60..fa219eea0ae7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -532,7 +532,7 @@ static int rcu_pending(void); */ unsigned long rcu_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_state_p->gp_seq)); + return READ_ONCE(rcu_state_p->gp_seq); } EXPORT_SYMBOL_GPL(rcu_get_gp_seq); @@ -541,7 +541,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_seq); */ unsigned long rcu_sched_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_sched_state.gp_seq)); + return READ_ONCE(rcu_sched_state.gp_seq); } EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); @@ -550,7 +550,7 @@ EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); */ unsigned long rcu_bh_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_bh_state.gp_seq)); + return READ_ONCE(rcu_bh_state.gp_seq); } EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); -- cgit v1.2.3 From 3d18469a2bb3988e669d67e097eff42dd40663d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 16:47:30 -0700 Subject: rcu: Regularize resetting of rcu_data wrap indicator The rcu_data structure's ->gpwrap indicator is currently reset only when the CPU in question detects a new grace period. 
This is in theory sufficient because any CPU that has been out of action for long enough that its ->gpwrap indicator is set is guaranteed to see both the end of an old grace period and the start of a new one. However, the current code leaves a short window during which the ->gpwrap indicator has been reset but the corresponding ->gp_seq counter has not yet been brought up to date. This is harmless because interrupts are disabled, but it is likely to (at the very least) cause confusion. This commit therefore moves the resetting of ->gpwrap to follow the updating of ->gp_seq. While in the area, it also resets ->gp_seq_needed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fa219eea0ae7..161e8fb8b83f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1807,10 +1807,12 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); - WRITE_ONCE(rdp->gpwrap, false); - rcu_gpnum_ovf(rnp, rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ + if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) + rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rdp->gpwrap, false); + rcu_gpnum_ovf(rnp, rdp); return ret; } -- cgit v1.2.3 From b73de91d6a4c97ed586b2a5a6ce7c6fe395d9a3b Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 20 May 2018 21:42:18 -0700 Subject: rcu: Rename the grace-period-request variables and parameters The name 'c' is used for variables and parameters holding the requested grace-period sequence number. However, it is no longer very meaningful given the conversions from ->gpnum and (especially) ->completed to ->gp_seq. This commit therefore renames 'c' to 'gp_seq_req'. Previous patch discussion is at: https://patchwork.kernel.org/patch/10396579/ Signed-off-by: Joel Fernandes Signed-off-by: Paul E.
McKenney --- include/trace/events/rcu.h | 15 ++++++++------- kernel/rcu/tree.c | 46 +++++++++++++++++++++++++++------------------- 2 files changed, 35 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5ab1df0a2801..759e7e83733d 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -103,15 +103,16 @@ TRACE_EVENT(rcu_grace_period, */ TRACE_EVENT(rcu_future_grace_period, - TP_PROTO(const char *rcuname, unsigned long gp_seq, unsigned long c, - u8 level, int grplo, int grphi, const char *gpevent), + TP_PROTO(const char *rcuname, unsigned long gp_seq, + unsigned long gp_seq_req, u8 level, int grplo, int grphi, + const char *gpevent), - TP_ARGS(rcuname, gp_seq, c, level, grplo, grphi, gpevent), + TP_ARGS(rcuname, gp_seq, gp_seq_req, level, grplo, grphi, gpevent), TP_STRUCT__entry( __field(const char *, rcuname) __field(unsigned long, gp_seq) - __field(unsigned long, c) + __field(unsigned long, gp_seq_req) __field(u8, level) __field(int, grplo) __field(int, grphi) @@ -121,7 +122,7 @@ TRACE_EVENT(rcu_future_grace_period, TP_fast_assign( __entry->rcuname = rcuname; __entry->gp_seq = gp_seq; - __entry->c = c; + __entry->gp_seq_req = gp_seq_req; __entry->level = level; __entry->grplo = grplo; __entry->grphi = grphi; @@ -129,7 +130,7 @@ TRACE_EVENT(rcu_future_grace_period, ), TP_printk("%s %lu %lu %u %d %d %s", - __entry->rcuname, __entry->gp_seq, __entry->c, __entry->level, + __entry->rcuname, __entry->gp_seq, __entry->gp_seq_req, __entry->level, __entry->grplo, __entry->grphi, __entry->gpevent) ); @@ -751,7 +752,7 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gp_seq, gpevent) do { } while (0) -#define trace_rcu_future_grace_period(rcuname, gp_seq, c, \ +#define trace_rcu_future_grace_period(rcuname, gp_seq, gp_seq_req, \ level, grplo, grphi, event) \ do { } while (0) #define trace_rcu_grace_period_init(rcuname, gp_seq, level, grplo, grphi, \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 161e8fb8b83f..8c31b1740afc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1546,13 +1546,18 @@ void rcu_cpu_stall_reset(void) /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) + unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, c, + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, gp_seq_req, rnp->level, rnp->grplo, rnp->grphi, s); } /* + * rcu_start_this_gp - Request the start of a particular grace period + * @rnp: The leaf node of the CPU from which to start. + * @rdp: The rcu_data corresponding to the CPU from which to start. + * @gp_seq_req: The gp_seq of the grace period to start. + * * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each * rcu_node structure's ->gp_seq_needed field. Returns true if there @@ -1560,9 +1565,11 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * * The caller must hold the specified rcu_node structure's ->lock, which * is why the caller is responsible for waking the grace-period kthread. + * + * Returns true if the GP thread needs to be awakened else false. 
*/ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c) + unsigned long gp_seq_req) { bool ret = false; struct rcu_state *rsp = rdp->rsp; @@ -1578,25 +1585,27 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. */ raw_lockdep_assert_held_rcu_node(rnp); - trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || - rcu_seq_started(&rnp_root->gp_seq, c) || + if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req) || + rcu_seq_started(&rnp_root->gp_seq, gp_seq_req) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, + TPS("Prestarted")); goto unlock_out; } - rnp_root->gp_seq_needed = c; + rnp_root->gp_seq_needed = gp_seq_req; if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* * We just marked the leaf, and a grace period * is in progress, which means that rcu_gp_cleanup() * will see the marking. Bail to reduce contention. */ - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, + TPS("Startedleaf")); goto unlock_out; } if (rnp_root != rnp && rnp_root->parent != NULL) @@ -1607,21 +1616,21 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* If GP already in progress, just leave, otherwise start one. */ if (rcu_gp_in_progress(rsp)) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ - if (ULONG_CMP_LT(c, rnp_root->gp_seq_needed)) { + if (ULONG_CMP_LT(gp_seq_req, rnp_root->gp_seq_needed)) { rnp->gp_seq_needed = rnp_root->gp_seq_needed; rdp->gp_seq_needed = rnp_root->gp_seq_needed; } @@ -1636,14 +1645,13 @@ unlock_out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - unsigned long c = rnp->gp_seq; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); if (!needmore) rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ - trace_rcu_this_gp(rnp, rdp, c, + trace_rcu_this_gp(rnp, rdp, rnp->gp_seq, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1679,7 +1687,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - unsigned long c; + unsigned long gp_seq_req; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1698,9 +1706,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. 
*/ - c = rcu_seq_snap(&rsp->gp_seq); - if (rcu_segcblist_accelerate(&rdp->cblist, c)) - ret = rcu_start_this_gp(rnp, rdp, c); + gp_seq_req = rcu_seq_snap(&rsp->gp_seq); + if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req)) + ret = rcu_start_this_gp(rnp, rdp, gp_seq_req); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) -- cgit v1.2.3 From df2bf8f7f776cef57e6b27690c7b78c86f259515 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:14 -0700 Subject: rcu: Use better variable names in funnel locking loop The funnel locking loop in rcu_start_this_gp() uses rnp_root as a temporary variable while walking the combining tree. This forces readers of the code to keep reminding themselves that rnp_root may not actually be the root. Let's just call it rnp, and rename other variables as well to be more appropriate. Original patch: https://patchwork.kernel.org/patch/10396577/ Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney [ paulmck: Fix name in comment as well. ] --- kernel/rcu/tree.c | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8c31b1740afc..1af58f4b8a25 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1554,7 +1554,7 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * rcu_start_this_gp - Request the start of a particular grace period - * @rnp: The leaf node of the CPU from which to start. + * @rnp_start: The leaf node of the CPU from which to start. * @rdp: The rcu_data corresponding to the CPU from which to start. * @gp_seq_req: The gp_seq of the grace period to start. * @@ -1568,74 +1568,74 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * * Returns true if the GP thread needs to be awakened else false. */ -static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, +static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, unsigned long gp_seq_req) { bool ret = false; struct rcu_state *rsp = rdp->rsp; - struct rcu_node *rnp_root; + struct rcu_node *rnp; /* * Use funnel locking to either acquire the root rcu_node * structure's lock or bail out if the need for this grace period - * has already been recorded -- or has already started. If there - * is already a grace period in progress in a non-leaf node, no - * recording is needed because the end of the grace period will - * scan the leaf rcu_node structures. Note that rnp->lock must - * not be released. + * has already been recorded -- or if that grace period has in + * fact already started. If there is already a grace period in + * progress in a non-leaf node, no recording is needed because the + * end of the grace period will scan the leaf rcu_node structures. + * Note that rnp_start->lock must not be released.
*/ - raw_lockdep_assert_held_rcu_node(rnp); - trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startleaf")); - for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); - if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req) || - rcu_seq_started(&rnp_root->gp_seq, gp_seq_req) || - (rnp != rnp_root && - rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, + raw_lockdep_assert_held_rcu_node(rnp_start); + trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf")); + for (rnp = rnp_start; 1; rnp = rnp->parent) { + if (rnp != rnp_start) + raw_spin_lock_rcu_node(rnp); + if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) || + rcu_seq_started(&rnp->gp_seq, gp_seq_req) || + (rnp != rnp_start && + rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) { + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Prestarted")); goto unlock_out; } - rnp_root->gp_seq_needed = gp_seq_req; - if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { + rnp->gp_seq_needed = gp_seq_req; + if (rcu_seq_state(rcu_seq_current(&rnp_start->gp_seq))) { /* * We just marked the leaf, and a grace period * is in progress, which means that rcu_gp_cleanup() * will see the marking. Bail to reduce contention. */ - trace_rcu_this_gp(rnp, rdp, gp_seq_req, + trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startedleaf")); goto unlock_out; } - if (rnp_root != rnp && rnp_root->parent != NULL) - raw_spin_unlock_rcu_node(rnp_root); - if (!rnp_root->parent) + if (rnp != rnp_start && rnp->parent != NULL) + raw_spin_unlock_rcu_node(rnp); + if (!rnp->parent) break; /* At root, and perhaps also leaf. */ } /* If GP already in progress, just leave, otherwise start one. */ if (rcu_gp_in_progress(rsp)) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedleafroot")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedroot")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("NoGPkthread")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ - if (ULONG_CMP_LT(gp_seq_req, rnp_root->gp_seq_needed)) { - rnp->gp_seq_needed = rnp_root->gp_seq_needed; - rdp->gp_seq_needed = rnp_root->gp_seq_needed; + if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { + rnp_start->gp_seq_needed = rnp->gp_seq_needed; + rdp->gp_seq_needed = rnp->gp_seq_needed; } - if (rnp != rnp_root) - raw_spin_unlock_rcu_node(rnp_root); + if (rnp != rnp_start) + raw_spin_unlock_rcu_node(rnp); return ret; } -- cgit v1.2.3 From 226ca5e76692e2c82c17e8e8eedab22043f6ffee Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:15 -0700 Subject: rcu: Identify grace period is in progress as we advance up the tree There's no need to keep checking the same starting node for whether a grace period is in progress as we advance up the funnel lock loop. It's sufficient to check it at the start and then check the internal nodes as we advance up the combining tree.
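For reference, the in-progress test used at each level of this loop relies on the state bits encoded into the low-order bits of each ->gp_seq value. A minimal sketch of the helpers involved, modeled on kernel/rcu/rcu.h of this era and shown here for illustration only (not part of this patch):

	#define RCU_SEQ_CTR_SHIFT	2
	#define RCU_SEQ_STATE_MASK	((1 << RCU_SEQ_CTR_SHIFT) - 1)

	/* Extract the state bits: nonzero means a grace period is in progress. */
	static inline unsigned long rcu_seq_state(unsigned long s)
	{
		return s & RCU_SEQ_STATE_MASK;
	}

	/* Fetch the current ->gp_seq value without load tearing. */
	static inline unsigned long rcu_seq_current(unsigned long *sp)
	{
		return READ_ONCE(*sp);
	}

Thus a nonzero rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) at any node indicates a grace period in progress as seen by that node.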
This also makes sense because the grace-period updates propagate from the root to the leaf, so there's a chance we may find a grace period has started as we advance up, so let's check for that as well. Reported-by: Paul McKenney Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1af58f4b8a25..a6863b813f0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1598,11 +1598,12 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, goto unlock_out; } rnp->gp_seq_needed = gp_seq_req; - if (rcu_seq_state(rcu_seq_current(&rnp_start->gp_seq))) { + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* - * We just marked the leaf, and a grace period - * is in progress, which means that rcu_gp_cleanup() - * will see the marking. Bail to reduce contention. + * We just marked the leaf or internal node, and a + * grace period is in progress, which means that + * rcu_gp_cleanup() will see the marking. Bail to + * reduce contention. */ trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startedleaf")); -- cgit v1.2.3 From 962aff03c315b508d980422db5b49b49e4382119 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 12:49:21 -0700 Subject: rcu: Clean up handling of tasks blocked across full-rcu_node offline Commit 0aa04b055e71 ("rcu: Process offlining and onlining only at grace-period start") deferred handling of CPU-hotplug events until the start of the next grace period, but consider the following sequence of events: 1. A task is preempted within an RCU-preempt read-side critical section. 2. The CPU that this task was running on goes offline, along with all other CPUs sharing the corresponding leaf rcu_node structure. 3. The task resumes execution. 4. One of those CPUs comes back online before a new grace period starts. In step 2, the code in the next rcu_gp_init() invocation will (correctly) defer removing the leaf rcu_node structure from the upper-level bitmasks, and will (correctly) set that structure's ->wait_blkd_tasks field. During the ensuing interval, RCU will (correctly) track the tasks preempted on that structure because they must block any subsequent grace period. In step 3, the code in rcu_read_unlock_special() will (correctly) remove the task from the leaf rcu_node structure. From this point forward, RCU need not pay attention to this structure, at least not until one of the corresponding CPUs comes back online. In step 4, the code in the next rcu_gp_init() invocation will (incorrectly) invoke rcu_init_new_rnp(). This is incorrect because the corresponding rcu_cleanup_dead_rnp() was never invoked. This is nevertheless harmless because the upper-level bits are still set. So, no harm, no foul, right? At least, all is well until a little further into the rcu_gp_init() invocation, which will notice that there are no longer any tasks blocked on the leaf rcu_node structure, conclude that there is no longer anything left over from step 2's offline operation, and will therefore invoke rcu_cleanup_dead_rnp(). But this invocation of rcu_cleanup_dead_rnp() is for the beginning of the earlier offline interval, and the previous invocation of rcu_init_new_rnp() is for the end of that same interval. That is right, they are invoked out of order. That cannot be good, can it? It turns out that this is not a (correctness!)
problem because rcu_cleanup_dead_rnp() checks to see if any of the corresponding CPUs are online, and refuses to do anything if so. In other words, in the case where rcu_init_new_rnp() and rcu_cleanup_dead_rnp() execute out of order, they both have no effect. But this is at best an accident waiting to happen. This commit therefore adds logic to rcu_gp_init() so that rcu_init_new_rnp() and rcu_cleanup_dead_rnp() are always invoked in order, and so that neither are invoked at all in cases where RCU had to pay attention to the leaf rcu_node structure during the entire time that all corresponding CPUs were offline. And, while in the area, this commit reduces confusion by using formal parameters rather than local variables that just happen to have the same value at that particular point in the code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a6863b813f0c..9a5ba6db7b60 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1909,12 +1909,14 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ if (!oldmask != !rnp->qsmaskinit) { - if (!oldmask) /* First online CPU for this rcu_node. */ - rcu_init_new_rnp(rnp); - else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ - rnp->wait_blkd_tasks = true; - else /* Last offline CPU and can propagate. */ + if (!oldmask) { /* First online CPU for rcu_node. */ + if (!rnp->wait_blkd_tasks) /* Ever offline? */ + rcu_init_new_rnp(rnp); + } else if (rcu_preempt_has_tasks(rnp)) { + rnp->wait_blkd_tasks = true; /* blocked tasks */ + } else { /* Last offline CPU and can propagate. */ rcu_cleanup_dead_rnp(rnp); + } } /* @@ -1923,14 +1925,13 @@ static bool rcu_gp_init(struct rcu_state *rsp) * still offline, propagate up the rcu_node tree and * clear ->wait_blkd_tasks. Otherwise, if one of this * rcu_node structure's CPUs has since come back online, - * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() - * checks for this, so just call it unconditionally). + * simply clear ->wait_blkd_tasks. */ if (rnp->wait_blkd_tasks && - (!rcu_preempt_has_tasks(rnp) || - rnp->qsmaskinit)) { + (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) { rnp->wait_blkd_tasks = false; - rcu_cleanup_dead_rnp(rnp); + if (!rnp->qsmaskinit) + rcu_cleanup_dead_rnp(rnp); } raw_spin_unlock_irq_rcu_node(rnp); @@ -2450,9 +2451,10 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - raw_lockdep_assert_held_rcu_node(rnp); + raw_lockdep_assert_held_rcu_node(rnp_leaf); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + WARN_ON_ONCE(rnp_leaf->qsmaskinit) || + WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) return; for (;;) { mask = rnp->grpmask; @@ -2461,7 +2463,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) break; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; - rnp->qsmask &= ~mask; + /* Between grace periods, so better already be zero! */ + WARN_ON_ONCE(rnp->qsmask); if (rnp->qsmaskinit) { raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ @@ -3479,6 +3482,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) struct rcu_node *rnp = rnp_leaf; raw_lockdep_assert_held_rcu_node(rnp); + WARN_ON_ONCE(rnp->wait_blkd_tasks); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; -- cgit v1.2.3 From c50cbe535c972150c2caf923239ef77e85c5ad60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 13:51:57 -0700 Subject: rcu: Fix an obsolete ->qsmaskinit comment Back in the old days, when grace-period initialization blocked CPU hotplug, the ->qsmaskinit mask was indeed updated at the time that a given CPU went offline. However, with the deferral of these updates until the beginning of the next grace period in commit 0aa04b055e71 ("rcu: Process offlining and onlining only at grace-period start"), it is instead ->qsmaskinitnext that gets updated at that time. This commit therefore updates the obsolete comment. It also fixes punctuation while on the topic of comments mentioning ->qsmaskinit. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a5ba6db7b60..05f69b787a57 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2438,7 +2438,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * This function therefore goes up the tree of rcu_node structures, * clearing the corresponding bits in the ->qsmaskinit fields. Note that * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated + * updated. * * This function does check that the specified rcu_node structure has * all CPUs offline and no blocked tasks, so it is OK to invoke it @@ -3709,7 +3709,7 @@ void rcu_cpu_starting(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU /* * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit + * function. We now remove it from the rcu_node tree's ->qsmaskinitnext * bit masks. */ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -- cgit v1.2.3 From 8d672fa6bf68ffc36a0c5e4868499f86bbea2308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 14:46:43 -0700 Subject: rcu: Make rcu_init_new_rnp() stop upon already-set bit Currently, rcu_init_new_rnp() walks up the rcu_node combining tree, setting bits in the ->qsmaskinit fields on the way up. It walks up unconditionally, regardless of the initial state of these bits. This is OK because only the corresponding RCU grace-period kthread ever tests or sets these bits during runtime. However, it is also pointless, and it increases both memory and lock contention (albeit only slightly), so this commit stops the walk as soon as an already-set bit is encountered. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 05f69b787a57..3fe854a15d82 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3479,9 +3479,10 @@ EXPORT_SYMBOL_GPL(rcu_barrier_sched); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) { long mask; + long oldmask; struct rcu_node *rnp = rnp_leaf; - raw_lockdep_assert_held_rcu_node(rnp); + raw_lockdep_assert_held_rcu_node(rnp_leaf); WARN_ON_ONCE(rnp->wait_blkd_tasks); for (;;) { mask = rnp->grpmask; @@ -3489,8 +3490,11 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) if (rnp == NULL) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ + oldmask = rnp->qsmaskinit; rnp->qsmaskinit |= mask; raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ + if (oldmask) + return; } } -- cgit v1.2.3 From c74859d1eb2d8578bdf6d78ba893e394085aba1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 14:05:27 -0700 Subject: rcu: Make rcu_report_unblock_qs_rnp() warn on violated preconditions If rcu_report_unblock_qs_rnp() is invoked on something other than preemptible RCU or if there are still preempted tasks blocking the current grace period, something went badly wrong in the caller. This commit therefore adds WARN_ON_ONCE() to these conditions, while leaving the legitimate reason for early exit (rnp->qsmask != 0) unwarned. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3fe854a15d82..85417584ffd0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2308,8 +2308,10 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || - rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + if (WARN_ON_ONCE(rcu_state_p == &rcu_sched_state) || + WARN_ON_ONCE(rsp != rcu_state_p) || + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || + rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } -- cgit v1.2.3 From 77cfc7bf24ba0ba37be54e224007847d485a860f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 15:00:10 -0700 Subject: rcu: Fix typo and add additional debug This commit fixes a typo and adds some additional debugging to the message emitted when a task blocking the current grace period is listed as blocking it when either that grace period ends or the next grace period begins. This commit also reformats the console message for readability. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 85417584ffd0..a4277c1087d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2316,6 +2316,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; /* Still need more quiescent states!
*/ } + rnp->completedqs = rnp->gp_seq; rnp_p = rnp->parent; if (rnp_p == NULL) { /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3a6e04103de1..677b0c9f548d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -856,8 +856,14 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) struct list_head *lhp; raw_lockdep_assert_held_rcu_node(rnp); - pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); - pr_cont("\t->blkd_tasks"); + pr_info("%s: grp: %d-%d level: %d ->gp_seq %#lx ->completedqs %#lx\n", + __func__, rnp->grplo, rnp->grphi, rnp->level, + rnp->gp_seq, rnp->completedqs); + pr_info("%s: ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", + __func__, rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext); + pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", + __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); -- cgit v1.2.3 From 91f63ced7dc4e80acd13386204327d5de00a672d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 15:05:45 -0700 Subject: rcu: Replace smp_wmb() with smp_store_release() for stall check This commit gets rid of the smp_wmb() in record_gp_stall_check_time() in favor of an smp_store_release(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4277c1087d9..439228b79811 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1246,9 +1246,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) unsigned long j1; rsp->gp_start = j; - smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - WRITE_ONCE(rsp->jiffies_stall, j + j1); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rsp->jiffies_stall, j + j1); /* ^^^ */ rsp->jiffies_resched = j + j1 / 2; rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); } -- cgit v1.2.3 From 928164351e700f91ab588f20fe470cac9db477a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 11:07:02 -0700 Subject: rcu: Prevent useless FQS scan after all CPUs have checked in The force_qs_rnp() function checks for ->qsmask being all zero, that is, all CPUs for the current rcu_node structure having already passed through quiescent states. But with RCU-preempt, this is not sufficient to report quiescent states further up the tree, so there are further checks that can initiate RCU priority boosting and also check for races with CPU-hotplug operations. However, if neither of these further checks applies, the code proceeds to carry out a useless scan of an all-zero ->qsmask. This commit therefore adds code to release the current rcu_node structure's lock and continue on to the next rcu_node structure, thereby avoiding this useless scan. Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 439228b79811..3efd591fcd15 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2672,6 +2672,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) /* rcu_report_unblock_qs_rnp() rlses ->lock */ continue; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; } for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); -- cgit v1.2.3 From 5554788e1d4253a92b794a9006b7ae2c10be52af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 10:14:34 -0700 Subject: rcu: Suppress false-positive offline-CPU lockdep-RCU splat The rcu_lockdep_current_cpu_online() function currently checks only the RCU-sched data structures to determine whether or not RCU believes that a given CPU is offline. Unfortunately, there are multiple flavors of RCU, which means that there is a short window of time during which the various flavors disagree as to whether or not a given CPU is offline. This can result in false-positive lockdep-RCU splats in which some other flavor of RCU tries to do something based on its view that the CPU is online, only to get hit with a lockdep-RCU splat because RCU-sched instead believes that the CPU is offline. This commit therefore changes rcu_lockdep_current_cpu_online() to scan all RCU flavors and to consider a given CPU to be online if any of the RCU flavors believe it to be online, thus preventing these false-positive splats. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3efd591fcd15..a7773fc72b0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1030,41 +1030,41 @@ void rcu_request_urgent_qs_task(struct task_struct *t) #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* - * Is the current CPU online? Disable preemption to avoid false positives - * that could otherwise happen due to the current CPU number being sampled, - * this task being preempted, its old CPU being taken offline, resuming - * on some other CPU, then determining that its old CPU is now offline. - * It is OK to use RCU on an offline processor during initial boot, hence - * the check for rcu_scheduler_fully_active. Note also that it is OK - * for a CPU coming online to use RCU for one jiffy prior to marking itself - * online in the cpu_online_mask. Similarly, it is OK for a CPU going - * offline to continue to use RCU for one jiffy after marking itself - * offline in the cpu_online_mask. This leniency is necessary given the - * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the teardown - * of the CPU. + * Is the current CPU online as far as RCU is concerned? * - * This is also why RCU internally marks CPUs online during in the - * preparation phase and offline after the CPU has been taken down. + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. 
Because there are + * multiple flavors of RCU, and because this function can be called in the + * midst of updating the flavors while a given CPU is coming online or going + * offline, it is necessary to check all flavors. If any of the flavors + * believe that the given CPU is online, it is considered to be online. * - * Disable checking if in an NMI handler because we cannot safely report - * errors from NMI handlers anyway. + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway. In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. */ bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; struct rcu_node *rnp; - bool ret; + struct rcu_state *rsp; - if (in_nmi()) + if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable(); - rdp = this_cpu_ptr(&rcu_sched_data); - rnp = rdp->mynode; - ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || - !rcu_scheduler_fully_active; + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) { + preempt_enable(); + return true; + } + } preempt_enable(); - return ret; + return false; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); -- cgit v1.2.3 From fece27760ff57a0ca16c37e144f6a58a2f1851e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 20:04:12 -0700 Subject: rcu: Suppress false-positive preempted-task splats Consider the following sequence of events in a PREEMPT=y kernel: 1. All CPUs corresponding to a given rcu_node structure go offline. A new grace period starts just after the CPU-hotplug code path does its synchronize_rcu() for the last CPU, so at least this CPU is present in that structure's ->qsmask. 2. Before the grace period ends, a CPU comes back online, and not just any CPU, but the one corresponding to a non-zero bit in the leaf rcu_node structure's ->qsmask. 3. A task running on the newly onlined CPU is preempted while in an RCU read-side critical section. Because this CPU's ->qsmask bit is set, not only does this task queue itself on the leaf rcu_node structure's ->blkd_tasks list, it also sets that structure's ->gp_tasks pointer to reference it. 4. The grace period started in #1 above comes to an end. This results in rcu_gp_cleanup() being invoked, which, among other things, checks to make sure that there are no tasks blocking the just-ended grace period, that is, that all ->gp_tasks pointers are NULL. The ->gp_tasks pointer corresponding to the task preempted in #3 above is non-NULL, which results in a splat. This splat is a false positive. The task's RCU read-side critical section cannot have begun before the just-ended grace period because this would mean either: (1) The CPU came online before the grace period started, which cannot have happened because the grace period started before that CPU was all the way offline, or (2) The task started its RCU read-side critical section on some other CPU, but then it would have had to have been preempted before migrating to this CPU, which would mean that it would have instead queued itself on that other CPU's rcu_node structure. This commit eliminates this false positive by adding code to the end of rcu_cleanup_dying_idle_cpu() that reports a quiescent state to RCU, which has the side-effect of clearing that CPU's ->qsmask bit, preventing the above scenario.
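In outline, the resulting ordering in rcu_cleanup_dying_idle_cpu() is as follows, in a simplified sketch of the change shown in the diff below (note that rcu_report_qs_rnp() releases the rcu_node structure's ->lock, hence the re-acquisition):

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	if (rnp->qsmask & mask) {	/* Is RCU still waiting on the outgoing CPU? */
		/* Report the quiescent state while RCU still sees the CPU online... */
		rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags);
		raw_spin_lock_irqsave_rcu_node(rnp, flags); /* ...which dropped ->lock. */
	}
	rnp->qsmaskinitnext &= ~mask;	/* ...and only then mark the CPU offline. */
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);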
This approach has the added benefit of more promptly reporting quiescent states corresponding to offline CPUs. Note well that the call to rcu_report_qs_rnp() reporting the quiescent state must come -before- the clearing of this CPU's bit in the leaf rcu_node structure's ->qsmaskinitnext field. Otherwise, lockdep-RCU will complain bitterly about quiescent states coming from an offline CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a7773fc72b0c..72adf97458e3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3731,6 +3731,11 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ + /* Report quiescent state -before- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -- cgit v1.2.3 From 99990da1b3c00f6c05a330e06a76b9dbc8416d7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 09:45:00 -0700 Subject: rcu: Suppress more involved false-positive preempted-task splats Consider the following sequence of events in a PREEMPT=y kernel: 1. All but one of the CPUs corresponding to a given leaf rcu_node structure go offline. Each of these CPUs clears its bit in that structure's ->qsmaskinitnext field. 2. A new grace period starts, and rcu_gp_init() scans the leaf rcu_node structures, applying CPU-hotplug changes since the start of the previous grace period, including those changes in #1 above. This copies each leaf structure's ->qsmaskinitnext to its ->qsmask field, which represents the CPUs that this new grace period will wait on. Each copy operation is done holding the corresponding leaf rcu_node structure's ->lock, and at the end of this scan, rcu_gp_init() holds no locks. 3. The last CPU corresponding to #1's leaf rcu_node structure goes offline, clearing its bit in that structure's ->qsmaskinitnext field, but not touching the ->qsmaskinit field. Note that rcu_gp_init() is not currently holding any locks! This CPU does -not- report a quiescent state because the grace period has not yet initialized itself sufficiently to have set any bits in any of the leaf rcu_node structures' ->qsmask fields. 4. The rcu_gp_init() function continues initializing the new grace period, copying each leaf rcu_node structure's ->qsmaskinit field to its ->qsmask field while holding the corresponding ->lock. This sets the ->qsmask bit corresponding to #3's CPU. 5. Before the grace period ends, #3's CPU comes back online. Because the grace period has not yet done any force-quiescent-state scans (which would report a quiescent state on behalf of any offline CPUs), this CPU's ->qsmask bit is still set. 6. A task running on the newly onlined CPU is preempted while in an RCU read-side critical section. Because this CPU's ->qsmask bit is set, not only does this task queue itself on the leaf rcu_node structure's ->blkd_tasks list, it also sets that structure's ->gp_tasks pointer to reference it. 7. The grace period started in #2 above comes to an end.
This results in rcu_gp_cleanup() being invoked, which, among other things, checks to make sure that there are no tasks blocking the just-ended grace period, that is, that all ->gp_tasks pointers are NULL. The ->gp_tasks pointer corresponding to the task preempted in #6 above is non-NULL, which results in a splat. This splat is a false positive. The task's RCU read-side critical section cannot have begun before the just-ended grace period because this would mean either: (1) The CPU came online before the grace period started, which cannot have happened because the grace period started before that CPU went offline, or (2) The task started its RCU read-side critical section on some other CPU, but then it would have had to have been preempted before migrating to this CPU, which would mean that it would have instead queued itself on that other CPU's rcu_node structure. RCU's grace periods thus are working correctly. Or, more accurately, any remaining bugs in RCU's grace periods are elsewhere. This commit eliminates this false positive by adding code to the end of rcu_cpu_starting() that reports a quiescent state to RCU, which has the side-effect of clearing that CPU's ->qsmask bit, preventing the above scenario. This approach has the added benefit of more promptly reporting quiescent states corresponding to offline CPUs. Nevertheless, this commit does -not- remove the need for the force-quiescent-state scans to check for offline CPUs, given that a CPU might remain offline indefinitely. And without the checks in the force-quiescent-state scans, the grace period would also persist indefinitely, which could result in hangs or memory exhaustion. Note well that the call to rcu_report_qs_rnp() reporting the quiescent state must come -after- the setting of this CPU's bit in the leaf rcu_node structure's ->qsmaskinitnext field. Otherwise, lockdep-RCU will complain bitterly about quiescent states coming from an offline CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 72adf97458e3..6275ed3925e9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3710,7 +3710,12 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + /* Report QS -after- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + } else { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } } smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -- cgit v1.2.3 From 0b107d24d9361132758374a7b007c7c74efa007f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 16:18:28 -0700 Subject: rcu: Suppress false-positive splats from mid-init task resume Consider the following sequence of events in a PREEMPT=y kernel: 1. All CPUs corresponding to a given leaf rcu_node structure are offline. 2. The first phase of the rcu_gp_init() function's grace-period initialization runs, and sets that rcu_node structure's ->qsmaskinit to zero, as it should. 3. One of the CPUs corresponding to that rcu_node structure comes back online. Note that because this CPU came online after the grace period started, this grace period can safely ignore this newly onlined CPU. 4.
A task running on the newly onlined CPU enters an RCU-preempt read-side critical section, and is then preempted. Because the corresponding rcu_node structure's ->qsmask is zero, rcu_preempt_ctxt_queue() leaves the rcu_node structure's ->gp_tasks field NULL, as it should. 5. The rcu_gp_init() function continues running the second phase of grace-period initialization. The ->qsmask field of the parent of the aforementioned leaf rcu_node structure is set to not expect a quiescent state from the leaf, as is only right and proper. However, when rcu_gp_init() reaches the leaf, it invokes rcu_preempt_check_blocked_tasks(), which sees that the leaf's ->blkd_tasks list is non-empty, and therefore sets the leaf's ->gp_tasks field to reference the first task on that list. 6. The grace period ends before the preempted task resumes, which is perfectly fine, given that this grace period was under no obligation to wait for that task to exit its late-starting RCU-preempt read-side critical section. Unfortunately, the leaf's ->gp_tasks field is non-NULL, so rcu_gp_cleanup() splats. After all, it appears to rcu_gp_cleanup() that the grace period failed to wait for a task that was supposed to be blocking that grace period. This commit avoids this false-positive splat by adding a check of both ->qsmaskinit and ->wait_blkd_tasks to rcu_preempt_check_blocked_tasks(). If both ->qsmaskinit and ->wait_blkd_tasks are zero, then the task must have entered its RCU-preempt read-side critical section late (after all, the CPU that it is running on was not online at that time), which means that the upper-level rcu_node structure won't be waiting for anything on the leaf anyway. If ->wait_blkd_tasks is non-zero, then there is at least one task on this rcu_node structure's ->blkd_tasks list whose RCU read-side critical section predates the current grace period. If ->qsmaskinit is non-zero, there is at least one CPU that was online at the start of the current grace period. Thus, if both are zero, there is nothing to wait for. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 677b0c9f548d..1c9a836af5b6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -703,7 +703,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); - if (rcu_preempt_has_tasks(rnp)) { + if (rcu_preempt_has_tasks(rnp) && + (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); -- cgit v1.2.3 From ec2c29765a4ab12c236ac5a89b89660222ff6b01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2018 09:34:17 -0700 Subject: rcu: Fix grace-period hangs from mid-init task resume Without special fail-safe quiescent-state-propagation checks, grace-period hangs can result from the following scenario: 1. A task running on a given CPU is preempted in its RCU read-side critical section. 2. That CPU goes offline, and there are now no online CPUs corresponding to that CPU's leaf rcu_node structure. 3. The rcu_gp_init() function does the first phase of grace-period initialization, and sets the aforementioned leaf rcu_node structure's ->qsmaskinit field to all zeroes.
Because there is a blocked task, it does not propagate the zeroing of either ->qsmaskinit or ->qsmaskinitnext up the rcu_node tree. 4. The task resumes on some other CPU and exits its critical section. There is no grace period in progress, so the resulting quiescent state is not reported up the tree. 5. The rcu_gp_init() function does the second phase of grace-period initialization, which results in the leaf rcu_node structure being initialized to expect no further quiescent states, but with that structure's parent expecting a quiescent-state report. The parent will never receive a quiescent state from this leaf rcu_node structure, so the grace period will hang, resulting in RCU CPU stall warnings. It would be good to get rid of the special fail-safe quiescent-state propagation checks. This commit therefore checks the leaf rcu_node structure's ->wait_blkd_tasks field during grace-period initialization. If this flag is set, rcu_report_qs_rnp() is invoked to immediately report the possible quiescent state. While in the neighborhood, this commit also reports quiescent states for any CPUs that went offline between the two phases of grace-period initialization, thus reducing grace-period delays and hopefully eventually allowing removal of offline-CPU checks from the force-quiescent-state code path. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6275ed3925e9..7f872721c54e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -154,6 +154,9 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; +static void +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); @@ -1858,7 +1861,9 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) */ static bool rcu_gp_init(struct rcu_state *rsp) { + unsigned long flags; unsigned long oldmask; + unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1951,7 +1956,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq_rcu_node(rnp); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) @@ -1962,7 +1967,12 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq_rcu_node(rnp); + /* Quiescent states for tasks on any now-offline CPUs. */ + mask = rnp->qsmask & ~rnp->qsmaskinitnext; + if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + else + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2233,6 +2243,10 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * is the grace-period snapshot, which means that the quiescent states * are valid only if rnp->gp_seq is equal to gps. That structure's lock * must be held upon entry, and it is released before return.
+ * + * As a special case, if mask is zero, the bit-already-cleared check is + * disabled. This allows propagating quiescent state due to resumed tasks + * during grace-period initialization. */ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, @@ -2246,7 +2260,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask) || rnp->gp_seq != gps) { + if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) { /* * Our bit has already been cleared, or the -- cgit v1.2.3 From 1e64b15a4b102e1cd059d4d798b7a78f93341333 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 May 2018 19:23:09 -0700 Subject: rcu: Fix grace-period hangs due to race with CPU offline Without special fail-safe quiescent-state-propagation checks, grace-period hangs can result from the following scenario: 1. CPU 1 goes offline. 2. Because CPU 1 is the only CPU in the system blocking the current grace period, the grace period ends as soon as rcu_cleanup_dying_idle_cpu()'s call to rcu_report_qs_rnp() returns. 3. At this point, the leaf rcu_node structure's ->lock is no longer held: rcu_report_qs_rnp() has released it, as it must in order to awaken the RCU grace-period kthread. 4. At this point, that same leaf rcu_node structure's ->qsmaskinitnext field still records CPU 1 as being online. This is absolutely necessary because the scheduler uses RCU (in this case on the wake-up path while awakening RCU's grace-period kthread), and ->qsmaskinitnext contains RCU's idea as to which CPUs are online. Therefore, invoking rcu_report_qs_rnp() after clearing CPU 1's bit from ->qsmaskinitnext would result in a lockdep-RCU splat due to RCU being used from an offline CPU. 5. RCU's grace-period kthread awakens, sees that the old grace period has completed and that a new one is needed. It therefore starts a new grace period, but because CPU 1's leaf rcu_node structure's ->qsmaskinitnext field still shows CPU 1 as being online, this new grace period is initialized to wait for a quiescent state from the now-offline CPU 1. 6. Without the fail-safe force-quiescent-state checks, there would be no quiescent state from the now-offline CPU 1, which would eventually result in RCU CPU stall warnings and memory exhaustion. It would be good to get rid of the special fail-safe quiescent-state propagation checks, and thus it would be good to fix things so that the above scenario cannot happen. This commit therefore adds a new ->ofl_lock to the rcu_state structure. This lock is held by rcu_gp_init() across the applying of buffered online and offline operations to the rcu_node tree, and it is also held by rcu_cleanup_dying_idle_cpu() when buffering a new offline operation. This prevents rcu_gp_init() from acquiring the leaf rcu_node structure's lock during the interval between when rcu_cleanup_dying_idle_cpu() invokes rcu_report_qs_rnp(), which releases ->lock, and the re-acquisition of that same lock. This in turn prevents the failure scenario outlined above, and will hopefully eventually allow removal of the offline-CPU checks from the force-quiescent-state code path. Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7f872721c54e..50e4f7ebf8cf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,6 +101,7 @@ struct rcu_state sname##_state = { \ .abbr = sabbr, \ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ + .ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -1900,11 +1901,13 @@ static bool rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); + spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_irq_rcu_node(rnp); + spin_unlock(&rsp->ofl_lock); continue; } @@ -1940,6 +1943,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) } raw_spin_unlock_irq_rcu_node(rnp); + spin_unlock(&rsp->ofl_lock); } /* @@ -3749,6 +3753,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + spin_lock(&rsp->ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ @@ -3757,6 +3762,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + spin_unlock(&rsp->ofl_lock); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3def94fc9c74..6683da6e4ecc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -363,6 +363,10 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ + + spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + /* Synchronize offline with */ + /* GP pre-initialization. */ }; /* Values for rcu_state structure's gp_flags field. */ -- cgit v1.2.3 From 1f3e5f51b933cbc25e3da0cdbdac40716df04ddb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 10:35:33 -0700 Subject: rcu: Add RCU-preempt check for waiting on newly onlined CPU RCU should only be waiting on CPUs that were online at the time that the current grace period started. Failure to abide by this rule can result in confusing splats during grace-period cleanup and initialization. This commit therefore adds a check to RCU-preempt's preempted-task queuing that checks for waiting on newly onlined CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1c9a836af5b6..6b85ce936ad4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -183,6 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); + /* RCU better not be waiting on newly onlined CPUs! */ + WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask & + rdp->grpmask); /* * Decide where to queue the newly blocked task. 
In theory, -- cgit v1.2.3 From f34f2f5852e556ee1c3b3b294571086b1791008a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 13:40:25 -0700 Subject: rcu: Move grace-period pre-init delay after pre-init The main race with the early part of grace-period initialization appears to be with CPU hotplug. To more fully open this race window, this commit moves the rcu_gp_slow() from the beginning of the early initialization loop to follow that loop, thus widening the race window, especially for the rcu_node structures that are initialized last. This commit also expands rcutree.gp_preinit_delay from 3 to 12, giving the same overall delay in the grace period, but concentrated in the spot where it will do the most good. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50e4f7ebf8cf..c577cadcc4f8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1900,7 +1900,6 @@ static bool rcu_gp_init(struct rcu_state *rsp) * will handle subsequent offline CPUs. */ rcu_for_each_leaf_node(rsp, rnp) { - rcu_gp_slow(rsp, gp_preinit_delay); spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1945,6 +1944,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq_rcu_node(rnp); spin_unlock(&rsp->ofl_lock); } + rcu_gp_slow(rsp, gp_preinit_delay); /* Races with CPU hotplug. */ /* * Set the quiescent-state-needed bits in all the rcu_node diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 5d2cc0bd50a0..b79ddb9eb9e8 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -1,5 +1,5 @@ rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30 -rcutree.gp_preinit_delay=3 +rcutree.gp_preinit_delay=12 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 -- cgit v1.2.3 From 17a8212b8de21ce1b9a91fb75c8a6fb337685b9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 14:30:02 -0700 Subject: rcu: Remove failsafe check for lost quiescent state Now that quiescent-state reporting is fully event-driven, this commit removes the check for a lost quiescent state from force_qs_rnp(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c577cadcc4f8..770f0df54015 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2317,8 +2317,9 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ -static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) +static void __maybe_unused +rcu_report_unblock_qs_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { unsigned long gps; @@ -2679,17 +2680,6 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) /* rcu_initiate_boost() releases rnp->lock */ continue; } - if (rnp->parent && - (rnp->parent->qsmask & rnp->grpmask)) { - /* - * Race between grace-period - * initialization and task exiting RCU - * read-side critical section: Report. 
- */ - rcu_report_unblock_qs_rnp(rsp, rnp, flags); - /* rcu_report_unblock_qs_rnp() rlses ->lock */ - continue; - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } -- cgit v1.2.3 From e05121ba5b81e2f85349f038642410578457f6db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2018 12:07:48 -0700 Subject: rcu: Remove CPU-hotplug failsafe from force-quiescent-state code path Now that quiescent states for newly offlined CPUs are reported either when that CPU goes offline or at the end of grace-period initialization, the CPU-hotplug failsafe in the force-quiescent-state code path is no longer needed. This commit therefore removes this failsafe. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 10 +++++----- kernel/rcu/tree.c | 9 +-------- kernel/rcu/tree.h | 1 - 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 759e7e83733d..a8d07feff6a0 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -391,11 +391,11 @@ TRACE_EVENT(rcu_quiescent_state_report, /* * Tracepoint for quiescent states detected by force_quiescent_state(). - * These trace events include the type of RCU, the grace-period number that - * was blocked by the CPU, the CPU itself, and the type of quiescent state, - * which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, "kick" - * when kicking a CPU that has been in dyntick-idle mode for too long, or - * "rqc" if the CPU got a quiescent state via its rcu_qs_ctr. + * These trace events include the type of RCU, the grace-period number + * that was blocked by the CPU, the CPU itself, and the type of quiescent + * state, which can be "dti" for dyntick-idle mode, "kick" when kicking + * a CPU that has been in dyntick-idle mode for too long, or "rqc" if the + * CPU got a quiescent state via its rcu_qs_ctr. */ TRACE_EVENT(rcu_fqs, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 770f0df54015..5f1a11f1f7bc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1188,14 +1188,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) smp_store_release(ruqp, true); } - /* Check for the CPU being offline. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("ofl")); - rdp->offline_fqs++; - rcu_gpnum_ovf(rnp, rdp); - return 1; - } - /* * A CPU running for an extended time within the kernel can * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, @@ -3718,6 +3710,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ + rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6683da6e4ecc..795d469c6f67 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -217,7 +217,6 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ - unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long cond_resched_completed; /* Grace period that needs help */ /* from cond_resched(). 
*/ -- cgit v1.2.3 From ff3cee39088b1931a432587059d66cd505f785dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 12:50:14 -0700 Subject: rcu: Add up-tree information to dump_blkd_tasks() diagnostics This commit updates dump_blkd_tasks() to print out quiescent-state bitmasks for the rcu_node structures further up the tree. This information helps debugging of interactions between CPU-hotplug operations and RCU grace-period initialization. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6b85ce936ad4..f45ff97b0d51 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -858,13 +858,15 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) { int i; struct list_head *lhp; + struct rcu_node *rnp1; raw_lockdep_assert_held_rcu_node(rnp); - pr_info("%s: grp: %d-%d level: %d ->gp_seq %#lx ->completedqs %#lx\n", + pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, - rnp->gp_seq, rnp->completedqs); - pr_info("%s: ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", - __func__, rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext); + (long)rnp->gp_seq, (long)rnp->completedqs); + for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); pr_info("%s: ->blkd_tasks", __func__); -- cgit v1.2.3 From 577389423187d8b51dfe6199297e579a3419b72b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 14:18:57 -0700 Subject: rcu: Add CPU online/offline state to dump_blkd_tasks() Interactions between CPU-hotplug operations and grace-period initialization can result in calls to dump_blkd_tasks(). One of the first debugging actions in this case is to search back in dmesg to work out which of the affected rcu_node structure's CPUs are online and to determine the last CPU-hotplug operation affecting any of those CPUs. This can be laborious and error-prone, especially when console output is lost. This commit therefore causes dump_blkd_tasks() to dump the state of the affected rcu_node structure's CPUs and, for each of those CPUs, the grace period during which the last offline and online operations took place. Signed-off-by: Paul E. McKenney
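To make the new per-CPU diagnostic format concrete, here is a minimal user-space sketch of the output line added in the tree_plugin.h hunk below. The structure and all field values are hypothetical stand-ins for the new rcu_data fields; only the print format and the ".o"[onl] flag-character idiom are taken from the patch itself.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for the new rcu_data diagnostic fields. */
struct cpu_diag {
	unsigned long rcu_onl_gp_seq;	/* ->gp_seq at last online. */
	short rcu_onl_gp_flags;		/* ->gp_flags at last online. */
	unsigned long rcu_ofl_gp_seq;	/* ->gp_seq at last offline. */
	short rcu_ofl_gp_flags;		/* ->gp_flags at last offline. */
	bool online;
};

int main(void)
{
	struct cpu_diag d = {
		.rcu_onl_gp_seq = 108, .rcu_onl_gp_flags = 1,
		.rcu_ofl_gp_seq = 96, .rcu_ofl_gp_flags = 0,
		.online = true,
	};

	/* ".o"[onl] indexes a two-character string: '.' offline, 'o' online. */
	printf("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
	       0, ".o"[d.online],
	       (long)d.rcu_onl_gp_seq, d.rcu_onl_gp_flags,
	       (long)d.rcu_ofl_gp_seq, d.rcu_ofl_gp_flags);
	return 0;
}

One character per CPU keeps the diagnostic line compact even on large systems, which matters when console output is at a premium during a hotplug-related failure.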
--- kernel/rcu/tree.c | 12 ++++++++++-- kernel/rcu/tree.h | 12 +++++++++--- kernel/rcu/tree_plugin.h | 25 ++++++++++++++++++++----- 3 files changed, 39 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f1a11f1f7bc..a2503ef1bbe2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1954,7 +1954,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp = this_cpu_ptr(rsp->rda); - rcu_preempt_check_blocked_tasks(rnp); + rcu_preempt_check_blocked_tasks(rsp, rnp); rnp->qsmask = rnp->qsmaskinit; WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) @@ -2063,7 +2063,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rnp, 10); + dump_blkd_tasks(rsp, rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); @@ -3516,6 +3516,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); + rdp->rcu_ofl_gp_seq = rsp->gp_seq; + rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_onl_gp_seq = rsp->gp_seq; + rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -3711,6 +3715,8 @@ void rcu_cpu_starting(unsigned int cpu) /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ + rdp->rcu_onl_gp_seq = READ_ONCE(rsp->gp_seq); + rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); @@ -3738,6 +3744,8 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) mask = rdp->grpmask; spin_lock(&rsp->ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rdp->rcu_ofl_gp_seq = READ_ONCE(rsp->gp_seq); + rdp->rcu_ofl_gp_flags = READ_ONCE(rsp->gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 795d469c6f67..f52bc059bfec 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -255,12 +255,16 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 7) RCU CPU stall data. */ + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ bool rcu_iw_pending; /* Is ->rcu_iw pending? */ unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ + unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */ + short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ + unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ + short rcu_onl_gp_flags; /* ->gp_flags at last online.
*/ int cpu; struct rcu_state *rsp; @@ -431,11 +435,13 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +static void rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, + struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, + int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f45ff97b0d51..613372246a07 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -699,13 +699,14 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +static void +rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) { struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rnp, 10); + dump_blkd_tasks(rsp, rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; @@ -854,10 +855,14 @@ void exit_rcu(void) * Dump the blocked-tasks state, but limit the list dump to the * specified number of elements. */ -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +static void +dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) { + int cpu; int i; struct list_head *lhp; + bool onl; + struct rcu_data *rdp; struct rcu_node *rnp1; raw_lockdep_assert_held_rcu_node(rnp); @@ -877,6 +882,14 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) break; } pr_cont("\n"); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + rdp = per_cpu_ptr(rsp->rda, cpu); + onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); + pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", + cpu, ".o"[onl], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + } } #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -949,7 +962,8 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +static void +rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) { WARN_ON_ONCE(rnp->qsmask); } @@ -990,7 +1004,8 @@ void exit_rcu(void) /* * Dump the guaranteed-empty blocked-tasks state. Trust but verify. */ -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +static void +dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) { WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } -- cgit v1.2.3 From fea3f222d3523dfdd0e86b11227d3cda20765102 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 15 May 2018 15:47:30 -0700 Subject: rcu: Record ->gp_state for both phases of grace-period initialization Grace-period initialization first processes any recent CPU-hotplug operations, and then initializes state for the new grace period. These two phases of initialization are currently not distinguished in debug prints, but the distinction is valuable in a number of debug situations. This commit therefore introduces two new values for ->gp_state, RCU_GP_ONOFF and RCU_GP_INIT, in order to make this distinction. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree.h | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2503ef1bbe2..ee218d743226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1891,6 +1891,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * for subsequent online CPUs, and that quiescent-state forcing * will handle subsequent offline CPUs. */ + rsp->gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rsp, rnp) { spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); @@ -1950,6 +1951,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ + rsp->gp_state = RCU_GP_INIT; rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f52bc059bfec..8077aff7ab40 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -380,16 +380,20 @@ struct rcu_state { #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ -#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ -#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ -#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ -#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#define RCU_GP_ONOFF 3 /* Grace-period initialization hotplug. */ +#define RCU_GP_INIT 4 /* Grace-period initialization. */ +#define RCU_GP_WAIT_FQS 5 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 6 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ #ifndef RCU_TREE_NONCORE static const char * const gp_state_names[] = { "RCU_GP_IDLE", "RCU_GP_WAIT_GPS", "RCU_GP_DONE_GPS", + "RCU_GP_ONOFF", + "RCU_GP_INIT", "RCU_GP_WAIT_FQS", "RCU_GP_DOING_FQS", "RCU_GP_CLEANUP", -- cgit v1.2.3 From f2e2df59786d7bd52e6e7e2d10c1c6ba433a0ee7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 16:23:23 -0700 Subject: rcu: Add diagnostics for offline CPUs failing to report QS CPUs are expected to report quiescent states when coming online and when going offline, and grace-period initialization is supposed to handle any race conditions where a CPU's ->qsmask bit is set just after it goes offline. This commit adds diagnostics for the case where an offline CPU nevertheless has a grace period waiting on it. Signed-off-by: Paul E. 
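McKenney

The complaint added below is gated by time_after(jiffies, rdp->rsp->gp_start + HZ), the kernel's wraparound-safe time comparison. As a rough user-space sketch, the macro is assumed to mirror the definition in include/linux/jiffies.h, and the counter values are invented; the point is that the one-second check keeps working even when the jiffies counter wraps:

#include <stdio.h>

#define HZ 1000UL			/* assumed tick rate for the example */
#define time_after(a, b) ((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long gp_start = -3UL;	/* grace period began just before jiffies wrapped */
	unsigned long jiffies = 1200;	/* current time, shortly after the wrap */

	/* 1203 jiffies have elapsed, more than HZ, despite the wrapped counter. */
	printf("complain: %s\n",
	       time_after(jiffies, gp_start + HZ) ? "yes" : "no");
	return 0;
}

A naive jiffies > gp_start + HZ comparison would answer "no" here; the signed subtraction is what makes the diagnostic robust across counter wrap.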
--- kernel/rcu/tree.c | 22 ++++++++++++++++++++++ kernel/rcu/tree.h | 1 + 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ee218d743226..d3333ee2c6f5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1188,6 +1188,27 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) smp_store_release(ruqp, true); } + /* If waiting too long on an offline CPU, complain. */ + if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && + time_after(jiffies, rdp->rsp->gp_start + HZ)) { + bool onl; + struct rcu_node *rnp1; + + WARN_ON(1); /* Offline CPUs are supposed to report QS! */ + pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", + __func__, rnp->grplo, rnp->grphi, rnp->level, + (long)rnp->gp_seq, (long)rnp->completedqs); + for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); + onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); + pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", + __func__, rdp->cpu, ".o"[onl], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + return 1; /* Break things loose after complaining. */ + } + /* * A CPU running for an extended time within the kernel can * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, @@ -1967,6 +1988,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); /* Quiescent states for tasks on any now-offline CPUs. */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; + rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); else diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8077aff7ab40..d51e6edc8e83 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -90,6 +90,7 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ + unsigned long rcu_gp_init_mask; /* Mask of offline CPUs at GP init. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask. */ /* Initialized from ->qsmaskinitnext at the */ -- cgit v1.2.3 From 3949fa9bac090ad217534c30bc3b6572289abf21 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 15:29:10 -0700 Subject: rcu: Make rcu_read_unlock_special() static Because rcu_read_unlock_special() is no longer used outside of kernel/rcu/tree_plugin.h, this commit makes it static. Signed-off-by: Paul E.
McKenney --- include/linux/rcupdate.h | 1 - kernel/rcu/tree_plugin.h | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 65163aa0bb04..67ec077c7ee5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -64,7 +64,6 @@ void rcu_barrier_tasks(void); void __rcu_read_lock(void); void __rcu_read_unlock(void); -void rcu_read_unlock_special(struct task_struct *t); void synchronize_rcu(void); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 613372246a07..54a251640f53 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -127,6 +127,7 @@ static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); +static void rcu_read_unlock_special(struct task_struct *t); /* * Tell them what RCU they are running. @@ -461,7 +462,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -void rcu_read_unlock_special(struct task_struct *t) +static void rcu_read_unlock_special(struct task_struct *t) { bool empty_exp; bool empty_norm; -- cgit v1.2.3 From 07f27570dcd148a5f4de7dc3513c1d1cd069b362 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 11 May 2018 17:30:34 +0900 Subject: rcu: Improve rcu_note_voluntary_context_switch() reporting We expect a TASKS_RCU quiescent state whenever cond_resched_tasks_rcu_qs() is called, regardless of whether a context switch actually takes place. However, the quiescent state is not currently reported when the task enters __schedule(), because that path is invoked with preempt = true. This commit therefore makes cond_resched_tasks_rcu_qs() report the quiescent state unconditionally. In addition, in TINY_RCU the rcu_bh quiescent state should also be reported when the tick interrupt arrives from userspace, but currently is not, so this commit reports it. Lastly, in TREE_RCU, rcu_note_voluntary_context_switch() should be invoked when the tick interrupt arrives not only from userspace but also from idle, both being extended quiescent states. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney [ paulmck: Simplify rcutiny portion given no RCU-tasks for !PREEMPT. ]
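The key hunk below simply reorders two lines, but the behavioral change is easier to see in a stand-alone sketch. Everything here is a stub invented for illustration: the flag stands in for the task's Tasks-RCU holdout marking, and cond_resched() is forced to report that it rescheduled, which is exactly the case the old ordering got wrong (in the kernel, cond_resched() returns nonzero only when it actually rescheduled):

#include <stdio.h>
#include <stdbool.h>

static bool rcu_tasks_holdout;

static void rcu_tasks_qs(void)
{
	rcu_tasks_holdout = false;	/* report the Tasks-RCU quiescent state */
}

static int cond_resched(void)
{
	return 1;			/* pretend a context switch happened */
}

int main(void)
{
	/* Old ordering: the quiescent state is skipped when we reschedule. */
	rcu_tasks_holdout = true;
	if (!cond_resched())
		rcu_tasks_qs();
	printf("old ordering: holdout still set? %s\n",
	       rcu_tasks_holdout ? "yes" : "no");

	/* New ordering: report the quiescent state first, unconditionally. */
	rcu_tasks_holdout = true;
	rcu_tasks_qs();
	cond_resched();
	printf("new ordering: holdout still set? %s\n",
	       rcu_tasks_holdout ? "yes" : "no");
	return 0;
}

With the old ordering, a reschedule leaves the holdout flag set even though the call site is by definition a safe point; the new ordering clears it in both cases.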
--- include/linux/rcupdate.h | 4 ++-- kernel/rcu/tiny.c | 4 +--- kernel/rcu/tree.c | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 67ec077c7ee5..5cab15e7ec83 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -194,8 +194,8 @@ static inline void exit_tasks_rcu_finish(void) { } */ #define cond_resched_tasks_rcu_qs() \ do { \ - if (!cond_resched()) \ - rcu_note_voluntary_context_switch_lite(current); \ + rcu_note_voluntary_context_switch_lite(current); \ + cond_resched(); \ } while (0) /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index a64eee0db39e..befc9321a89c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -122,10 +122,8 @@ void rcu_check_callbacks(int user) { if (user) rcu_sched_qs(); - else if (!in_softirq()) + if (user || !in_softirq()) rcu_bh_qs(); - if (user) - rcu_note_voluntary_context_switch(current); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d3333ee2c6f5..19beabe73629 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2645,6 +2645,7 @@ void rcu_check_callbacks(int user) rcu_sched_qs(); rcu_bh_qs(); + rcu_note_voluntary_context_switch(current); } else if (!in_softirq()) { @@ -2660,8 +2661,7 @@ void rcu_check_callbacks(int user) rcu_preempt_check_callbacks(); if (rcu_pending()) invoke_rcu_core(); - if (user) - rcu_note_voluntary_context_switch(current); + trace_rcu_utilization(TPS("End scheduler-tick")); } -- cgit v1.2.3 From a7538352da722fae5cc95ae6656ea2013f5b8b21 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 14 May 2018 13:27:33 -0700 Subject: rcu: Use pr_fmt to prefix "rcu: " to logging output This commit also adjusts some whitespace while in the area. Signed-off-by: Joe Perches Signed-off-by: Paul E. McKenney [ paulmck: Revert string-breaking %s as requested by Andy Shevchenko.
] --- kernel/rcu/rcuperf.c | 7 +++---- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/srcutree.c | 5 ++++- kernel/rcu/tree.c | 8 +++++--- kernel/rcu/tree_plugin.h | 10 ++++++---- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b080bc4a4f45..00e395c0d7d0 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -680,12 +680,11 @@ rcu_perf_init(void) break; } if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", - perf_type); + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); pr_alert("rcu-perf types:"); for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_alert(" %s", perf_ops[i]->name); - pr_alert("\n"); + pr_cont(" %s", perf_ops[i]->name); + pr_cont("\n"); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0481c7286875..90a94fecdd73 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1755,8 +1755,8 @@ rcu_torture_init(void) torture_type); pr_alert("rcu-torture types:"); for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - pr_alert(" %s", torture_ops[i]->name); - pr_alert("\n"); + pr_cont(" %s", torture_ops[i]->name); + pr_cont("\n"); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d6d6ea9738c0..e526b56998af 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) "rcu: " fmt + #include #include #include @@ -390,7 +392,8 @@ void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", + __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 19beabe73629..6f2922168216 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -27,6 +27,9 @@ * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ + +#define pr_fmt(fmt) "rcu: " fmt + #include #include #include @@ -1374,8 +1377,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. 
*/ - pr_err("INFO: %s detected stalls on CPUs/tasks:", - rsp->name); + pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4048,7 +4050,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", + pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 54a251640f53..dbfe90191e19 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -74,8 +74,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) - pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - RCU_FANOUT); + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n", + RCU_FANOUT); if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) @@ -88,11 +88,13 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", RCU_FANOUT_LEAF); if (rcu_fanout_leaf != RCU_FANOUT_LEAF) - pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", + rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST - pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", + kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif if (blimit != DEFAULT_RCU_BLIMIT) pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); -- cgit v1.2.3 From 6f56f714db067056c80f5d71510118f82872e34c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 May 2018 13:52:27 -0700 Subject: rcu: Improve RCU-tasks naming and comments The naming and comments associated with some RCU-tasks code make the faulty assumption that context switches due to cond_resched() are voluntary. As several people pointed out, this is not the case. This commit therefore updates function names and comments to better reflect current reality. Reported-by: Byungchul Park Reported-by: Joel Fernandes Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 12 ++++++------ include/linux/rcutiny.h | 2 +- kernel/rcu/tree.c | 2 +- kernel/rcu/update.c | 27 ++++++++++++++------------- 4 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dacc90358b33..75e5b393cf44 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -158,11 +158,11 @@ static inline void rcu_init_nohz(void) { } } while (0) /* - * Note a voluntary context switch for RCU-tasks benefit. This is a - * macro rather than an inline function to avoid #include hell. + * Note a quasi-voluntary context switch for RCU-tasks's benefit. + * This is a macro rather than an inline function to avoid #include hell. 
*/ #ifdef CONFIG_TASKS_RCU -#define rcu_note_voluntary_context_switch_lite(t) \ +#define rcu_tasks_qs(t) \ do { \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ @@ -170,14 +170,14 @@ static inline void rcu_init_nohz(void) { } #define rcu_note_voluntary_context_switch(t) \ do { \ rcu_all_qs(); \ - rcu_note_voluntary_context_switch_lite(t); \ + rcu_tasks_qs(t); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU */ -#define rcu_note_voluntary_context_switch_lite(t) do { } while (0) +#define rcu_tasks_qs(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) rcu_all_qs() #define call_rcu_tasks call_rcu_sched #define synchronize_rcu_tasks synchronize_sched @@ -194,7 +194,7 @@ static inline void exit_tasks_rcu_finish(void) { } */ #define cond_resched_tasks_rcu_qs() \ do { \ - rcu_note_voluntary_context_switch_lite(current); \ + rcu_tasks_qs(current); \ cond_resched(); \ } while (0) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 7b3c82e8a625..8d9a0ea8f0b5 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -93,7 +93,7 @@ static inline void kfree_call_rcu(struct rcu_head *head, #define rcu_note_context_switch(preempt) \ do { \ rcu_sched_qs(); \ - rcu_note_voluntary_context_switch_lite(current); \ + rcu_tasks_qs(current); \ } while (0) static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6f2922168216..ccc061acf887 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -457,7 +457,7 @@ void rcu_note_context_switch(bool preempt) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_dynticks.rcu_qs_ctr); if (!preempt) - rcu_note_voluntary_context_switch_lite(current); + rcu_tasks_qs(current); out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c230a60ece4..5783bdf86e5a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -507,14 +507,15 @@ early_initcall(check_cpu_stall_init); #ifdef CONFIG_TASKS_RCU /* - * Simple variant of RCU whose quiescent states are voluntary context switch, - * user-space execution, and idle. As such, grace periods can take one good - * long time. There are no read-side primitives similar to rcu_read_lock() - * and rcu_read_unlock() because this implementation is intended to get - * the system into a safe state for some of the manipulations involved in - * tracing and the like. Finally, this implementation does not support - * high call_rcu_tasks() rates from multiple CPUs. If this is required, - * per-CPU callback lists will be needed. + * Simple variant of RCU whose quiescent states are voluntary context + * switch, cond_resched_rcu_qs(), user-space execution, and idle. + * As such, grace periods can take one good long time. There are no + * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() + * because this implementation is intended to get the system into a safe + * state for some of the manipulations involved in tracing and the like. + * Finally, this implementation does not support high call_rcu_tasks() + * rates from multiple CPUs. If this is required, per-CPU callback lists + * will be needed. */ /* Global list of callbacks and associated lock. 
*/ @@ -542,11 +543,11 @@ static struct task_struct *rcu_tasks_kthread_ptr; * period elapses, in other words after all currently executing RCU * read-side critical sections have completed. call_rcu_tasks() assumes * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), entry into idle, or transition to usermode - * execution. As such, there are no read-side primitives analogous to - * rcu_read_lock() and rcu_read_unlock() because this primitive is intended - * to determine that all tasks have passed through a safe state, not so - * much for data-strcuture synchronization. + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. -- cgit v1.2.3 From 15651201fa055ec81d3669b36ab7c2fb12c3ce36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 14:41:41 -0700 Subject: rcu: Mark task as .need_qs less aggressively If any scheduling-clock interrupt interrupts an RCU-preempt read-side critical section, the interrupted task's ->rcu_read_unlock_special.b.need_qs field is set. This causes the outermost rcu_read_unlock() to incur the extra overhead of calling into rcu_read_unlock_special(). This commit reduces that overhead by setting ->rcu_read_unlock_special.b.need_qs only if the grace period has been in effect for more than one second. Why one second? Because this is comfortably smaller than the minimum RCU CPU stall-warning timeout of three seconds, but long enough that the .need_qs marking should happen quite rarely. And if your RCU read-side critical section has run on-CPU for a full second, it is not unreasonable to invest some CPU time in ending the grace period quickly. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dbfe90191e19..0239cf8a4be6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -730,6 +730,7 @@ rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) */ static void rcu_preempt_check_callbacks(void) { + struct rcu_state *rsp = &rcu_preempt_state; struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { @@ -738,7 +739,9 @@ static void rcu_preempt_check_callbacks(void) } if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && - __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm) && + !t->rcu_read_unlock_special.b.need_qs && + time_after(jiffies, rsp->gp_start + HZ)) t->rcu_read_unlock_special.b.need_qs = true; } -- cgit v1.2.3 From 3b57a3994f330a102b91bd70c84c9530c0ae50f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 16:01:56 -0700 Subject: rcu: Inline rcu_dynticks_momentary_idle() into its sole caller The rcu_dynticks_momentary_idle() function is invoked only from rcu_momentary_dyntick_idle(), and neither function is particularly large. This commit therefore saves a few lines by inlining rcu_dynticks_momentary_idle() into rcu_momentary_dyntick_idle(). Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ccc061acf887..ebdbb5f96e5c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -385,20 +385,6 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) return snap != rcu_dynticks_snap(rdtp); } -/* - * Do a double-increment of the ->dynticks counter to emulate a - * momentary idle-CPU quiescent state. - */ -static void rcu_dynticks_momentary_idle(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &rdtp->dynticks); - - /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); -} - /* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the @@ -430,12 +416,17 @@ bool rcu_eqs_special_set(int cpu) * * We inform the RCU core by emulating a zero-duration dyntick-idle period. * - * The caller must have disabled interrupts. + * The caller must have disabled interrupts and must not be idle. */ static void rcu_momentary_dyntick_idle(void) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special; + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); - rcu_dynticks_momentary_idle(); + special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* It is illegal to call this from idle state. */ + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); } /* -- cgit v1.2.3 From c7037ff5249cee237b8c8a6f99998ae4f3916b60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 18:00:17 -0700 Subject: rcu: Clarify and correct the rcu_preempt_qs() header comment The rcu_preempt_qs() function only applies to the CPU, not the task. A task really is allowed to invoke this function while in an RCU-preempt read-side critical section, but only if it has first added itself to some leaf rcu_node structure's ->blkd_tasks list. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0239cf8a4be6..07d1ad175994 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -294,13 +294,17 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) } /* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * not in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. + * Record a preemptible-RCU quiescent state for the specified CPU. + * Note that this does not necessarily mean that the task currently running + * on the CPU is in a quiescent state: Instead, it means that the current + * grace period need not wait on any RCU read-side critical section that + * starts later on this CPU. It also means that if the current task is + * in an RCU read-side critical section, it has already added itself to + * some leaf rcu_node structure's ->blkd_tasks list. In addition to the + * current task, there might be any number of other tasks blocked while + * in an RCU read-side critical section. * - * As with the other rcu_*_qs() functions, callers to this function - * must disable preemption. + * Callers to this function must disable preemption. 
*/ static void rcu_preempt_qs(void) { -- cgit v1.2.3 From 164ba3fc4864346dbc365f8b89d8888e1b6cd38c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 20:41:36 -0700 Subject: rcu: Remove unused rcu_kick_nohz_cpu() function The rcu_kick_nohz_cpu() function is no longer used, and the functionality it used to provide is now provided by a call to resched_cpu() in the force-quiescent-state function rcu_implicit_dynticks_qs(). This commit therefore removes rcu_kick_nohz_cpu(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 17 ----------------- 2 files changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d51e6edc8e83..4e74df768c57 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -483,7 +483,6 @@ static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 07d1ad175994..75a91d58b8f7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2645,23 +2645,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -/* - * An adaptive-ticks CPU can potentially execute in kernel mode for an - * arbitrarily long period of time with the scheduling-clock tick turned - * off. RCU will be paying attention to this CPU because it is in the - * kernel, but the CPU cannot be guaranteed to be executing the RCU state - * machine because the scheduling-clock tick has been disabled. Therefore, - * if an adaptive-ticks CPU is failing to respond to the current grace - * period and has not be idle from an RCU perspective, kick it. - */ -static void __maybe_unused rcu_kick_nohz_cpu(int cpu) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(cpu)) - smp_send_reschedule(cpu); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ -} - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? -- cgit v1.2.3 From ab6b82147f471e31d9397c95d523631b6c8953f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:21:20 -0700 Subject: rcu: Remove unused local variable "cpu" One danger of using __maybe_unused is that the compiler doesn't yell at you when you remove the last reference, witness rcu_bind_gp_kthread() and its local variable "cpu". This commit removes this local variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75a91d58b8f7..2cc9bf0d363a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2670,8 +2670,6 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) */ static void rcu_bind_gp_kthread(void) { - int __maybe_unused cpu; - if (!tick_nohz_full_enabled()) return; housekeeping_affine(current, HK_FLAG_RCU); -- cgit v1.2.3 From 95394e69c42f0da83a176936fdb28f6cac57ea69 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from panic_on_rcu_stall() and rcu_blocking_is_gp() These functions are in kernel/rcu/tree.c, which is not an include file, so there is no problem dropping the "inline", especially given that these functions are nowhere near a fastpath. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ebdbb5f96e5c..3504ee35e226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1342,7 +1342,7 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } -static inline void panic_on_rcu_stall(void) +static void panic_on_rcu_stall(void) { if (sysctl_panic_on_rcu_stall) panic("RCU Stall\n"); @@ -3080,7 +3080,7 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. */ -static inline int rcu_blocking_is_gp(void) +static int rcu_blocking_is_gp(void) { int ret; -- cgit v1.2.3 From eac45e586cd38a1b56aa716560002e68741b78a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from rcu_torture_print_module_parms() This function is in rcutorture.c, which is not an include file, so there is no problem dropping the "inline", especially given that this function is invoked only twice per rcutorture run. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 90a94fecdd73..57a4277ccc63 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1359,7 +1359,7 @@ rcu_torture_stats(void *arg) return 0; } -static inline void +static void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG -- cgit v1.2.3 From 9622179519c52ead944c3b6a07aed9c6db3659e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from rcu_perf_print_module_parms() This function is in rcuperf.c, which is not an include file, so there is no problem dropping the "inline", especially given that this function is invoked only twice per rcuperf run. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 00e395c0d7d0..3e86940245d9 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -536,7 +536,7 @@ retry: return 0; } -static inline void +static void rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG -- cgit v1.2.3 From 51fbb910f52c8559a78665d203e55ab2b95e7126 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 13:15:40 -0700 Subject: rcu: Remove __maybe_unused from rcu_cpu_has_callbacks() The rcu_cpu_has_callbacks() function is now used in all configurations, so this commit removes the __maybe_unused. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3504ee35e226..cdc4fca0c4cb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3315,7 +3315,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; -- cgit v1.2.3 From b06ae25a1e2b54b2b5bc589a4a118b7bb39159fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 13:32:51 -0700 Subject: rcu: Use RCU CPU stall timeout for rcu_check_gp_start_stall() Currently, rcu_check_gp_start_stall() waits for one second after the first request before complaining that a grace period has not yet started. This was desirable while testing the conversion from ->future_gp_needed[] to ->gp_seq_needed, but it is a bit on the hair-trigger side for production use under heavy load. This commit therefore makes this wait time be exactly that of the RCU CPU stall warning, allowing easy adjustment of both timeouts to suit the distribution or installation at hand. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cdc4fca0c4cb..7746fe1ee3fc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2753,6 +2753,7 @@ static void rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; unsigned long flags; unsigned long j; struct rcu_node *rnp_root = rcu_get_root(rsp); @@ -2762,8 +2763,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) return; j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || - time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + if (time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || atomic_read(&warned)) return; @@ -2771,8 +2772,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, j = jiffies; if (rcu_gp_in_progress(rsp) || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || - time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || atomic_read(&warned)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -2784,18 +2785,18 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, j = jiffies; if (rcu_gp_in_progress(rsp) || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rsp->gp_req_activity + HZ) || - time_before(j, rsp->gp_activity + HZ) || + time_before(j, rsp->gp_req_activity + gpssdelay) || + time_before(j, rsp->gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. 
*/ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x %s->state:%#lx\n", + pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", __func__, (long)READ_ONCE(rsp->gp_seq), (long)READ_ONCE(rnp_root->gp_seq_needed), j - rsp->gp_req_activity, j - rsp->gp_activity, - rsp->gp_flags, rsp->name, + rsp->gp_flags, rsp->gp_state, rsp->name, rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); WARN_ON(1); if (rnp_root != rnp) -- cgit v1.2.3 From 0d805a70a652a6eef8d0283e5183879e7acb85ad Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:13 -0700 Subject: rcu: Add comment documenting how rcu_seq_snap works rcu_seq_snap may be tricky to decipher. Lets document how it works with an example to make it easier. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney [ paulmck: Shrink comment as suggested by Peter Zijlstra. ] --- kernel/rcu/rcu.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index aa215d6355f8..89f13fffac73 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -91,7 +91,17 @@ static inline void rcu_seq_end(unsigned long *sp) WRITE_ONCE(*sp, rcu_seq_endval(sp)); } -/* Take a snapshot of the update side's sequence number. */ +/* + * rcu_seq_snap - Take a snapshot of the update side's sequence number. + * + * This function returns the earliest value of the grace-period sequence number + * that will indicate that a full grace period has elapsed since the current + * time. Once the grace-period sequence number has reached this value, it will + * be safe to invoke all callbacks that have been registered prior to the + * current time. This value is the current grace-period number plus two to the + * power of the number of low-order bits reserved for state, then rounded up to + * the next value in which the state bits are all zero. + */ static inline unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; -- cgit v1.2.3 From c03be752d39dc64dcfda0ac8ce87fb10b1ee5621 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 May 2018 18:49:46 -0400 Subject: rcu: Speed up calling of RCU tasks callbacks Joel Fernandes found that the synchronize_rcu_tasks() was taking a significant amount of time. He demonstrated it with the following test: # cd /sys/kernel/tracing # while [ 1 ]; do x=1; done & # echo '__schedule_bug:traceon' > set_ftrace_filter # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m1.064s user 0m0.000s sys 0m0.004s Where it takes a little over a second to perform the synchronize, because there's a loop that waits 1 second at a time for tasks to get through their quiescent points when there's a task that must be waited for. After discussion we came up with a simple way to wait for holdouts but increase the time for each iteration of the loop but no more than a full second. With the new patch we have: # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m0.131s user 0m0.000s sys 0m0.004s Which drops it down to 13% of what the original wait time was. Link: http://lkml.kernel.org/r/20180523063815.198302-2-joel@joelfernandes.org Reported-by: Joel Fernandes (Google) Suggested-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. 
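McKenney

Before the patch itself, the proposed backoff schedule is easy to see in a small user-space sketch; the HZ value and the number of passes through the holdout loop are invented for the example:

#include <stdio.h>

#define HZ 1000		/* assumed tick rate for the example */

int main(void)
{
	int fract = 10;	/* first wait is HZ/10, i.e. 100 ms at HZ=1000 */

	for (int pass = 1; pass <= 12; pass++) {
		printf("pass %2d: sleep %4d jiffies\n", pass, HZ / fract);
		if (fract > 1)
			fract--;	/* back off toward full 1-second waits */
	}
	return 0;
}

The first pass sleeps only 100 ms and later passes sleep progressively longer, so a quick grace period completes in tenths of a second while a stubborn one still degrades to the old one-second polling.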
--- kernel/rcu/update.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5783bdf86e5a..4c7c49c106ee 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -668,6 +668,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); + int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); @@ -749,13 +750,25 @@ * holdouts. When the list is empty, we are done. */ lastreport = jiffies; - while (!list_empty(&rcu_tasks_holdouts)) { + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ + fract = 10; + + for (;;) { bool firstreport; bool needreport; int rtst; struct task_struct *t1; - schedule_timeout_interruptible(HZ); + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); -- cgit v1.2.3 From cd23ac8ddb7be993f88bee893b89a8b4971c3651 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 May 2018 18:58:16 -0400 Subject: rcu: Add comment to the last sleep in the rcu tasks loop At the end of rcu_tasks_kthread() there's a lonely schedule_timeout_uninterruptible() call with no apparent rationale for its existence. But there is. It is to keep the thread from going into a tight loop if there's some anomaly. That really needs a comment. Link: http://lkml.kernel.org/r/20180524223839.GU3803@linux.vnet.ibm.com Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c7c49c106ee..39cb23d22109 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -814,6 +814,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) list = next; cond_resched(); } + /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_uninterruptible(HZ/10); } } -- cgit v1.2.3 From 47199a0812535217c29933cecf468568bb37f933 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2018 10:33:08 -0700 Subject: rcu: Add diagnostics for rcutorture writer stall warning This commit adds any in-the-future ->gp_seq_needed fields to the diagnostics for an rcutorture writer stall warning message. Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7746fe1ee3fc..4915525559ac 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -606,11 +606,32 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); */ void show_rcu_gp_kthreads(void) { + int cpu; + struct rcu_data *rdp; + struct rcu_node *rnp; struct rcu_state *rsp; for_each_rcu_flavor(rsp) { pr_info("%s: wait state: %d ->state: %#lx\n", rsp->name, rsp->gp_state, rsp->gp_kthread->state); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_GE(rsp->gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", + rnp->grplo, rnp->grphi, rnp->gp_seq, + rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rsp->gp_seq, + rdp->gp_seq_needed)) + continue; + pr_info("\tcpu %d ->gp_seq_needed %lu\n", + cpu, rdp->gp_seq_needed); + } + } /* sched_show_task(rsp->gp_kthread); */ } } -- cgit v1.2.3 From 67abb96cbf307e16e3c6d1a0328ece085b5ce94c Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 1 Jun 2018 11:03:09 +0900 Subject: rcu: Check the range of jiffies_till_{first,next}_fqs when setting them Currently, the range of jiffies_till_{first,next}_fqs are checked and adjusted on and on in the loop of rcu_gp_kthread on runtime. However, it's enough to check them only when setting them, not every time in the loop. So make them handled on a setting time via sysfs. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4915525559ac..7498a416f63b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -510,8 +510,38 @@ static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; -module_param(jiffies_till_first_fqs, ulong, 0644); -module_param(jiffies_till_next_fqs, ulong, 0644); +static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) +{ + ulong j; + int ret = kstrtoul(val, 0, &j); + + if (!ret) + WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j); + return ret; +} + +static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp) +{ + ulong j; + int ret = kstrtoul(val, 0, &j); + + if (!ret) + WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1)); + return ret; +} + +static struct kernel_param_ops first_fqs_jiffies_ops = { + .set = param_set_first_fqs_jiffies, + .get = param_get_ulong, +}; + +static struct kernel_param_ops next_fqs_jiffies_ops = { + .set = param_set_next_fqs_jiffies, + .get = param_get_ulong, +}; + +module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644); +module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644); module_param(rcu_kick_kthreads, bool, 0644); /* @@ -2180,10 +2210,6 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle quiescent-state forcing. 
*/ first_gp_fqs = true; j = jiffies_till_first_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_first_fqs = HZ; - } ret = 0; for (;;) { if (!ret) { @@ -2218,13 +2244,6 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; - } } else { /* Deal with stray signal. */ cond_resched_tasks_rcu_qs(); -- cgit v1.2.3 From 2ee5aca54622aacc196106c623fea4116f1043a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 9 Jun 2018 01:22:20 -0700 Subject: rcu: Make rcu_seq_diff() more exact The current implementatation of rcu_seq_diff() follows tradition in providing a rough-and-ready approximation of the number of elapsed grace periods between the two rcu_seq values. However, this difference is used to flag RCU-failure "near misses", which can be a valuable debugging aid, so more exactitude would be an improvement. This commit therefore improves the accuracy of rcu_seq_diff(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 89f13fffac73..d5e0294b8580 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -158,7 +158,20 @@ static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) */ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) { - return (new - old) >> RCU_SEQ_CTR_SHIFT; + unsigned long rnd_diff; + + if (old == new) + return 0; + /* + * Compute the number of grace periods (still shifted up), plus + * one if either of new and old is not an exact grace period. + */ + rnd_diff = (new & ~RCU_SEQ_STATE_MASK) - + ((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK) + + ((new & RCU_SEQ_STATE_MASK) || (old & RCU_SEQ_STATE_MASK)); + if (ULONG_CMP_GE(RCU_SEQ_STATE_MASK, rnd_diff)) + return 1; /* Definitely no grace period has elapsed. */ + return ((rnd_diff - RCU_SEQ_STATE_MASK - 1) >> RCU_SEQ_CTR_SHIFT) + 2; } /* -- cgit v1.2.3 From 89b4cd4b9ebf15b01f70b85105ea5f5c6b2a3788 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jun 2018 12:29:16 -0700 Subject: rcu: Print stall-warning NMI dyntick state in hexadecimal The ->dynticks_nmi_nesting field records the nesting depth of both interrupt and NMI handlers. Because the kernel can enter interrupts and never leave them (and vice versa) and because NMIs can interrupt manipulation of the ->dynticks_nmi_nesting field, the values in this field must be both chosen and maniupated very carefully. As a result, although the value is zero when the corresponding CPU is executing neither an interrupt nor an NMI handler, it is 4,611,686,018,427,387,906 on 64-bit systems when there is a single level of interrupt/NMI handling in progress. This number is difficult to remember and interpret, so this commit switches the output to hexadecimal, resulting in the much nicer 0x4000000000000002. Signed-off-by: Paul E. 
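McKenney

The specific constant quoted in this message can be double-checked from user space. The sketch below assumes the field's non-idle base value is LONG_MAX/2 + 1, mirroring the kernel's DYNTICK_IRQ_NONIDLE as described above, with 2 added for a single level of interrupt/NMI nesting:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	long nonidle_base = LONG_MAX / 2 + 1;	/* assumed DYNTICK_IRQ_NONIDLE */
	long one_level = nonidle_base + 2;	/* single interrupt/NMI level */

	printf("decimal: %ld\n", one_level);
	printf("hex:     %#lx\n", (unsigned long)one_level);
	return 0;
}

On a 64-bit build this prints 4611686018427387906 and 0x4000000000000002, which makes the case for the hexadecimal format immediate.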
From 89b4cd4b9ebf15b01f70b85105ea5f5c6b2a3788 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jun 2018 12:29:16 -0700 Subject: rcu: Print stall-warning NMI dyntick state in hexadecimal The ->dynticks_nmi_nesting field records the nesting depth of both interrupt and NMI handlers. Because the kernel can enter interrupts and never leave them (and vice versa) and because NMIs can interrupt manipulation of the ->dynticks_nmi_nesting field, the values in this field must be both chosen and manipulated very carefully. As a result, although the value is zero when the corresponding CPU is executing neither an interrupt nor an NMI handler, it is 4,611,686,018,427,387,906 on 64-bit systems when there is a single level of interrupt/NMI handling in progress. This number is difficult to remember and interpret, so this commit switches the output to hexadecimal, resulting in the much nicer 0x4000000000000002. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2cc9bf0d363a..c1b17f5b9361 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1801,7 +1801,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], -- cgit v1.2.3 From 52e17ba1d063ab6adb367f288babd380e30bad46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Jun 2018 08:54:37 -0700 Subject: srcu: Add grace-period number to rcutorture statistics printout This commit adds the SRCU grace-period number to the rcutorture statistics printout, which allows it to be compared to the rcutorture "Writer stall state" message. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e526b56998af..6c9866a854b1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1268,7 +1268,8 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) unsigned long s0 = 0, s1 = 0; idx = sp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", + tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; -- cgit v1.2.3 From c7cd161ecb2188c07ba9560ca82aee756575359f Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:17 -0700 Subject: rcu: Assign higher prio to RCU threads if rcutorture is built-in The rcutorture RCU priority boosting tests fail even with CONFIG_RCU_BOOST set because rcutorture's threads run at the same priority as the default RCU kthreads (RT class with priority of 1). This patch checks if RCU torture is built into the kernel and if so, assigns RT priority 2 to the RCU threads, allowing the rcutorture boost tests to pass. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7498a416f63b..8143b8d40a6c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3923,12 +3923,16 @@ static int __init rcu_spawn_gp_kthread(void) struct task_struct *t; /* Force priority into range. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 + && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) + kthread_prio = 2; + else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) kthread_prio = 1; else if (kthread_prio < 0) kthread_prio = 0; else if (kthread_prio > 99) kthread_prio = 99; + if (kthread_prio != kthread_prio_in) pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", kthread_prio, kthread_prio_in); -- cgit v1.2.3
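The resulting clamping ladder is easier to see in isolation. A standalone restatement of the logic the patch adds to rcu_spawn_gp_kthread() (the two config flags are ordinary booleans here; in the kernel they come from IS_ENABLED() and IS_BUILTIN(), and the function name is hypothetical):

#include <stdio.h>
#include <stdbool.h>

/* Mirror of the kthread_prio clamping ladder shown in the diff above. */
static int clamp_kthread_prio(int prio, bool rcu_boost, bool torture_builtin)
{
	if (rcu_boost && prio < 2 && torture_builtin)
		return 2;	/* outrank rcutorture's prio-1 boost testers */
	else if (rcu_boost && prio < 1)
		return 1;
	else if (prio < 0)
		return 0;
	else if (prio > 99)
		return 99;	/* highest RT priority */
	return prio;
}

int main(void)
{
	printf("%d\n", clamp_kthread_prio(0, true, true));	/* -> 2 */
	printf("%d\n", clamp_kthread_prio(0, true, false));	/* -> 1 */
	printf("%d\n", clamp_kthread_prio(150, false, false));	/* -> 99 */
	return 0;
}

The ordering matters: the built-in-rcutorture case must be tested first, otherwise the plain CONFIG_RCU_BOOST branch would settle for priority 1 and the boost testers could never be preempted by the grace-period kthread.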
McKenney" Date: Tue, 8 May 2018 09:20:34 -0700 Subject: rcutorture: Change units of onoff_interval to jiffies Some RCU bugs have been sensitive to the frequency of CPU-hotplug operations, which have been gradually increased over time. But this frequency is now at the one-second lower limit that can be specified using the rcutorture.onoff_interval kernel parameter. This commit therefore changes the units of rcutorture.onoff_interval from seconds to jiffies, and also sets the value specified for this kernel parameter in the TREE03 rcutorture scenario to 200, which is 200 milliseconds for HZ=1000. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- kernel/rcu/rcutorture.c | 4 ++-- tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 2 +- tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index efc7aa7a0670..77bd3e635313 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3632,8 +3632,8 @@ Set time (s) after boot for CPU-hotplug testing. rcutorture.onoff_interval= [KNL] - Set time (s) between CPU-hotplug operations, or - zero to disable CPU-hotplug testing. + Set time (jiffies) between CPU-hotplug operations, + or zero to disable CPU-hotplug testing. rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0481c7286875..eb6d4915b4e6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -87,7 +87,7 @@ torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); + "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -1889,7 +1889,7 @@ rcu_torture_init(void) firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); if (firsterr) goto unwind; firsterr = rcu_torture_stall_init(); diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index b79ddb9eb9e8..5c3213cc3ad7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -1,4 +1,4 @@ -rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30 +rcutorture.onoff_interval=200 rcutorture.onoff_holdoff=30 rcutree.gp_preinit_delay=12 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh index 24ec91041957..7bab8246392b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh @@ -39,7 +39,7 @@ rcutorture_param_onoff () { if ! 
bootparam_hotplug_cpu "$1" && configfrag_hotplug_cpu "$2" then echo CPU-hotplug kernel, adding rcutorture onoff. 1>&2 - echo rcutorture.onoff_interval=3 rcutorture.onoff_holdoff=30 + echo rcutorture.onoff_interval=1000 rcutorture.onoff_holdoff=30 fi } -- cgit v1.2.3 From 6bea2cc5a97b7e9677088b1a93e27edb74ae0e55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 15:30:36 -0700 Subject: rcu: Remove rcutorture test version and sequence number Back when RCU had a debugfs interface, there was a test version and sequence number that allowed associating debugfs data with a particular test run, where the test run started with modprobe and ended with rmmod, which was how tests were run back on the old ABAT system within IBM. But rcutorture testing no longer runs on ABAT, and there is no longer an RCU debugfs interface, so there is no longer any need for test versions and sequence numbers. This commit therefore removes the rcutorture_record_test_transition() and rcutorture_record_progress() functions, and along with them the rcutorture_testseq and rcutorture_vernum variables that they update. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ---- kernel/rcu/rcutorture.c | 4 +--- kernel/rcu/tree.c | 37 ------------------------------------- 3 files changed, 1 insertion(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index aa215d6355f8..0453a7d12b3f 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -444,7 +444,6 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); -void rcutorture_record_test_transition(void); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -458,7 +457,6 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *flags = 0; *gp_seq = 0; } -static inline void rcutorture_record_test_transition(void) { } static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, @@ -505,8 +503,6 @@ static inline void rcu_bh_force_quiescent_state(void) { } static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ -extern unsigned long rcutorture_testseq; -extern unsigned long rcutorture_vernum; unsigned long rcu_get_gp_seq(void); unsigned long rcu_bh_get_gp_seq(void); unsigned long rcu_sched_get_gp_seq(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index eb6d4915b4e6..335387fabac2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1016,7 +1016,7 @@ rcu_torture_writer(void *arg) break; } } - rcutorture_record_progress(++rcu_torture_current_version); + rcu_torture_current_version++; /* Cycle through nesting levels of rcu_expedite_gp() calls. 
*/ if (can_expedite && !(torture_random(&rand) & 0xff & (!!expediting - 1))) { @@ -1613,7 +1613,6 @@ rcu_torture_cleanup(void) unsigned long gp_seq = 0; int i; - rcutorture_record_test_transition(); if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); @@ -1918,7 +1917,6 @@ rcu_torture_init(void) goto unwind; } } - rcutorture_record_test_transition(); torture_init_end(); return 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d3333ee2c6f5..65abb399b08d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -191,18 +191,6 @@ module_param(gp_cleanup_delay, int, 0444); */ #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ -/* - * Track the rcutorture test sequence number and the update version - * number within a given test. The rcutorture_testseq is incremented - * on every rcutorture module load and unload, so has an odd value - * when a test is running. The rcutorture_vernum is set to zero - * when rcutorture starts and is incremented on each rcutorture update. - * These variables enable correlating rcutorture output with the - * RCU tracing information. - */ -unsigned long rcutorture_testseq; -unsigned long rcutorture_vernum; - /* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is @@ -622,20 +610,6 @@ void show_rcu_gp_kthreads(void) } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -/* - * Record the number of times rcutorture tests have been initiated and - * terminated. This information allows the debugfs tracing stats to be - * correlated to the rcutorture messages, even when the rcutorture module - * is being repeatedly loaded and unloaded. In other words, we cannot - * store this state in rcutorture itself. - */ -void rcutorture_record_test_transition(void) -{ - rcutorture_testseq++; - rcutorture_vernum = 0; -} -EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); - /* * Send along grace-period-related data for rcutorture diagnostics. */ @@ -664,17 +638,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -/* - * Record the number of writer passes through the current rcutorture test. - * This is also used to correlate debugfs tracing stats with the rcutorture - * messages. - */ -void rcutorture_record_progress(unsigned long vernum) -{ - rcutorture_vernum++; -} -EXPORT_SYMBOL_GPL(rcutorture_record_progress); - /* * Return the root node of the specified rcu_state structure. */ -- cgit v1.2.3 From 2d3625841dcee549653b6f50ffa4e6431305035a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:09:47 -0700 Subject: rcuperf: Remove unused torturing_tasks() function The torturing_tasks() function in rcuperf.c is not used, so this commit removes it. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b080bc4a4f45..06eb5e42726d 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -369,11 +369,6 @@ static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) return cur_ops->gp_diff(new, old); } -static bool __maybe_unused torturing_tasks(void) -{ - return cur_ops == &tasks_ops; -} - /* * If performance tests complete, wait for shutdown to commence. */ -- cgit v1.2.3 From 6b06aa723ed705102f3c63a494ac45352ccc0e7c Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 22 May 2018 10:56:05 -0700 Subject: rcutorture: Extract common code from rcu_torture_reader() This commit extracts the code executed on each pass through the loop in rcu_torture_reader() into a new rcu_torture_one_read() function. This new function will also be used by rcu_torture_timer(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 98 +++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 335387fabac2..971e31ae9bcf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1089,6 +1089,60 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. + */ +static bool rcu_torture_one_read(struct torture_random_state *trsp) +{ + int idx; + unsigned long started; + unsigned long completed; + struct rcu_torture *p; + int pipe_count; + unsigned long long ts; + + idx = cur_ops->readlock(); + started = cur_ops->get_gp_seq(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + cur_ops->readunlock(idx); + return false; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + cur_ops->read_delay(trsp); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed = cur_ops->get_gp_seq(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, started, completed); + rcu_ftrace_dump(DUMP_ALL); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = rcutorture_seq_diff(completed, started); + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); + return true; +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. 
The @@ -1165,14 +1219,8 @@ static void rcu_torture_timer(struct timer_list *unused) static int rcu_torture_reader(void *arg) { - unsigned long started; - unsigned long completed; - int idx; DEFINE_TORTURE_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; struct timer_list t; - unsigned long long ts; VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, MAX_NICE); @@ -1184,44 +1232,8 @@ rcu_torture_reader(void *arg) if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - idx = cur_ops->readlock(); - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - torturing_tasks()); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); + if (!rcu_torture_one_read(&rand)) schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed = cur_ops->get_gp_seq(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); - rcu_ftrace_dump(DUMP_ALL); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { -- cgit v1.2.3 From 8da9a59523b6608f4b21f3e489578d0993c0779f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 11:17:51 -0700 Subject: rcutorture: Use atomic increment for n_rcu_torture_timers Currently, rcu_torture_timer() relies on a lock to guard updates to n_rcu_torture_timers. Unfortunately, consolidating code with rcu_torture_reader() will dispense with this lock. This commit therefore makes n_rcu_torture_timers be an atomic_long_t and uses atomic_long_inc() to carry out the update. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e31ae9bcf..2452e4a29923 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -151,7 +151,7 @@ static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; +static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; static atomic_long_t n_cbfloods; @@ -1160,6 +1160,7 @@ static void rcu_torture_timer(struct timer_list *unused) int pipe_count; unsigned long long ts; + atomic_long_inc(&n_rcu_torture_timers); idx = cur_ops->readlock(); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -1177,7 +1178,6 @@ static void rcu_torture_timer(struct timer_list *unused) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); cur_ops->read_delay(&rand); - n_rcu_torture_timers++; spin_unlock(&rand_lock); preempt_disable(); pipe_count = p->rtort_pipe_count; @@ -1290,7 +1290,7 @@ rcu_torture_stats_print(void) pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers); + atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", n_barrier_successes, -- cgit v1.2.3 From 3025520ec424df8b0fd5cdc319ad6b83406d9954 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 11:38:47 -0700 Subject: rcutorture: Use per-CPU random state for rcu_torture_timer() Currently, the rcu_torture_timer() function uses a single global torture_random_state structure protected by a single global lock. This conflicts to some extent with performance and scalability, but even more with the goal of consolidating read-side testing with rcu_torture_reader(). This commit therefore creates a per-CPU torture_random_state structure for use by rcu_torture_timer() and eliminates the lock. Signed-off-by: Paul E. McKenney [ paulmck: Make rcu_torture_timer_rand static, per 0day Test Robot report. ] --- include/linux/torture.h | 2 ++ kernel/rcu/rcutorture.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index a55e80817dae..61dfd93b6ee4 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -64,6 +64,8 @@ struct torture_random_state { long trs_count; }; #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } +#define DEFINE_TORTURE_RANDOM_PERCPU(name) \ + DEFINE_PER_CPU(struct torture_random_state, name) unsigned long torture_random(struct torture_random_state *trsp); /* Task shuffler, which causes CPUs to occasionally go idle. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2452e4a29923..d5a5465d2507 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1143,6 +1143,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) return true; } +static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. 
The @@ -1154,12 +1156,12 @@ static void rcu_torture_timer(struct timer_list *unused) int idx; unsigned long started; unsigned long completed; - static DEFINE_TORTURE_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; + struct torture_random_state *trsp; unsigned long long ts; + trsp = this_cpu_ptr(&rcu_torture_timer_rand); atomic_long_inc(&n_rcu_torture_timers); idx = cur_ops->readlock(); started = cur_ops->get_gp_seq(); @@ -1176,9 +1178,7 @@ static void rcu_torture_timer(struct timer_list *unused) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - spin_unlock(&rand_lock); + cur_ops->read_delay(trsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { -- cgit v1.2.3 From 241b42522abb36c78cdc84d0cade358c4449306f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 11:59:31 -0700 Subject: rcutorture: Make rcu_torture_timer() use rcu_torture_one_read() This commit saves a few lines of code by making rcu_torture_timer() invoke rcu_torture_one_read(), thus completing the consolidation of code between rcu_torture_timer() and rcu_torture_reader(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 47 +---------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d5a5465d2507..ac700aa6dcaf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1153,53 +1153,8 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); */ static void rcu_torture_timer(struct timer_list *unused) { - int idx; - unsigned long started; - unsigned long completed; - struct rcu_torture *p; - int pipe_count; - struct torture_random_state *trsp; - unsigned long long ts; - - trsp = this_cpu_ptr(&rcu_torture_timer_rand); atomic_long_inc(&n_rcu_torture_timers); - idx = cur_ops->readlock(); - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - torturing_tasks()); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(trsp); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed = cur_ops->get_gp_seq(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - started, completed); - rcu_ftrace_dump(DUMP_ALL); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); + (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand)); /* Test call_rcu() invocation from interrupt handler. */ if (cur_ops->call) { -- cgit v1.2.3
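The per-CPU idiom used by the two preceding patches is general: a lock-protected global becomes lock-free once every CPU owns its own instance and accesses only that instance. A condensed kernel-style sketch of the pattern (illustrative only, not a complete module; my_rand_state and my_timer_handler are hypothetical names):

#include <linux/percpu.h>
#include <linux/torture.h>

/* One torture_random_state per CPU; no lock is needed because a timer
 * handler runs on one CPU and touches only that CPU's instance. */
static DEFINE_PER_CPU(struct torture_random_state, my_rand_state);

static void my_timer_handler(struct timer_list *unused)
{
	struct torture_random_state *trsp = this_cpu_ptr(&my_rand_state);

	(void)torture_random(trsp);	/* advance this CPU's generator only */
}

This trades a small amount of per-CPU memory for the removal of a globally contended spinlock, which is exactly the scalability argument made in the 3025520ec424 commit message above.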
McKenney" Date: Fri, 25 May 2018 07:29:25 -0700 Subject: rcutorture: Handle extended read-side critical sections This commit enables rcutorture to test whether RCU properly aggregates different types of read-side critical sections into a larger section covering the set. It does this by extending an initial read-side critical section randomly for a random number of extensions. There is a new rcu_torture_ops field ->extendable that specifies what extensions are permitted for a given flavor of RCU (for example, SRCU does not permit any extensions, while RCU-sched permits all types). Note that if a given operation (for example, local_bh_disable()) extends an RCU read-side critical section, then rcutorture feels free to also start and end the critical section with that operation's type of disabling. Disabling operations include local_bh_disable(), local_irq_disable(), and preempt_disable(). This commit also adds a new "busted_srcud" torture type, which verifies rcutorture's ability to detect extensions of RCU read-side critical sections that are not handled. Gotta test the test, after all! Note that it is not legal to invoke local_bh_disable() with interrupts disabled, and this transition is avoided by overriding the random-number generator when it wants to call local_bh_disable() while interrupts are disabled. The code instead leaves both interrupts and bh/softirq disabled in this case. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 152 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ac700aa6dcaf..f97757755207 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -62,6 +62,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); +/* Bits for ->extendables field, extendables param, and related definitions. */ +#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_BH 0x1 /* Extend readers by disabling bh. */ +#define RCUTORTURE_RDR_IRQ 0x2 /* ... disabling interrupts. */ +#define RCUTORTURE_RDR_PREEMPT 0x4 /* ... disabling preemption. */ +#define RCUTORTURE_RDR_RCU 0x8 /* ... entering another RCU reader. */ +#define RCUTORTURE_MAX_EXTEND (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | \ + RCUTORTURE_RDR_PREEMPT) +#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ + /* Must be power of two minus one. 
*/ + torture_param(int, cbflood_inter_holdoff, HZ, "Holdoff between floods (jiffies)"); torture_param(int, cbflood_intra_holdoff, 1, @@ -69,6 +81,8 @@ torture_param(int, cbflood_intra_holdoff, 1, torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); torture_param(int, cbflood_n_per_burst, 20000, "# callbacks per burst in flood"); +torture_param(int, extendables, RCUTORTURE_MAX_EXTEND, + "Extend readers by disabling bh (1), irqs (2), or preempt (4)"); torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); @@ -277,6 +291,8 @@ struct rcu_torture_ops { void (*stats)(void); int irq_capable; int can_boost; + int extendables; + int ext_irq_conflict; const char *name; }; @@ -452,6 +468,8 @@ static struct rcu_torture_ops rcu_bh_ops = { .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .extendables = (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ), + .ext_irq_conflict = RCUTORTURE_RDR_RCU, .name = "rcu_bh" }; @@ -622,6 +640,26 @@ static struct rcu_torture_ops srcud_ops = { .name = "srcud" }; +/* As above, but broken due to inappropriate reader extension. */ +static struct rcu_torture_ops busted_srcud_ops = { + .ttype = SRCU_FLAVOR, + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .get_gp_seq = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, + .name = "busted_srcud" +}; + /* * Definitions for sched torture testing. */ @@ -660,6 +698,7 @@ static struct rcu_torture_ops sched_ops = { .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "sched" }; @@ -1089,6 +1128,110 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +/* + * Do one extension of an RCU read-side critical section using the + * current reader state in readstate (set to zero for initial entry + * to extended critical section), set the new state as specified by + * newstate (set to zero for final exit from extended critical section), + * and random-number-generator state in trsp. If this is neither the + * beginning or end of the critical section and if there was actually a + * change, do a ->read_delay(). + */ +static void rcutorture_one_extend(int *readstate, int newstate, + struct torture_random_state *trsp) +{ + int idxnew = -1; + int idxold = *readstate; + int statesnew = ~*readstate & newstate; + int statesold = *readstate & ~newstate; + + WARN_ON_ONCE(idxold < 0); + WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + + /* First, put new protection in place to avoid critical-section gap. */ + if (statesnew & RCUTORTURE_RDR_BH) + local_bh_disable(); + if (statesnew & RCUTORTURE_RDR_IRQ) + local_irq_disable(); + if (statesnew & RCUTORTURE_RDR_PREEMPT) + preempt_disable(); + if (statesnew & RCUTORTURE_RDR_RCU) + idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + + /* Next, remove old protection, irq first due to bh conflict. 
*/ + if (statesold & RCUTORTURE_RDR_IRQ) + local_irq_enable(); + if (statesold & RCUTORTURE_RDR_BH) + local_bh_enable(); + if (statesold & RCUTORTURE_RDR_PREEMPT) + preempt_enable(); + if (statesold & RCUTORTURE_RDR_RCU) + cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + + /* Delay if neither beginning nor end and there was a change. */ + if ((statesnew || statesold) && *readstate && newstate) + cur_ops->read_delay(trsp); + + /* Update the reader state. */ + if (idxnew == -1) + idxnew = idxold & ~RCUTORTURE_RDR_MASK; + WARN_ON_ONCE(idxnew < 0); + WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); + *readstate = idxnew | newstate; + WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); + WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); +} + +/* Return the biggest extendables mask given current RCU and boot parameters. */ +static int rcutorture_extend_mask_max(void) +{ + int mask; + + WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); + mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; + mask = mask | RCUTORTURE_RDR_RCU; + return mask; +} + +/* Return a random protection state mask, but with at least one bit set. */ +static int +rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) +{ + int mask = rcutorture_extend_mask_max(); + + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + mask = mask & (torture_random(trsp) >> RCUTORTURE_RDR_SHIFT); + if ((mask & RCUTORTURE_RDR_IRQ) && + !(mask & RCUTORTURE_RDR_BH) && + (oldmask & RCUTORTURE_RDR_BH)) + mask |= RCUTORTURE_RDR_BH; /* Can't enable bh w/irq disabled. */ + if ((mask & RCUTORTURE_RDR_IRQ) && + !(mask & cur_ops->ext_irq_conflict) && + (oldmask & cur_ops->ext_irq_conflict)) + mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ + return mask ?: RCUTORTURE_RDR_RCU; +} + +/* + * Do a randomly selected number of extensions of an existing RCU read-side + * critical section. + */ +static void rcutorture_loop_extend(int *readstate, + struct torture_random_state *trsp) +{ + int i; + int mask = rcutorture_extend_mask_max(); + + WARN_ON_ONCE(!*readstate); /* -Existing- RCU read-side critsect! */ + if (!((mask - 1) & mask)) + return; /* Current RCU flavor not extendable. */ + i = (torture_random(trsp) >> 3) & RCUTORTURE_RDR_MAX_LOOPS; + while (i--) { + mask = rcutorture_extend_mask(*readstate, trsp); + rcutorture_one_extend(readstate, mask, trsp); + } +} + /* * Do one read-side critical section, returning false if there was * no data to read. 
Can be invoked both from process context and @@ -1096,14 +1239,16 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) */ static bool rcu_torture_one_read(struct torture_random_state *trsp) { - int idx; unsigned long started; unsigned long completed; + int newstate; struct rcu_torture *p; int pipe_count; + int readstate = 0; unsigned long long ts; - idx = cur_ops->readlock(); + newstate = rcutorture_extend_mask(readstate, trsp); + rcutorture_one_extend(&readstate, newstate, trsp); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1113,12 +1258,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); + rcutorture_one_extend(&readstate, 0, trsp); return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(trsp); + rcutorture_loop_extend(&readstate, trsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -1139,7 +1284,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - cur_ops->readunlock(idx); + rcutorture_one_extend(&readstate, 0, trsp); + WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); return true; } @@ -1704,7 +1850,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, &tasks_ops, + &busted_srcud_ops, &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose)) -- cgit v1.2.3 From bf1bef50bee13b2292929f4b86118302a3827a32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Jun 2018 08:50:09 -0700 Subject: rcutorture: Emphasize testing of single reader protection type For RCU implementations supporting multiple types of reader protection, rcutorture currently randomly selects the combinations of types of protection for each phase of each reader. The problem with this is that, for example, given the four kinds of protection for RCU-sched (local_irq_disable(), local_bh_disable(), preempt_disable(), and rcu_read_lock_sched()), the reader will be protected by a single mechanism only 25% of the time. We really need heavier testing of single read-side mechanisms. This commit therefore uses only a single mechanism about 60% of the time, half of the time explicitly and one-eighth of the time by chance. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f97757755207..aa0be7ec2a26 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -69,6 +69,7 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett > 8; + unsigned long randmask2 = randmask1 >> 1; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - mask = mask & (torture_random(trsp) >> RCUTORTURE_RDR_SHIFT); + /* Half the time lots of bits, half the time only one bit. */ + if (randmask1 & 0x1) + mask = mask & randmask2; + else + mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); if ((mask & RCUTORTURE_RDR_IRQ) && !(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) -- cgit v1.2.3
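The claimed proportions are easy to sanity-check in user space: draw masks the same way and count how often exactly one bit survives. In the sketch below, NBITS stands in for RCUTORTURE_RDR_NBITS (assumed here to be 4, matching the four reader-state bits defined earlier) and rand() stands in for torture_random():

#include <stdio.h>
#include <stdlib.h>

#define NBITS 4				/* stand-in for RCUTORTURE_RDR_NBITS */
#define FULL  ((1 << NBITS) - 1)	/* stand-in for the maximal extend mask */

int main(void)
{
	int single = 0, trials = 1000000;

	srand(1);
	for (int i = 0; i < trials; i++) {
		unsigned long randmask1 = (unsigned long)rand();
		unsigned long randmask2 = randmask1 >> 1;
		int mask;

		if (randmask1 & 0x1)			/* half: many bits */
			mask = FULL & randmask2;
		else					/* half: exactly one bit */
			mask = FULL & (1 << (randmask2 % NBITS));
		if (mask && !(mask & (mask - 1)))	/* power of two, so single bit */
			single++;
	}
	printf("single-bit masks: %.1f%%\n", 100.0 * single / trials);
	return 0;
}

With four bits this prints roughly 62.5%: the explicit branch contributes 50%, and the many-bits branch lands on a single bit by chance 4 times in 16, adding the commit's "one-eighth of the time by chance" (0.5 x 0.25 = 0.125).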
From 450efca7182a516a12dfcc0311abfd242bde42b2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 10 Jun 2018 16:45:43 -0700 Subject: rcutorture: Disable RT throttling for boost tests Currently rcutorture is not able to torture RCU boosting properly. This is because the rcutorture's boost threads which are doing the torturing may be throttled due to RT throttling. This patch makes rcutorture use the right torture technique (unthrottled rcutorture boost tasks) for torturing RCU so that the test fails correctly when no boost is available. Currently this requires accessing sysctl_sched_rt_runtime directly, but that should be Ok since rcutorture is test code. Such direct access is also only possible if rcutorture is used as a built-in so make it conditional on that. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aa0be7ec2a26..74e47d0a618c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -55,6 +55,7 @@ #include #include #include +#include #include "rcu.h" @@ -772,6 +773,32 @@ static void rcu_torture_boost_cb(struct rcu_head *head) smp_store_release(&rbip->inflight, 0); } +static int old_rt_runtime = -1; + +static void rcu_torture_disable_rt_throttle(void) +{ + /* + * Disable RT throttling so that rcutorture's boost threads don't get + * throttled. Only possible if rcutorture is built-in otherwise the + * user should manually do this by setting the sched_rt_period_us and + * sched_rt_runtime sysctls. + */ + if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime != -1) + return; + + old_rt_runtime = sysctl_sched_rt_runtime; + sysctl_sched_rt_runtime = -1; +} + +static void rcu_torture_enable_rt_throttle(void) +{ + if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime == -1) + return; + + sysctl_sched_rt_runtime = old_rt_runtime; + old_rt_runtime = -1; +} + static int rcu_torture_boost(void *arg) { unsigned long call_rcu_time; @@ -1511,6 +1538,7 @@ static int rcutorture_booster_cleanup(unsigned int cpu) mutex_lock(&boost_mutex); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; + rcu_torture_enable_rt_throttle(); mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! */ @@ -1527,6 +1555,7 @@ static int rcutorture_booster_init(unsigned int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); + rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, cpu_to_node(cpu), -- cgit v1.2.3 From 3b745c8969c752601cb68c82a06735363563ab42 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 10 Jun 2018 16:45:44 -0700 Subject: rcutorture: Make boost test more robust Currently, with RCU_BOOST disabled, I get no failures when forcing rcutorture to test RCU boost priority inversion. The reason seems to be that we don't check for failures if the callback never ran at all for the duration of the boost-test loop. Further, the 'rtb' and 'rtbf' counters seem to be used inconsistently.
'rtb' is incremented at the start of each test and 'rtbf' is incremented per-CPU on each failure of call_rcu. So it's possible that 'rtbf' > 'rtb'. To test the boost with rcutorture, I did the following on a 4-CPU x86 machine: modprobe rcutorture test_boost=2 sleep 20 rmmod rcutorture With patch: rtbf: 8 rtb: 12 Without patch: rtbf: 0 rtb: 2 In summary this patch: - Increments failed and total test counters once per boost-test. - Checks for failure cases correctly. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 74e47d0a618c..36b9b8266213 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -799,6 +799,18 @@ static void rcu_torture_enable_rt_throttle(void) old_rt_runtime = -1; } +static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) +{ + if (end - start > test_boost_duration * HZ - HZ / 2) { + VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + + return true; /* failed */ + } + + return false; /* passed */ +} + static int rcu_torture_boost(void *arg) { unsigned long call_rcu_time; @@ -819,6 +831,21 @@ static int rcu_torture_boost(void *arg) init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { + /* Track if the test failed already in this test interval? */ + bool failed = false; + + /* Increment n_rcu_torture_boosts once per boost-test */ + while (!kthread_should_stop()) { + if (mutex_trylock(&boost_mutex)) { + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + if (kthread_should_stop()) + goto checkwait; + /* Wait for the next test interval. */ oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { @@ -837,11 +864,10 @@ static int rcu_torture_boost(void *arg) /* RCU core before ->inflight = 1. */ smp_store_release(&rbi.inflight, 1); call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } + /* Check if the boost test failed */ + failed = failed || + rcu_torture_boost_failed(call_rcu_time, + jiffies); call_rcu_time = jiffies; } stutter_wait("rcu_torture_boost"); @@ -849,6 +875,14 @@ static int rcu_torture_boost(void *arg) goto checkwait; } + /* + * If boost never happened, then inflight will always be 1, in + * this case the boost check would never happen in the above + * loop so do another one here. + */ + if (!failed && smp_load_acquire(&rbi.inflight)) + rcu_torture_boost_failed(call_rcu_time, jiffies); + /* * Set the start time of the next test interval. * Yes, this is vulnerable to long delays, but such @@ -861,7 +895,6 @@ static int rcu_torture_boost(void *arg) if (mutex_trylock(&boost_mutex)) { boost_starttime = jiffies + test_boost_interval * HZ; - n_rcu_torture_boosts++; mutex_unlock(&boost_mutex); break; } -- cgit v1.2.3
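The timing criterion itself is plain jiffies arithmetic: a boost callback that takes longer than the test interval minus half a second of slack counts as a failure. A userspace restatement, with HZ and test_boost_duration given illustrative values (1000 and 4 respectively):

#include <stdio.h>
#include <stdbool.h>

#define HZ 1000				/* assumed tick rate */
static int test_boost_duration = 4;	/* seconds; illustrative value */

/* Mirrors rcu_torture_boost_failed(): true if the callback took longer
 * than the boost interval minus a half-second of slack. */
static bool boost_failed(unsigned long start, unsigned long end)
{
	return end - start > test_boost_duration * HZ - HZ / 2;
}

int main(void)
{
	printf("%d\n", boost_failed(1000, 4400));	/* 3.4 s -> pass (0) */
	printf("%d\n", boost_failed(1000, 4600));	/* 3.6 s -> fail (1) */
	return 0;
}

The unsigned subtraction end - start also makes the check immune to jiffies wrapping, the same trick the ULONG_CMP_*() macros rely on elsewhere in this series.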
From 622be33fcbc93e9b672b99ed338369eb5e843ac3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:47:34 +0200 Subject: rcutorture: Use monotonic timestamp for stall detection The get_seconds() call is deprecated because it overflows on 32-bit architectures. The algorithm in rcu_torture_stall() can deal with the overflow, but another problem here is that using a CLOCK_REALTIME stamp can lead to a false-positive stall warning when a settimeofday() happens concurrently. Using ktime_get_seconds() instead avoids those issues and will never overflow. The added cast to 'unsigned long', however, is necessary to make ULONG_CMP_LT() work correctly. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 36b9b8266213..049b3735dba8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1622,7 +1622,7 @@ static int rcu_torture_stall(void *args) VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } if (!kthread_should_stop()) { - stop_at = get_seconds() + stall_cpu; + stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ rcu_read_lock(); if (stall_cpu_irqsoff) @@ -1631,7 +1631,8 @@ static int rcu_torture_stall(void *args) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", smp_processor_id()); - while (ULONG_CMP_LT(get_seconds(), stop_at)) + while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), + stop_at)) continue; /* Induce RCU CPU stall warning. */ if (stall_cpu_irqsoff) local_irq_enable(); -- cgit v1.2.3 From 4babd855fd6137f9792117eb73b096c221a49d3c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:18 -0700 Subject: rcutorture: Add support to detect if boost kthread prio is too low When rcutorture is built in to the kernel, an earlier patch detects that and raises the priority of RCU's kthreads to allow rcutorture's RCU priority boosting tests to succeed. However, if rcutorture is built as a module, those priorities must be raised manually via the rcutree.kthread_prio kernel boot parameter. If this manual step is not taken, rcutorture's RCU priority boosting tests will fail due to kthread starvation. One approach would be to raise the default priority, but that risks breaking existing users. Another approach would be to allow runtime adjustment of RCU's kthread priorities, but that introduces numerous "interesting" race conditions. This patch therefore instead detects too-low priorities, and prints a message and disables the RCU priority boosting tests in that case. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E.
McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 32 ++++++++++++++++++++++++++++---- kernel/rcu/tree.c | 7 +++++++ 3 files changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0453a7d12b3f..bee070979970 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -502,6 +502,7 @@ static inline void rcu_force_quiescent_state(void) { } static inline void rcu_bh_force_quiescent_state(void) { } static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } +static inline int rcu_get_gp_kthreads_prio(void) { return 0; } #else /* #ifdef CONFIG_TINY_RCU */ unsigned long rcu_get_gp_seq(void); unsigned long rcu_bh_get_gp_seq(void); @@ -510,6 +511,7 @@ unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); +int rcu_get_gp_kthreads_prio(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 049b3735dba8..e3d2d4f1d928 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1787,6 +1787,32 @@ static void rcu_torture_barrier_cleanup(void) } } +static bool rcu_torture_can_boost(void) +{ + static int boost_warn_once; + int prio; + + if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) + return false; + + prio = rcu_get_gp_kthreads_prio(); + if (!prio) + return false; + + if (prio < 2) { + if (boost_warn_once == 1) + return false; + + pr_alert("%s: WARN: RCU kthread priority too low to test boosting. " + "Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 " + "on the kernel command line.\n", KBUILD_MODNAME); + boost_warn_once = 1; + return false; + } + + return true; +} + static enum cpuhp_state rcutor_hp; static void @@ -1831,8 +1857,7 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) + if (rcu_torture_can_boost()) cpuhp_remove_state(rcutor_hp); /* @@ -2056,8 +2081,7 @@ rcu_torture_init(void) test_boost_interval = 1; if (test_boost_duration < 2) test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { + if (rcu_torture_can_boost()) { boost_starttime = jiffies + test_boost_interval * HZ; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65abb399b08d..b4bcb5e21ca6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -180,6 +180,13 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* Retreive RCU kthreads priority for rcutorture */ +int rcu_get_gp_kthreads_prio(void) +{ + return kthread_prio; +} +EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); + /* * Number of grace periods between delays, normalized by the duration of * the delay. The longer the delay, the more the grace periods between -- cgit v1.2.3 From bf5b64355a3ce41752856b66c4efad4d7a88e84b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:19 -0700 Subject: rcutorture: Fix rcu_barrier successes counter The rcutorture test module currently increments both successes and error for the barrier test upon error, which results in misleading statistics being printed. 
This commit therefore changes the code to increment the success counter only when the test actually passes. This change was tested by returning from the barrier callback without incrementing the callback counter, thus introducing what appeared to rcutorture to be rcu_barrier() failures. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e3d2d4f1d928..bdc86cdf3b8b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -169,7 +169,7 @@ static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; -static long n_barrier_successes; +static long n_barrier_successes; /* did rcu_barrier test succeed? */ static atomic_long_t n_cbfloods; static struct list_head rcu_torture_removed; @@ -1723,8 +1723,9 @@ static int rcu_torture_barrier(void *arg) atomic_read(&barrier_cbs_invoked), n_barrier_cbs); WARN_ON_ONCE(1); + } else { + n_barrier_successes++; } - n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_barrier"); @@ -1803,9 +1804,7 @@ static bool rcu_torture_can_boost(void) if (boost_warn_once == 1) return false; - pr_alert("%s: WARN: RCU kthread priority too low to test boosting. " - "Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 " - "on the kernel command line.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME); boost_warn_once = 1; return false; } -- cgit v1.2.3
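The counting bug fixed here is the classic shared-exit-path mistake: a success counter incremented after the failure branch rather than in its else arm. In miniature (a self-contained restatement, not the rcutorture code):

#include <stdio.h>
#include <stdbool.h>

static long n_attempts, n_successes;

static void record_barrier_result(bool ok)
{
	n_attempts++;
	if (!ok) {
		/* report the failure, leave n_successes alone */
	} else {
		n_successes++;	/* previously incremented unconditionally */
	}
}

int main(void)
{
	record_barrier_result(true);
	record_barrier_result(false);
	printf("barrier: %ld/%ld\n", n_successes, n_attempts);	/* 1/2, not 2/2 */
	return 0;
}

Before the fix, a failed rcu_barrier() test incremented both counters, so the "barrier: successes/attempts" statistics line could report a perfect score even while the WARN_ON_ONCE() was firing.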