Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 173
1 file changed, 146 insertions, 27 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index dd081987a8ec..0c47e300210a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -12,8 +12,8 @@
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
  *
  * Copyright IBM Corporation, 2008
  *
@@ -58,8 +58,6 @@
 #include <linux/suspend.h>
 
 #include "tree.h"
-#include <trace/events/rcu.h>
-
 #include "rcu.h"
 
 MODULE_ALIAS("rcutree");
@@ -369,6 +367,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
 				 bool user)
 {
+	struct rcu_state *rsp;
+	struct rcu_data *rdp;
+
 	trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
 	if (!user && !is_idle_task(current)) {
 		struct task_struct *idle __maybe_unused =
@@ -380,6 +381,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
 			  current->pid, current->comm,
 			  idle->pid, idle->comm); /* must be idle task! */
 	}
+	for_each_rcu_flavor(rsp) {
+		rdp = this_cpu_ptr(rsp->rda);
+		do_nocb_deferred_wakeup(rdp);
+	}
 	rcu_prepare_for_idle(smp_processor_id());
 	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
 	smp_mb__before_atomic_inc();  /* See above. */
@@ -411,11 +416,12 @@ static void rcu_eqs_enter(bool user)
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
-	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
+	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
 		rdtp->dynticks_nesting = 0;
-	else
+		rcu_eqs_enter_common(rdtp, oldval, user);
+	} else {
 		rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
-	rcu_eqs_enter_common(rdtp, oldval, user);
+	}
 }
 
 /**
@@ -533,11 +539,12 @@ static void rcu_eqs_exit(bool user)
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	WARN_ON_ONCE(oldval < 0);
-	if (oldval & DYNTICK_TASK_NEST_MASK)
+	if (oldval & DYNTICK_TASK_NEST_MASK) {
 		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
-	else
+	} else {
 		rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-	rcu_eqs_exit_common(rdtp, oldval, user);
+		rcu_eqs_exit_common(rdtp, oldval, user);
+	}
 }
 
 /**
@@ -716,7 +723,7 @@ bool rcu_lockdep_current_cpu_online(void)
 	bool ret;
 
 	if (in_nmi())
-		return 1;
+		return true;
 	preempt_disable();
 	rdp = this_cpu_ptr(&rcu_sched_data);
 	rnp = rdp->mynode;
@@ -755,6 +762,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 }
 
 /*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
+/*
  * Return true if the specified CPU has passed through a quiescent
  * state by virtue of being in or having passed through an dynticks
  * idle state since the last call to dyntick_save_progress_counter()
@@ -812,16 +825,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 	 */
 	rcu_kick_nohz_cpu(rdp->cpu);
 
+	/*
+	 * Alternatively, the CPU might be running in the kernel
+	 * for an extended period of time without a quiescent state.
+	 * Attempt to force the CPU through the scheduler to gain the
+	 * needed quiescent state, but only if the grace period has gone
+	 * on for an uncommonly long time.  If there are many stuck CPUs,
+	 * we will beat on the first one until it gets unstuck, then move
+	 * to the next.  Only do this for the primary flavor of RCU.
+	 */
+	if (rdp->rsp == rcu_state &&
+	    ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+		rdp->rsp->jiffies_resched += 5;
+		resched_cpu(rdp->cpu);
+	}
+
 	return 0;
 }
 
 static void record_gp_stall_check_time(struct rcu_state *rsp)
 {
-	unsigned long j = ACCESS_ONCE(jiffies);
+	unsigned long j = jiffies;
+	unsigned long j1;
 
 	rsp->gp_start = j;
 	smp_wmb(); /* Record start time before stall time. */
-	rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
+	j1 = rcu_jiffies_till_stall_check();
+	rsp->jiffies_stall = j + j1;
+	rsp->jiffies_resched = j + j1 / 2;
 }
 
 /*
@@ -972,7 +1003,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
 		return;
-	j = ACCESS_ONCE(jiffies);
+	j = jiffies;
 
 	/*
 	 * Lots of memory barriers to reject false positives.
@@ -1133,8 +1164,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * hold it, acquire the root rcu_node structure's lock in order to
 	 * start one (if needed).
 	 */
-	if (rnp != rnp_root)
+	if (rnp != rnp_root) {
 		raw_spin_lock(&rnp_root->lock);
+		smp_mb__after_unlock_lock();
+	}
 
 	/*
 	 * Get a new grace-period number.  If there really is no grace
@@ -1354,6 +1387,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 		local_irq_restore(flags);
 		return;
 	}
+	smp_mb__after_unlock_lock();
 	__note_gp_changes(rsp, rnp, rdp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
@@ -1368,6 +1402,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 
 	rcu_bind_gp_kthread();
 	raw_spin_lock_irq(&rnp->lock);
+	smp_mb__after_unlock_lock();
 	if (rsp->gp_flags == 0) {
 		/* Spurious wakeup, tell caller to go back to sleep. */
 		raw_spin_unlock_irq(&rnp->lock);
@@ -1386,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
 
 	/* Advance to a new grace period and initialize state. */
 	record_gp_stall_check_time(rsp);
-	smp_wmb(); /* Record GP times before starting GP. */
-	rsp->gpnum++;
+	/* Record GP times before starting GP, hence smp_store_release(). */
+	smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
 	trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
 	raw_spin_unlock_irq(&rnp->lock);
 
 	/* Exclude any concurrent CPU-hotplug operations. */
 	mutex_lock(&rsp->onoff_mutex);
+	smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
 
 	/*
 	 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1409,6 +1445,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	 */
 	rcu_for_each_node_breadth_first(rsp, rnp) {
 		raw_spin_lock_irq(&rnp->lock);
+		smp_mb__after_unlock_lock();
 		rdp = this_cpu_ptr(rsp->rda);
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
@@ -1463,6 +1500,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 	/* Clear flag to prevent immediate re-entry. */
 	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 		raw_spin_lock_irq(&rnp->lock);
+		smp_mb__after_unlock_lock();
 		rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
 		raw_spin_unlock_irq(&rnp->lock);
 	}
@@ -1480,6 +1518,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	raw_spin_lock_irq(&rnp->lock);
+	smp_mb__after_unlock_lock();
 	gp_duration = jiffies - rsp->gp_start;
 	if (gp_duration > rsp->gp_max)
 		rsp->gp_max = gp_duration;
@@ -1505,19 +1544,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	 */
 	rcu_for_each_node_breadth_first(rsp, rnp) {
 		raw_spin_lock_irq(&rnp->lock);
+		smp_mb__after_unlock_lock();
 		ACCESS_ONCE(rnp->completed) = rsp->gpnum;
 		rdp = this_cpu_ptr(rsp->rda);
 		if (rnp == rdp->mynode)
 			__note_gp_changes(rsp, rnp, rdp);
+		/* smp_mb() provided by prior unlock-lock pair. */
 		nocb += rcu_future_gp_cleanup(rsp, rnp);
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched();
 	}
 	rnp = rcu_get_root(rsp);
 	raw_spin_lock_irq(&rnp->lock);
+	smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
 	rcu_nocb_gp_set(rnp, nocb);
 
-	rsp->completed = rsp->gpnum; /* Declare grace period done. */
+	/* Declare grace period done. */
+	ACCESS_ONCE(rsp->completed) = rsp->gpnum;
 	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
 	rsp->fqs_state = RCU_GP_IDLE;
 	rdp = this_cpu_ptr(rsp->rda);
@@ -1553,6 +1596,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 			wait_event_interruptible(rsp->gp_wq,
 						 ACCESS_ONCE(rsp->gp_flags) &
 						 RCU_GP_FLAG_INIT);
+			/* Locking provides needed memory barrier. */
 			if (rcu_gp_init(rsp))
 				break;
 			cond_resched();
@@ -1582,6 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 					(!ACCESS_ONCE(rnp->qsmask) &&
 					 !rcu_preempt_blocked_readers_cgp(rnp)),
 					j);
+			/* Locking provides needed memory barriers. */
 			/* If grace period done, leave loop. */
 			if (!ACCESS_ONCE(rnp->qsmask) &&
 			    !rcu_preempt_blocked_readers_cgp(rnp))
@@ -1749,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 		rnp_c = rnp;
 		rnp = rnp->parent;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
 		WARN_ON_ONCE(rnp_c->qsmask);
 	}
 
@@ -1778,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 
 	rnp = rdp->mynode;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();
 	if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
 	    rnp->completed == rnp->gpnum) {
 
@@ -1901,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
  * Adopt the RCU callbacks from the specified rcu_state structure's
  * orphanage.  The caller must hold the ->orphan_lock.
  */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
 {
 	int i;
 	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 
 	/* No-CBs CPUs are handled specially. */
-	if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
+	if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
 		return;
 
 	/* Do the accounting first. */
@@ -1986,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 
 	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
 	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
-	rcu_adopt_orphan_cbs(rsp);
+	rcu_adopt_orphan_cbs(rsp, flags);
 
 	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
 	mask = rdp->grpmask;	/* rnp->grplo is constant. */
 	do {
 		raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
+		smp_mb__after_unlock_lock();
 		rnp->qsmaskinit &= ~mask;
 		if (rnp->qsmaskinit != 0) {
 			if (rnp != rdp->mynode)
@@ -2202,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		cond_resched();
 		mask = 0;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
 		if (!rcu_gp_in_progress(rsp)) {
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 			return;
@@ -2231,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
 	rnp = rcu_get_root(rsp);
 	if (rnp->qsmask == 0) {
 		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
 		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
 	}
 }
@@ -2254,7 +2304,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
 		if (rnp_old != NULL)
 			raw_spin_unlock(&rnp_old->fqslock);
 		if (ret) {
-			rsp->n_force_qs_lh++;
+			ACCESS_ONCE(rsp->n_force_qs_lh)++;
 			return;
 		}
 		rnp_old = rnp;
@@ -2263,9 +2313,10 @@ static void force_quiescent_state(struct rcu_state *rsp)
 
 	/* Reached the root of the rcu_node tree, acquire lock. */
 	raw_spin_lock_irqsave(&rnp_old->lock, flags);
+	smp_mb__after_unlock_lock();
 	raw_spin_unlock(&rnp_old->fqslock);
 	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
-		rsp->n_force_qs_lh++;
+		ACCESS_ONCE(rsp->n_force_qs_lh)++;
 		raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 		return;  /* Someone beat us to it. */
 	}
@@ -2303,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp)
 	/* If there are callbacks ready, invoke them. */
 	if (cpu_has_callbacks_ready_to_invoke(rdp))
 		invoke_rcu_callbacks(rsp, rdp);
+
+	/* Do any needed deferred wakeups of rcuo kthreads. */
+	do_nocb_deferred_wakeup(rdp);
 }
 
 /*
@@ -2378,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 			struct rcu_node *rnp_root = rcu_get_root(rsp);
 
 			raw_spin_lock(&rnp_root->lock);
+			smp_mb__after_unlock_lock();
 			rcu_start_gp(rsp);
 			raw_spin_unlock(&rnp_root->lock);
 		} else {
@@ -2437,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 
 		if (cpu != -1)
 			rdp = per_cpu_ptr(rsp->rda, cpu);
-		offline = !__call_rcu_nocb(rdp, head, lazy);
+		offline = !__call_rcu_nocb(rdp, head, lazy, flags);
 		WARN_ON_ONCE(offline);
 		/* _call_rcu() is illegal on offline CPU; leak the callback. */
 		local_irq_restore(flags);
@@ -2584,6 +2639,58 @@ void synchronize_rcu_bh(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
+/**
+ * get_state_synchronize_rcu - Snapshot current RCU state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_rcu(void)
+{
+	/*
+	 * Any prior manipulation of RCU-protected data must happen
+	 * before the load from ->gpnum.
+	 */
+	smp_mb();  /* ^^^ */
+
+	/*
+	 * Make sure this load happens before the purportedly
+	 * time-consuming work between get_state_synchronize_rcu()
+	 * and cond_synchronize_rcu().
+	 */
+	return smp_load_acquire(&rcu_state->gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call to
+ * get_state_synchronize_rcu(), just return.  Otherwise, invoke
+ * synchronize_rcu() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.  But
+ * counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_rcu(unsigned long oldstate)
+{
+	unsigned long newstate;
+
+	/*
+	 * Ensure that this load happens before any RCU-destructive
+	 * actions the caller might carry out after we return.
+	 */
+	newstate = smp_load_acquire(&rcu_state->completed);
+	if (ULONG_CMP_GE(oldstate, newstate))
+		synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
 	/*
@@ -2757,6 +2864,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	/* Check for CPU stalls, if enabled. */
 	check_cpu_stall(rsp, rdp);
 
+	/* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
+	if (rcu_nohz_full_cpu(rsp))
+		return 0;
+
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
 	if (rcu_scheduler_fully_active &&
 	    rdp->qs_pending && !rdp->passed_quiesce) {
@@ -2790,6 +2901,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 		return 1;
 	}
 
+	/* Does this CPU need a deferred NOCB wakeup? */
+	if (rcu_nocb_need_deferred_wakeup(rdp)) {
+		rdp->n_rp_nocb_defer_wakeup++;
+		return 1;
+	}
+
 	/* nothing to do */
 	rdp->n_rp_need_nothing++;
 	return 0;
@@ -2815,7 +2932,7 @@ static int rcu_pending(int cpu)
 * non-NULL, store an indication of whether all callbacks are lazy.
 * (If there are no callbacks, all of them are deemed to be lazy.)
 */
-static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
 {
 	bool al = true;
 	bool hc = false;
@@ -3214,9 +3331,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
 	int i;
 
-	for (i = rcu_num_lvls - 1; i > 0; i--)
+	rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+	for (i = rcu_num_lvls - 2; i >= 0; i--)
 		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-	rsp->levelspread[0] = rcu_fanout_leaf;
 }
 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -3346,6 +3463,8 @@ static void __init rcu_init_geometry(void)
 	if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
 	    nr_cpu_ids == NR_CPUS)
 		return;
+	pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+		rcu_fanout_leaf, nr_cpu_ids);
 
 	/*
 	 * Compute number of nodes that can be handled an rcu_node tree
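
Usage note (not part of the diff): the hunk at @@ -2584,6 +2639,58 @@ introduces the cookie-based get_state_synchronize_rcu()/cond_synchronize_rcu() interface. The sketch below is illustrative only; example_obj, example_mutex, example_ptr, and gp_cookie_example() are made-up names, msleep() merely stands in for the "purportedly time-consuming work" mentioned in the kernel-doc comment, and it assumes the matching declarations from the companion header change (in mainline they live in include/linux/rcutree.h, pulled in via linux/rcupdate.h). The idea is to snapshot grace-period state right after unpublishing the old object, let unrelated work overlap the grace period, and then block only if a full grace period has not already elapsed.

#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_obj {
	int data;
};

static DEFINE_MUTEX(example_mutex);		/* Serializes updaters. */
static struct example_obj __rcu *example_ptr;	/* RCU-protected pointer. */

/* Hypothetical updater: publish new_obj, overlap other work with the GP. */
static void gp_cookie_example(struct example_obj *new_obj)
{
	struct example_obj *old_obj;
	unsigned long gp_snap;

	/* Unpublish the old object and publish the new one. */
	mutex_lock(&example_mutex);
	old_obj = rcu_dereference_protected(example_ptr,
					    lockdep_is_held(&example_mutex));
	rcu_assign_pointer(example_ptr, new_obj);
	mutex_unlock(&example_mutex);

	/* Snapshot RCU state as soon as the old object is unreachable. */
	gp_snap = get_state_synchronize_rcu();

	/* Stand-in for unrelated, time-consuming work. */
	msleep(10);

	/*
	 * Returns immediately if a full grace period has already elapsed
	 * since the snapshot; otherwise falls back to synchronize_rcu().
	 */
	cond_synchronize_rcu(gp_snap);
	kfree(old_obj);		/* kfree(NULL) is a no-op on the first update. */
}

When no such work separates the update from the reclamation, plain synchronize_rcu() or kfree_rcu() remains the simpler choice; the cookie pair only pays off when the grace period can overlap other processing.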