diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-03 11:34:53 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-03 11:34:53 -0700 |
commit | 330e9e46253cbfab178450c976aa90ef0f3ae940 (patch) | |
tree | 4a9875d264a477449a9c9a0ff23937550d1c3234 /kernel | |
parent | e94693f797400b9f424c69e1da9381e4846762fe (diff) | |
parent | 567b64aaefc4ef9ae3af124ae0b13dc13a6804a8 (diff) | |
download | linux-330e9e46253cbfab178450c976aa90ef0f3ae940.tar.bz2 |
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
"The sole purpose of these changes is to shrink and simplify the RCU
code base, which has suffered from creeping bloat over the past couple
of years. The end result is a net removal of ~2700 lines of code:
79 files changed, 1496 insertions(+), 4211 deletions(-)
Plus there's a marked reduction in the Kconfig space complexity as
well, here's the number of matches on 'grep RCU' in the .config:
before after
x86-defconfig 17 15
x86-allmodconfig 33 20"
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (86 commits)
rcu: Remove RCU CPU stall warnings from Tiny RCU
rcu: Remove event tracing from Tiny RCU
rcu: Move RCU debug Kconfig options to kernel/rcu
rcu: Move RCU non-debug Kconfig options to kernel/rcu
rcu: Eliminate NOCBs CPU-state Kconfig options
rcu: Remove debugfs tracing
srcu: Remove Classic SRCU
srcu: Fix rcutorture-statistics typo
rcu: Remove SPARSE_RCU_POINTER Kconfig option
rcu: Remove the now-obsolete PROVE_RCU_REPEATEDLY Kconfig option
rcu: Remove typecheck() from RCU locking wrapper functions
rcu: Remove #ifdef moving rcu_end_inkernel_boot from rcupdate.h
rcu: Remove nohz_full full-system-idle state machine
rcu: Remove the RCU_KTHREAD_PRIO Kconfig option
rcu: Remove *_SLOW_* Kconfig options
srcu: Use rnp->lock wrappers to replace explicit memory barriers
rcu: Move rnp->lock wrappers for SRCU use
rcu: Convert rnp->lock wrappers to macros for SRCU use
rcu: Refactor #includes from include/linux/rcupdate.h
bcm47xx: Fix build regression
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/locking/lockdep.c | 176 | ||||
-rw-r--r-- | kernel/rcu/Kconfig | 242 | ||||
-rw-r--r-- | kernel/rcu/Kconfig.debug | 82 | ||||
-rw-r--r-- | kernel/rcu/Makefile | 2 | ||||
-rw-r--r-- | kernel/rcu/rcu.h | 277 | ||||
-rw-r--r-- | kernel/rcu/rcuperf.c | 129 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 21 | ||||
-rw-r--r-- | kernel/rcu/srcu.c | 661 | ||||
-rw-r--r-- | kernel/rcu/srcutiny.c | 86 | ||||
-rw-r--r-- | kernel/rcu/srcutree.c | 187 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 54 | ||||
-rw-r--r-- | kernel/rcu/tiny_plugin.h | 123 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 195 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 109 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 2 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 573 | ||||
-rw-r--r-- | kernel/rcu/tree_trace.c | 494 | ||||
-rw-r--r-- | kernel/rcu/update.c | 77 | ||||
-rw-r--r-- | kernel/sched/core.c | 8 | ||||
-rw-r--r-- | kernel/time/Kconfig | 50 |
20 files changed, 1248 insertions, 2300 deletions
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c0e31bfee25c..7d2499bec5fe 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("======================================================\n"); pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); pr_warn("------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); + pr_warn("\nwhich lock already depends on the new lock.\n\n"); + pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); @@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=====================================================\n"); pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); pr_warn("-----------------------------------------------------\n"); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, @@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr, curr->softirqs_enabled); print_lock(next); - printk("\nand this task is already holding:\n"); + pr_warn("\nand this task is already holding:\n"); print_lock(prev); - printk("which would create a new lock dependency:\n"); + pr_warn("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(KERN_CONT " ->"); + pr_cont(" ->"); print_lock_name(hlock_class(next)); - printk(KERN_CONT "\n"); + pr_cont("\n"); - printk("\nbut this new dependency connects a %s-irq-safe lock:\n", + pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); print_lock_name(backwards_entry->class); - printk("\n... which became %s-irq-safe at:\n", irqclass); + pr_warn("\n... which became %s-irq-safe at:\n", irqclass); print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); - printk("\nto a %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); - printk("\n... which became %s-irq-unsafe at:\n", irqclass); - printk("..."); + pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); + pr_warn("..."); print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); - printk("\nother info that might help us debug this:\n\n"); + pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, hlock_class(prev), hlock_class(next)); lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); + pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); - printk("\nthe dependencies between the lock to be acquired"); - printk(" and %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nthe dependencies between the lock to be acquired"); + pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) return 0; print_shortest_lock_dependencies(forwards_entry, next_root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("============================================\n"); pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); pr_warn("--------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { - printk("\n"); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); pr_warn("----------------------------\n"); - printk("%s/%d: ", current->comm, task_pid_nr(current)); - printk("Hash chain already cached but the contents don't match!\n"); + pr_warn("%s/%d: ", current->comm, task_pid_nr(current)); + pr_warn("Hash chain already cached but the contents don't match!\n"); - printk("Held locks:"); + pr_warn("Held locks:"); print_chain_keys_held_locks(curr, hlock_next); - printk("Locks in cached chain:"); + pr_warn("Locks in cached chain:"); print_chain_keys_chain(chain); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } #endif @@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("================================\n"); pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); pr_warn("--------------------------------\n"); - printk("inconsistent {%s} -> {%s} usage.\n", + pr_warn("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, @@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, trace_softirqs_enabled(curr)); print_lock(this); - printk("{%s} state was registered at:\n", usage_str[prev_bit]); + pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_usage_bug_scenario(this); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("========================================================\n"); pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); pr_warn("--------------------------------------------------------\n"); - printk("%s/%d just changed the state of lock:\n", + pr_warn("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); + pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); + pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other->class); - printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); /* Find a middle lock (if one exists) */ depth = get_lock_depth(other); do { if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); + pr_warn("lockdep:%s bad path found in chain graph\n", __func__); break; } middle = entry; @@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); - printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) return 0; print_shortest_lock_dependencies(other, root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("==================================\n"); pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); pr_warn("----------------------------------\n"); - printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); - printk("\nbut this task is not holding:\n"); - printk("%s\n", hlock->nest_lock->name); + pr_warn("\nbut this task is not holding:\n"); + pr_warn("%s\n", hlock->nest_lock->name); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=====================================\n"); pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); pr_warn("-------------------------------------\n"); - printk("%s/%d is trying to release lock (", + pr_warn("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no more locks to release!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no more locks to release!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=================================\n"); pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); pr_warn("---------------------------------\n"); - printk("%s/%d is trying to contend lock (", + pr_warn("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no locks held!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no locks held!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; - printk("\n"); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; - printk("\n"); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4402,10 +4402,10 @@ void debug_show_all_locks(void) int unlock = 1; if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); + pr_warn("INFO: lockdep is turned off.\n"); return; } - printk("\nShowing all locks held in the system:\n"); + pr_warn("\nShowing all locks held in the system:\n"); /* * Here we try to get the tasklist_lock as hard as possible, @@ -4416,18 +4416,18 @@ void debug_show_all_locks(void) retry: if (!read_trylock(&tasklist_lock)) { if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); + pr_warn("hm, tasklist_lock locked, retrying... "); if (count) { count--; - printk(" #%d", 10-count); + pr_cont(" #%d", 10-count); mdelay(200); goto retry; } - printk(" ignoring it.\n"); + pr_cont(" ignoring it.\n"); unlock = 0; } else { if (count != 10) - printk(KERN_CONT " locked it.\n"); + pr_cont(" locked it.\n"); } do_each_thread(g, p) { @@ -4445,7 +4445,7 @@ retry: unlock = 1; } while_each_thread(g, p); - printk("\n"); + pr_warn("\n"); pr_warn("=============================================\n\n"); if (unlock) @@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; - printk("\n"); + pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); pr_warn("------------------------------------------------\n"); - printk("%s/%d is leaving the kernel with locks still held!\n", + pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } @@ -4490,19 +4490,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; -#ifndef CONFIG_PROVE_RCU_REPEATEDLY - if (!debug_locks_off()) - return; -#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ - printk("\n"); + pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); pr_warn("-----------------------------\n"); - printk("%s:%d %s!\n", file, line, s); - printk("\nother info that might help us debug this:\n\n"); - printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("%s:%d %s!\n", file, line, s); + pr_warn("\nother info that might help us debug this:\n\n"); + pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() @@ -4529,10 +4525,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) * rcu_read_lock_bh() and so on from extended quiescent states. */ if (!rcu_is_watching()) - printk("RCU used illegally from extended quiescent state!\n"); + pr_warn("RCU used illegally from extended quiescent state!\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig new file mode 100644 index 000000000000..be90c945063f --- /dev/null +++ b/kernel/rcu/Kconfig @@ -0,0 +1,242 @@ +# +# RCU-related configuration options +# + +menu "RCU Subsystem" + +config TREE_RCU + bool + default y if !PREEMPT && SMP + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. It also scales down nicely to + smaller systems. + +config PREEMPT_RCU + bool + default y if PREEMPT + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. It also scales down nicely to + smaller systems. + + Select this option if you are unsure. + +config TINY_RCU + bool + default y if !PREEMPT && !SMP + help + This option selects the RCU implementation that is + designed for UP systems from which real-time response + is not required. This option greatly reduces the + memory footprint of RCU. + +config RCU_EXPERT + bool "Make expert-level adjustments to RCU configuration" + default n + help + This option needs to be enabled if you wish to make + expert-level adjustments to RCU configuration. By default, + no such adjustments can be made, which has the often-beneficial + side-effect of preventing "make oldconfig" from asking you all + sorts of detailed questions about how you would like numerous + obscure RCU options to be set up. + + Say Y if you need to make expert-level adjustments to RCU. + + Say N if you are unsure. + +config SRCU + bool + help + This option selects the sleepable version of RCU. This version + permits arbitrary sleeping or blocking within RCU read-side critical + sections. + +config TINY_SRCU + bool + default y if SRCU && TINY_RCU + help + This option selects the single-CPU non-preemptible version of SRCU. + +config TREE_SRCU + bool + default y if SRCU && !TINY_RCU + help + This option selects the full-fledged version of SRCU. + +config TASKS_RCU + bool + default n + select SRCU + help + This option enables a task-based RCU implementation that uses + only voluntary context switch (not preemption!), idle, and + user-mode execution as quiescent states. + +config RCU_STALL_COMMON + def_bool ( TREE_RCU || PREEMPT_RCU ) + help + This option enables RCU CPU stall code that is common between + the TINY and TREE variants of RCU. The purpose is to allow + the tiny variants to disable RCU CPU stall warnings, while + making these warnings mandatory for the tree variants. + +config RCU_NEED_SEGCBLIST + def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_LEAF + int "Tree-based hierarchical RCU leaf-level fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 16 + help + This option controls the leaf-level fanout of hierarchical + implementations of RCU, and allows trading off cache misses + against lock contention. Systems that synchronize their + scheduling-clock interrupts for energy-efficiency reasons will + want the default because the smaller leaf-level fanout keeps + lock contention levels acceptably low. Very large systems + (hundreds or thousands of CPUs) will instead want to set this + value to the maximum value possible in order to reduce the + number of cache misses incurred during RCU's grace-period + initialization. These systems tend to run CPU-bound, and thus + are not helped by synchronized interrupts, and thus tend to + skew them, which reduces lock contention enough that large + leaf-level fanouts work well. That said, setting leaf-level + fanout to a large number will likely cause problematic + lock contention on the leaf-level rcu_node structures unless + you boot with the skew_tick kernel parameter. + + Select a specific number if testing RCU itself. + + Select the maximum permissible value for large systems, but + please understand that you may also need to set the skew_tick + kernel boot parameter to avoid contention on the rcu_node + structure's locks. + + Take the default if unsure. + +config RCU_FAST_NO_HZ + bool "Accelerate last non-dyntick-idle CPU's grace periods" + depends on NO_HZ_COMMON && SMP && RCU_EXPERT + default n + help + This option permits CPUs to enter dynticks-idle state even if + they have RCU callbacks queued, and prevents RCU from waking + these CPUs up more than roughly once every four jiffies (by + default, you can adjust this using the rcutree.rcu_idle_gp_delay + parameter), thus improving energy efficiency. On the other + hand, this option increases the duration of RCU grace periods, + for example, slowing down synchronize_rcu(). + + Say Y if energy efficiency is critically important, and you + don't care about increased grace-period durations. + + Say N if you are unsure. + +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +config RCU_NOCB_CPU + bool "Offload RCU callback processing from boot-selected CPUs" + depends on TREE_RCU || PREEMPT_RCU + depends on RCU_EXPERT || NO_HZ_FULL + default n + help + Use this option to reduce OS jitter for aggressive HPC or + real-time workloads. It can also be used to offload RCU + callback invocation to energy-efficient CPUs in battery-powered + asymmetric multiprocessors. + + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and + "s" for RCU-sched. Nothing prevents this kthread from running + on the specified CPUs, but (1) the kthreads may be preempted + between each callback, and (2) affinity or cgroups can be used + to force the kthreads to run on whatever set of CPUs is desired. + + Say Y here if you want to help to debug reduced OS jitter. + Say N here if you are unsure. + +endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug new file mode 100644 index 000000000000..0ec7d1d33a14 --- /dev/null +++ b/kernel/rcu/Kconfig.debug @@ -0,0 +1,82 @@ +# +# RCU-related debugging configuration options +# + +menu "RCU Debugging" + +config PROVE_RCU + def_bool PROVE_LOCKING + +config TORTURE_TEST + tristate + default n + +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_TORTURE_TEST + tristate "torture tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs torture tests + on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU torture tests to be built into + the kernel. + Say M if you want the RCU torture tests to build as a module. + Say N if you are unsure. + +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_STALL_COMMON + range 3 300 + default 21 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_TRACE + bool "Enable tracing for RCU" + depends on DEBUG_KERNEL + default y if TREE_RCU + select TRACE_CLOCK + help + This option enables additional tracepoints for ftrace-style + event tracing. + + Say Y here if you want to enable RCU tracing + Say N if you are unsure. + +config RCU_EQS_DEBUG + bool "Provide debugging asserts for adding NO_HZ support to an arch" + depends on DEBUG_KERNEL + help + This option provides consistency checks in RCU's handling of + NO_HZ. These checks have proven quite helpful in detecting + bugs in arch-specific NO_HZ code. + + Say N here if you need ultimate kernel/user switch latencies + Say Y if you are unsure + +endmenu # "RCU Debugging" diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 23803c7d5180..13c0fc852767 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,13 +3,11 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_CLASSIC_SRCU) += srcu.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o -obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 73e16ec4054b..808b8c85f626 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + void rcu_early_boot_tests(void); void rcu_test_sync_prims(void); @@ -291,6 +303,271 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) cpu <= rnp->grphi; \ cpu = cpumask_next((cpu), cpu_possible_mask)) +/* + * Wrappers for the rcu_node::lock acquire and release. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. + */ +#define raw_spin_lock_rcu_node(p) \ +do { \ + raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irq_rcu_node(p) \ +do { \ + raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irq_rcu_node(p) \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + +#define raw_spin_trylock_rcu_node(p) \ +({ \ + bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#ifdef CONFIG_TINY_RCU +/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ +{ + return true; +} +static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ +{ + return false; +} + +static inline void rcu_expedite_gp(void) +{ +} + +static inline void rcu_unexpedite_gp(void) +{ +} +#else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_normal(void); /* Internal RCU use. */ +bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +void rcu_expedite_gp(void); +void rcu_unexpedite_gp(void); +void rcupdate_announce_bootup_oddness(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +#define RCU_SCHEDULER_INACTIVE 0 +#define RCU_SCHEDULER_INIT 1 +#define RCU_SCHEDULER_RUNNING 2 + +#ifdef CONFIG_TINY_RCU +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } +#else /* #ifdef CONFIG_TINY_RCU */ +void rcu_request_urgent_qs_task(struct task_struct *t); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +enum rcutorture_type { + RCU_FLAVOR, + RCU_BH_FLAVOR, + RCU_SCHED_FLAVOR, + RCU_TASKS_FLAVOR, + SRCU_FLAVOR, + INVALID_RCU_FLAVOR +}; + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed); +void rcutorture_record_test_transition(void); +void rcutorture_record_progress(unsigned long vernum); +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, + int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + *flags = 0; + *gpnum = 0; + *completed = 0; +} +static inline void rcutorture_record_test_transition(void) +{ +} +static inline void rcutorture_record_progress(unsigned long vernum) +{ +} +#ifdef CONFIG_RCU_TRACE +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) +#endif +#endif + +#ifdef CONFIG_TINY_SRCU + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_idx; + *gpnum = *completed; +} + +#elif defined(CONFIG_TREE_SRCU) + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + +#endif + +#ifdef CONFIG_TINY_RCU + +/* + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. + */ +static inline unsigned long rcu_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods completed. + */ +static inline unsigned long rcu_batches_completed_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods completed. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +static inline void rcu_force_quiescent_state(void) +{ +} + +static inline void rcu_bh_force_quiescent_state(void) +{ +} + +static inline void rcu_sched_force_quiescent_state(void) +{ +} + +static inline void show_rcu_gp_kthreads(void) +{ +} + +#else /* #ifdef CONFIG_TINY_RCU */ +extern unsigned long rcutorture_testseq; +extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); +unsigned long srcu_batches_completed(struct srcu_struct *sp); +void show_rcu_gp_kthreads(void); +void rcu_force_quiescent_state(void); +void rcu_bh_force_quiescent_state(void); +void rcu_sched_force_quiescent_state(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +#ifdef CONFIG_RCU_NOCB_CPU +bool rcu_is_nocb_cpu(int cpu); +#else +static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a86fb47e4a..3cc18110b612 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -48,6 +48,8 @@ #include <linux/torture.h> #include <linux/vmalloc.h> +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); @@ -59,12 +61,16 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nreaders, 0, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, shutdown, !IS_ENABLED(MODULE), + "Shutdown at end of performance tests."); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -86,13 +92,16 @@ static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; static unsigned long b_rcu_perf_writer_started; static unsigned long b_rcu_perf_writer_finished; +static DEFINE_PER_CPU(atomic_t, n_async_inflight); static int rcu_perf_writer_state; #define RTWS_INIT 0 -#define RTWS_EXP_SYNC 1 -#define RTWS_SYNC 2 -#define RTWS_IDLE 2 -#define RTWS_STOPPING 3 +#define RTWS_ASYNC 1 +#define RTWS_BARRIER 2 +#define RTWS_EXP_SYNC 3 +#define RTWS_SYNC 4 +#define RTWS_IDLE 5 +#define RTWS_STOPPING 6 #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -114,6 +123,8 @@ struct rcu_perf_ops { unsigned long (*started)(void); unsigned long (*completed)(void); unsigned long (*exp_completed)(void); + void (*async)(struct rcu_head *head, rcu_callback_t func); + void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); const char *name; @@ -153,6 +164,8 @@ static struct rcu_perf_ops rcu_ops = { .started = rcu_batches_started, .completed = rcu_batches_completed, .exp_completed = rcu_exp_batches_completed, + .async = call_rcu, + .gp_barrier = rcu_barrier, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .name = "rcu" @@ -181,6 +194,8 @@ static struct rcu_perf_ops rcu_bh_ops = { .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_bh, + .gp_barrier = rcu_barrier_bh, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, .name = "rcu_bh" @@ -208,6 +223,16 @@ static unsigned long srcu_perf_completed(void) return srcu_batches_completed(srcu_ctlp); } +static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + call_srcu(srcu_ctlp, head, func); +} + +static void srcu_rcu_barrier(void) +{ + srcu_barrier(srcu_ctlp); +} + static void srcu_perf_synchronize(void) { synchronize_srcu(srcu_ctlp); @@ -226,11 +251,42 @@ static struct rcu_perf_ops srcu_ops = { .started = NULL, .completed = srcu_perf_completed, .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, .sync = srcu_perf_synchronize, .exp_sync = srcu_perf_synchronize_expedited, .name = "srcu" }; +static struct srcu_struct srcud; + +static void srcu_sync_perf_init(void) +{ + srcu_ctlp = &srcud; + init_srcu_struct(srcu_ctlp); +} + +static void srcu_sync_perf_cleanup(void) +{ + cleanup_srcu_struct(srcu_ctlp); +} + +static struct rcu_perf_ops srcud_ops = { + .ptype = SRCU_FLAVOR, + .init = srcu_sync_perf_init, + .cleanup = srcu_sync_perf_cleanup, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcud" +}; + /* * Definitions for sched perf testing. */ @@ -254,6 +310,8 @@ static struct rcu_perf_ops sched_ops = { .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_sched, + .gp_barrier = rcu_barrier_sched, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, .name = "sched" @@ -281,6 +339,8 @@ static struct rcu_perf_ops tasks_ops = { .readunlock = tasks_perf_read_unlock, .started = rcu_no_completed, .completed = rcu_no_completed, + .async = call_rcu_tasks, + .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .name = "tasks" @@ -344,6 +404,15 @@ rcu_perf_reader(void *arg) } /* + * Callback function for asynchronous grace periods from rcu_perf_writer(). + */ +static void rcu_perf_async_cb(struct rcu_head *rhp) +{ + atomic_dec(this_cpu_ptr(&n_async_inflight)); + kfree(rhp); +} + +/* * RCU perf writer kthread. Repeatedly does a grace period. */ static int @@ -352,6 +421,7 @@ rcu_perf_writer(void *arg) int i = 0; int i_max; long me = (long)arg; + struct rcu_head *rhp = NULL; struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; @@ -380,9 +450,27 @@ rcu_perf_writer(void *arg) } do { + if (writer_holdoff) + udelay(writer_holdoff); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_exp) { + if (gp_async) { +retry: + if (!rhp) + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + rcu_perf_writer_state = RTWS_ASYNC; + atomic_inc(this_cpu_ptr(&n_async_inflight)); + cur_ops->async(rhp, rcu_perf_async_cb); + rhp = NULL; + } else if (!kthread_should_stop()) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + goto retry; + } else { + kfree(rhp); /* Because we are stopping. */ + } + } else if (gp_exp) { rcu_perf_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); } else { @@ -429,6 +517,10 @@ rcu_perf_writer(void *arg) i++; rcu_perf_wait_shutdown(); } while (!torture_must_stop()); + if (gp_async) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + } rcu_perf_writer_state = RTWS_STOPPING; writer_n_durations[me] = i_max; torture_kthread_stopping("rcu_perf_writer"); @@ -452,6 +544,17 @@ rcu_perf_cleanup(void) u64 *wdp; u64 *wdpp; + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); + if (torture_cleanup_begin()) return; @@ -554,7 +657,7 @@ rcu_perf_init(void) long i; int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, RCUPERF_TASKS_OPS }; @@ -624,16 +727,6 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } - if (rcu_gp_is_normal() && gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ae6e574d4cf5..b8f7f8ce8575 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,6 +52,8 @@ #include <linux/torture.h> #include <linux/vmalloc.h> +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); @@ -562,31 +564,19 @@ static void srcu_torture_stats(void) int __maybe_unused cpu; int idx; -#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) #ifdef CONFIG_TREE_SRCU idx = srcu_ctlp->srcu_idx & 0x1; -#else /* #ifdef CONFIG_TREE_SRCU */ - idx = srcu_ctlp->completed & 0x1; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; -#ifdef CONFIG_TREE_SRCU struct srcu_data *counts; counts = per_cpu_ptr(srcu_ctlp->sda, cpu); u0 = counts->srcu_unlock_count[!idx]; u1 = counts->srcu_unlock_count[idx]; -#else /* #ifdef CONFIG_TREE_SRCU */ - struct srcu_array *counts; - - counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - u0 = counts->unlock_count[!idx]; - u1 = counts->unlock_count[idx]; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ /* * Make sure that a lock is always counted if the corresponding @@ -594,13 +584,8 @@ static void srcu_torture_stats(void) */ smp_rmb(); -#ifdef CONFIG_TREE_SRCU l0 = counts->srcu_lock_count[!idx]; l1 = counts->srcu_lock_count[idx]; -#else /* #ifdef CONFIG_TREE_SRCU */ - l0 = counts->lock_count[!idx]; - l1 = counts->lock_count[idx]; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ c0 = l0 - u0; c1 = l1 - u1; @@ -609,7 +594,7 @@ static void srcu_torture_stats(void) pr_cont("\n"); #elif defined(CONFIG_TINY_SRCU) idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", torture_type, TORTURE_FLAG, idx, READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c deleted file mode 100644 index dea03614263f..000000000000 --- a/kernel/rcu/srcu.c +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (C) IBM Corporation, 2006 - * Copyright (C) Fujitsu, 2012 - * - * Author: Paul McKenney <paulmck@us.ibm.com> - * Lai Jiangshan <laijs@cn.fujitsu.com> - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ - -#include <linux/export.h> -#include <linux/mutex.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/rcupdate_wait.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/delay.h> -#include <linux/srcu.h> - -#include "rcu.h" - -/* - * Initialize an rcu_batch structure to empty. - */ -static inline void rcu_batch_init(struct rcu_batch *b) -{ - b->head = NULL; - b->tail = &b->head; -} - -/* - * Enqueue a callback onto the tail of the specified rcu_batch structure. - */ -static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) -{ - *b->tail = head; - b->tail = &head->next; -} - -/* - * Is the specified rcu_batch structure empty? - */ -static inline bool rcu_batch_empty(struct rcu_batch *b) -{ - return b->tail == &b->head; -} - -/* - * Remove the callback at the head of the specified rcu_batch structure - * and return a pointer to it, or return NULL if the structure is empty. - */ -static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) -{ - struct rcu_head *head; - - if (rcu_batch_empty(b)) - return NULL; - - head = b->head; - b->head = head->next; - if (b->tail == &head->next) - rcu_batch_init(b); - - return head; -} - -/* - * Move all callbacks from the rcu_batch structure specified by "from" to - * the structure specified by "to". - */ -static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) -{ - if (!rcu_batch_empty(from)) { - *to->tail = from->head; - to->tail = from->tail; - rcu_batch_init(from); - } -} - -static int init_srcu_struct_fields(struct srcu_struct *sp) -{ - sp->completed = 0; - spin_lock_init(&sp->queue_lock); - sp->running = false; - rcu_batch_init(&sp->batch_queue); - rcu_batch_init(&sp->batch_check0); - rcu_batch_init(&sp->batch_check1); - rcu_batch_init(&sp->batch_done); - INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -int __init_srcu_struct(struct srcu_struct *sp, const char *name, - struct lock_class_key *key) -{ - /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(__init_srcu_struct); - -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/** - * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. - * - * Must invoke this on a given srcu_struct before passing that srcu_struct - * to any other function. Each srcu_struct represents a separate domain - * of SRCU protection. - */ -int init_srcu_struct(struct srcu_struct *sp) -{ - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(init_srcu_struct); - -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/* - * Returns approximate total of the readers' ->lock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[idx]); - } - return sum; -} - -/* - * Returns approximate total of the readers' ->unlock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->unlock_count[idx]); - } - return sum; -} - -/* - * Return true if the number of pre-existing readers is determined to - * be zero. - */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) -{ - unsigned long unlocks; - - unlocks = srcu_readers_unlock_idx(sp, idx); - - /* - * Make sure that a lock is always counted if the corresponding unlock - * is counted. Needs to be a smp_mb() as the read side may contain a - * read from a variable that is written to before the synchronize_srcu() - * in the write side. In this case smp_mb()s A and B act like the store - * buffering pattern. - * - * This smp_mb() also pairs with smp_mb() C to prevent accesses after the - * synchronize_srcu() from being executed before the grace period ends. - */ - smp_mb(); /* A */ - - /* - * If the locks are the same as the unlocks, then there must have - * been no readers on this index at some time in between. This does not - * mean that there are no more readers, as one could have read the - * current index but not have incremented the lock counter yet. - * - * Possible bug: There is no guarantee that there haven't been ULONG_MAX - * increments of ->lock_count[] since the unlocks were counted, meaning - * that this could return true even if there are still active readers. - * Since there are no memory barriers around srcu_flip(), the CPU is not - * required to increment ->completed before running - * srcu_readers_unlock_idx(), which means that there could be an - * arbitrarily large number of critical sections that execute after - * srcu_readers_unlock_idx() but use the old value of ->completed. - */ - return srcu_readers_lock_idx(sp, idx) == unlocks; -} - -/** - * srcu_readers_active - returns true if there are readers. and false - * otherwise - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). - * - * Note that this is not an atomic primitive, and can therefore suffer - * severe errors when invoked on an active srcu_struct. That said, it - * can be useful as an error check at cleanup time. - */ -static bool srcu_readers_active(struct srcu_struct *sp) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[0]); - sum += READ_ONCE(cpuc->lock_count[1]); - sum -= READ_ONCE(cpuc->unlock_count[0]); - sum -= READ_ONCE(cpuc->unlock_count[1]); - } - return sum; -} - -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this only after you are finished using a given srcu_struct - * that was initialized via init_srcu_struct(). This code does some - * probabalistic checking, spotting late uses of srcu_read_lock(), - * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). - * If any such late uses are detected, the per-CPU memory associated with - * the srcu_struct is simply leaked and WARN_ON() is invoked. If the - * caller frees the srcu_struct itself, a use-after-free crash will likely - * ensue, but at least there will be a warning printed. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) -{ - if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; -} -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); - -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. - * Returns an index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->completed) & 0x1; - this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); - smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* - * Removes the count for the old reader from the appropriate per-CPU - * element of the srcu_struct. Note that this may well be a different - * CPU than that which was incremented by the corresponding srcu_read_lock(). - */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) -{ - smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); -} -EXPORT_SYMBOL_GPL(__srcu_read_unlock); - -/* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. - */ -#define SRCU_RETRY_CHECK_DELAY 5 -#define SYNCHRONIZE_SRCU_TRYCOUNT 2 -#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 - -/* - * @@@ Wait until all pre-existing readers complete. Such readers - * will have used the index specified by "idx". - * the caller should ensures the ->completed is not changed while checking - * and idx = (->completed & 1) ^ 1 - */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) -{ - for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) - return true; - if (--trycount <= 0) - return false; - udelay(SRCU_RETRY_CHECK_DELAY); - } -} - -/* - * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->(un)lock_count[] arrays. This allows - * us to wait for pre-existing readers in a starvation-free manner. - */ -static void srcu_flip(struct srcu_struct *sp) -{ - WRITE_ONCE(sp->completed, sp->completed + 1); - - /* - * Ensure that if the updater misses an __srcu_read_unlock() - * increment, that task's next __srcu_read_lock() will see the - * above counter update. Note that both this memory barrier - * and the one in srcu_readers_active_idx_check() provide the - * guarantee for __srcu_read_lock(). - */ - smp_mb(); /* D */ /* Pairs with C. */ -} - -/* - * Enqueue an SRCU callback on the specified srcu_struct structure, - * initiating grace-period processing if it is not already running. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing SRCU read-side critical section. On systems with - * more than one CPU, this means that when "func()" is invoked, each CPU - * is guaranteed to have executed a full memory barrier since the end of - * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing - * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() - * but before the beginning of that SRCU read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting SRCU callback function "func()", then both CPU A and CPU - * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". - * This guarantee applies even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - * - * Of course, these guarantees apply only for invocations of call_srcu(), - * srcu_read_lock(), and srcu_read_unlock() that are all passed the same - * srcu_struct structure. - */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - rcu_callback_t func) -{ - unsigned long flags; - - head->next = NULL; - head->func = func; - spin_lock_irqsave(&sp->queue_lock, flags); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_batch_queue(&sp->batch_queue, head); - if (!sp->running) { - sp->running = true; - queue_delayed_work(system_power_efficient_wq, &sp->work, 0); - } - spin_unlock_irqrestore(&sp->queue_lock, flags); -} -EXPORT_SYMBOL_GPL(call_srcu); - -static void srcu_advance_batches(struct srcu_struct *sp, int trycount); -static void srcu_reschedule(struct srcu_struct *sp); - -/* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). - */ -static void __synchronize_srcu(struct srcu_struct *sp, int trycount) -{ - struct rcu_synchronize rcu; - struct rcu_head *head = &rcu.head; - bool done = false; - - RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || - lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); - - might_sleep(); - init_completion(&rcu.completion); - - head->next = NULL; - head->func = wakeme_after_rcu; - spin_lock_irq(&sp->queue_lock); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (!sp->running) { - /* steal the processing owner */ - sp->running = true; - rcu_batch_queue(&sp->batch_check0, head); - spin_unlock_irq(&sp->queue_lock); - - srcu_advance_batches(sp, trycount); - if (!rcu_batch_empty(&sp->batch_done)) { - BUG_ON(sp->batch_done.head != head); - rcu_batch_dequeue(&sp->batch_done); - done = true; - } - /* give the processing owner to work_struct */ - srcu_reschedule(sp); - } else { - rcu_batch_queue(&sp->batch_queue, head); - spin_unlock_irq(&sp->queue_lock); - } - - if (!done) { - wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. */ - } - -} - -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Wait for the count to drain to zero of both indexes. To avoid the - * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->completed & 1) ^ 1) to drain to zero at first, - * and then flip the completed and wait for the count of the other index. - * - * Can block; must be called from process context. - * - * Note that it is illegal to call synchronize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section, - * as long as the resulting graph of srcu_structs is acyclic. - * - * There are memory-ordering constraints implied by synchronize_srcu(). - * On systems with more than one CPU, when synchronize_srcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last corresponding SRCU-sched read-side critical section - * whose beginning preceded the call to synchronize_srcu(). In addition, - * each CPU having an SRCU read-side critical section that extends beyond - * the return from synchronize_srcu() is guaranteed to have executed a - * full memory barrier after the beginning of synchronize_srcu() and before - * the beginning of that SRCU read-side critical section. Note that these - * guarantees include CPUs that are offline, idle, or executing in user mode, - * as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_srcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_srcu(). This guarantee applies even if CPU A and CPU B - * are the same CPU, but again only if the system has more than one CPU. - * - * Of course, these memory-ordering guarantees apply only when - * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are - * passed the same srcu_struct structure. - */ -void synchronize_srcu(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) - ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT - : SYNCHRONIZE_SRCU_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu); - -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - -/** - * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - * @sp: srcu_struct on which to wait for in-flight callbacks. - */ -void srcu_barrier(struct srcu_struct *sp) -{ - synchronize_srcu(sp); -} -EXPORT_SYMBOL_GPL(srcu_barrier); - -/** - * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. - * - * Report the number of batches, correlated with, but not necessarily - * precisely the same as, the number of grace periods that have elapsed. - */ -unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return sp->completed; -} -EXPORT_SYMBOL_GPL(srcu_batches_completed); - -#define SRCU_CALLBACK_BATCH 10 -#define SRCU_INTERVAL 1 - -/* - * Move any new SRCU callbacks to the first stage of the SRCU grace - * period pipeline. - */ -static void srcu_collect_new(struct srcu_struct *sp) -{ - if (!rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - rcu_batch_move(&sp->batch_check0, &sp->batch_queue); - spin_unlock_irq(&sp->queue_lock); - } -} - -/* - * Core SRCU state machine. Advance callbacks from ->batch_check0 to - * ->batch_check1 and then to ->batch_done as readers drain. - */ -static void srcu_advance_batches(struct srcu_struct *sp, int trycount) -{ - int idx = 1 ^ (sp->completed & 1); - - /* - * Because readers might be delayed for an extended period after - * fetching ->completed for their index, at any point in time there - * might well be readers using both idx=0 and idx=1. We therefore - * need to wait for readers to clear from both index values before - * invoking a callback. - */ - - if (rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_check1)) - return; /* no callbacks need to be advanced */ - - if (!try_check_zero(sp, idx, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done with their - * first zero check and flip back when they were enqueued on - * ->batch_check0 in a previous invocation of srcu_advance_batches(). - * (Presumably try_check_zero() returned false during that - * invocation, leaving the callbacks stranded on ->batch_check1.) - * They are therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - - if (rcu_batch_empty(&sp->batch_check0)) - return; /* no callbacks need to be advanced */ - srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - - /* - * SRCU read-side critical sections are normally short, so check - * at least twice in quick succession after a flip. - */ - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx^1, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have now waited for all - * pre-existing readers using both idx values. They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); -} - -/* - * Invoke a limited number of SRCU callbacks that have passed through - * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. Note that needed memory barriers have been executed - * in this task's context by srcu_readers_active_idx_check(). - */ -static void srcu_invoke_callbacks(struct srcu_struct *sp) -{ - int i; - struct rcu_head *head; - - for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { - head = rcu_batch_dequeue(&sp->batch_done); - if (!head) - break; - local_bh_disable(); - head->func(head); - local_bh_enable(); - } -} - -/* - * Finished one round of SRCU grace period. Start another if there are - * more SRCU callbacks queued, otherwise put SRCU into not-running state. - */ -static void srcu_reschedule(struct srcu_struct *sp) -{ - bool pending = true; - - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - sp->running = false; - pending = false; - } - spin_unlock_irq(&sp->queue_lock); - } - - if (pending) - queue_delayed_work(system_power_efficient_wq, - &sp->work, SRCU_INTERVAL); -} - -/* - * This is the work-queue function that handles SRCU grace periods. - */ -void process_srcu(struct work_struct *work) -{ - struct srcu_struct *sp; - - sp = container_of(work, struct srcu_struct, work.work); - - srcu_collect_new(sp); - srcu_advance_batches(sp, 1); - srcu_invoke_callbacks(sp); - srcu_reschedule(sp); -} -EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 32798eb14853..1a1c1047d2ed 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) sp->srcu_lock_nesting[0] = 0; sp->srcu_lock_nesting[1] = 0; init_swait_queue_head(&sp->srcu_wq); - sp->srcu_gp_seq = 0; - rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; sp->srcu_gp_running = false; sp->srcu_gp_waiting = false; sp->srcu_idx = 0; @@ -88,30 +88,14 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); flush_work(&sp->srcu_work); - WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); - WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); + WARN_ON(sp->srcu_cb_head); + WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Can be invoked from irq/bh handlers, but the matching - * __srcu_read_unlock() must be in the same handler instance. Returns an - * index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->srcu_idx); - WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* * Removes the count for the old reader from the appropriate element of * the srcu_struct. */ @@ -133,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); void srcu_drive_gp(struct work_struct *wp) { int idx; - struct rcu_cblist ready_cbs; - struct srcu_struct *sp; + struct rcu_head *lh; struct rcu_head *rhp; + struct srcu_struct *sp; sp = container_of(wp, struct srcu_struct, srcu_work); - if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) return; /* Already running or nothing to do. */ - /* Tag recently arrived callbacks and wait for readers. */ + /* Remove recently arrived callbacks and wait for readers. */ WRITE_ONCE(sp->srcu_gp_running, true); - rcu_segcblist_accelerate(&sp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); - rcu_seq_start(&sp->srcu_gp_seq); + local_irq_disable(); + lh = sp->srcu_cb_head; + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; + local_irq_enable(); idx = sp->srcu_idx; WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ - rcu_seq_end(&sp->srcu_gp_seq); - - /* Update callback list based on GP, and invoke ready callbacks. */ - rcu_segcblist_advance(&sp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { - rcu_cblist_init(&ready_cbs); - local_irq_disable(); - rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); - local_irq_enable(); - rhp = rcu_cblist_dequeue(&ready_cbs); - for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { - local_bh_disable(); - rhp->func(rhp); - local_bh_enable(); - } - local_irq_disable(); - rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); - local_irq_enable(); + + /* Invoke the callbacks we removed above. */ + while (lh) { + rhp = lh; + lh = lh->next; + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); } - WRITE_ONCE(sp->srcu_gp_running, false); /* - * If more callbacks, reschedule ourselves. This can race with - * a call_srcu() at interrupt level, but the ->srcu_gp_running - * checks will straighten that out. + * Enable rescheduling, and if there are more callbacks, + * reschedule ourselves. This can race with a call_srcu() + * at interrupt level, but the ->srcu_gp_running checks will + * straighten that out. */ - if (!rcu_segcblist_empty(&sp->srcu_cblist)) + WRITE_ONCE(sp->srcu_gp_running, false); + if (READ_ONCE(sp->srcu_cb_head)) schedule_work(&sp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -187,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp); * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; - head->func = func; + rhp->func = func; + rhp->next = NULL; local_irq_save(flags); - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + *sp->srcu_cb_tail = rhp; + sp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); if (!READ_ONCE(sp->srcu_gp_running)) schedule_work(&sp->srcu_work); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 157654fa436a..d0ca524bf042 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -40,9 +40,15 @@ #include "rcu.h" #include "rcu_segcblist.h" -ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ +/* Holdoff in nanoseconds for auto-expediting. */ +#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) +static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; module_param(exp_holdoff, ulong, 0444); +/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */ +static ulong counter_wrap_check = (ULONG_MAX >> 2); +module_param(counter_wrap_check, ulong, 0444); + static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); @@ -70,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { - spin_lock_init(&snp->lock); + raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -104,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp_first = sp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_init(&sdp->lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; @@ -163,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - spin_lock_init(&sp->gp_lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -180,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - spin_lock_init(&sp->gp_lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -191,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added - * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * to each update-side SRCU primitive. Use sp->lock, which -is- * compile-time initialized, to resolve races involving multiple * CPUs trying to garner first-use privileges. */ @@ -203,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); return; } init_srcu_struct_fields(sp, true); - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -275,15 +281,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * not mean that there are no more readers, as one could have read * the current index but not have incremented the lock counter yet. * - * Possible bug: There is no guarantee that there haven't been - * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were - * counted, meaning that this could return true even if there are - * still active readers. Since there are no memory barriers around - * srcu_flip(), the CPU is not required to increment ->srcu_idx - * before running srcu_readers_unlock_idx(), which means that there - * could be an arbitrarily large number of critical sections that - * execute after srcu_readers_unlock_idx() but use the old value - * of ->srcu_idx. + * So suppose that the updater is preempted here for so long + * that more than ULONG_MAX non-nested readers come and go in + * the meantime. It turns out that this cannot result in overflow + * because if a reader modifies its unlock count after we read it + * above, then that reader's next load of ->srcu_idx is guaranteed + * to get the new value, which will cause it to operate on the + * other bank of counters, where it cannot contribute to the + * overflow of these counters. This means that there is a maximum + * of 2*NR_CPUS increments, which cannot overflow given current + * systems, especially not on 64-bit systems. + * + * OK, how about nesting? This does impose a limit on nesting + * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, + * especially on 64-bit systems. */ return srcu_readers_lock_idx(sp, idx) == unlocks; } @@ -400,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), - "Invoked srcu_gp_start() without ->gp_lock!"); + lockdep_assert_held(&sp->lock); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -489,17 +499,20 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + int cpu; + unsigned long flags; unsigned long gpseq; int idx; int idxnext; unsigned long mask; + struct srcu_data *sdp; struct srcu_node *snp; /* Prevent more than one additional grace period. */ mutex_lock(&sp->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); @@ -508,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp) gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) sp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -516,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { - spin_lock_irq(&snp->lock); + raw_spin_lock_irq_rcu_node(snp); cbs = false; if (snp >= sp->level[rcu_num_lvls - 1]) cbs = snp->srcu_have_cbs[idx] == gpseq; @@ -526,28 +539,37 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq(&snp->lock); - if (cbs) { - smp_mb(); /* GP end before CB invocation. */ + raw_spin_unlock_irq_rcu_node(snp); + if (cbs) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); - } + + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + sdp = per_cpu_ptr(sp->sda, cpu); + raw_spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + } } /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&sp->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); /* Throttle expedited grace periods: Should be rare! */ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff ? 0 : SRCU_INTERVAL); } else { - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); } } @@ -567,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, if (rcu_seq_done(&sp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; - spin_lock_irqsave(&snp->lock, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -600,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, for (; snp != NULL; snp = snp->srcu_parent) { if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ - spin_lock_irqsave(&snp->lock, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; if (snp == sdp->mynode && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == sdp->mynode && snp_seq != s) { - smp_mb(); /* CBs after GP! */ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); @@ -622,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) snp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -645,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, queue_delayed_work(system_power_efficient_wq, &sp->work, srcu_get_delay(sp)); } - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -671,6 +692,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) */ static void srcu_flip(struct srcu_struct *sp) { + /* + * Ensure that if this updater saw a given reader's increment + * from __srcu_read_lock(), that reader was using an old value + * of ->srcu_idx. Also ensure that if a given reader sees the + * new value of ->srcu_idx, this updater's earlier scans cannot + * have seen that reader's increments (which is OK, because this + * grace period need not wait on that reader). + */ + smp_mb(); /* E */ /* Pairs with B and C. */ + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); /* @@ -745,6 +776,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) } /* + * SRCU callback function to leak a callback. + */ +static void srcu_leak_callback(struct rcu_head *rhp) +{ +} + +/* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating * grace-period processing if it is not already running. @@ -782,10 +820,16 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, struct srcu_data *sdp; check_init_srcu_struct(sp); + if (debug_rcu_head_queue(rhp)) { + /* Probable double call_srcu(), so leak the callback. */ + WRITE_ONCE(rhp->func, srcu_leak_callback); + WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n"); + return; + } rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); - spin_lock(&sdp->lock); + raw_spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -799,13 +843,30 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - spin_unlock_irqrestore(&sdp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) srcu_funnel_exp_start(sp, sdp->mynode, s); } +/** + * call_srcu() - Queue a callback for invocation after an SRCU grace period + * @sp: srcu_struct in queue the callback + * @head: structure to be used for queueing the SRCU callback. + * @func: function to be invoked after the SRCU grace period + * + * The callback function will be invoked some time after a full SRCU + * grace period elapses, in other words after all pre-existing SRCU + * read-side critical sections have completed. However, the callback + * function might well execute concurrently with other SRCU read-side + * critical sections that started after call_srcu() was invoked. SRCU + * read-side critical sections are delimited by srcu_read_lock() and + * srcu_read_unlock(), and may be nested. + * + * The callback will be invoked from process context, but must nevertheless + * be fast and must not block. + */ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { @@ -953,13 +1014,16 @@ void srcu_barrier(struct srcu_struct *sp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_irq(&sdp->lock); + raw_spin_lock_irq_rcu_node(sdp); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) + &sdp->srcu_barrier_head, 0)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); - spin_unlock_irq(&sdp->lock); + } + raw_spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ @@ -1008,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1067,22 +1131,22 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp = container_of(work, struct srcu_data, work.work); sp = sdp->sp; rcu_cblist_init(&ready_cbs); - spin_lock_irq(&sdp->lock); - smp_mb(); /* Old grace periods before callback invocation! */ + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + debug_rcu_head_unqueue(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -1092,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - spin_lock_irq(&sdp->lock); + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1111,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pushgp = true; - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1121,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(sp); } - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); @@ -1152,3 +1216,12 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); + +static int __init srcu_bootup_announce(void) +{ + pr_info("Hierarchical SRCU implementation.\n"); + if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) + pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); + return 0; +} +early_initcall(srcu_bootup_announce); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index e5385731e391..f8488965250f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -35,15 +35,26 @@ #include <linux/time.h> #include <linux/cpu.h> #include <linux/prefetch.h> -#include <linux/trace_events.h> #include "rcu.h" -/* Forward declarations for tiny_plugin.h. */ -struct rcu_ctrlblk; -static void __call_rcu(struct rcu_head *head, - rcu_callback_t func, - struct rcu_ctrlblk *rcp); +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; #include "tiny_plugin.h" @@ -59,19 +70,6 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL(rcu_barrier_sched); -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) - -/* - * Test whether RCU thinks that the current CPU is idle. - */ -bool notrace __rcu_is_watching(void) -{ - return true; -} -EXPORT_SYMBOL(__rcu_is_watching); - -#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers @@ -79,7 +77,6 @@ EXPORT_SYMBOL(__rcu_is_watching); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp);) if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -125,7 +122,6 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls();) if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -140,10 +136,8 @@ void rcu_check_callbacks(int user) */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { - const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0;) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -152,7 +146,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -162,22 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name;) while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim(rn, list); + __rcu_reclaim("", list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++;) } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) - RCU_TRACE(trace_rcu_batch_end(rcp->name, - cb_count, 0, need_resched(), - is_idle_task(current), - false)); } static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) @@ -221,7 +207,6 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++;) local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -254,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) - rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 371034e77f87..f0a01b2a3062 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -22,36 +22,6 @@ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ -#include <linux/kthread.h> -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ - struct rcu_head **curtail; /* ->next pointer of last CB. */ - RCU_TRACE(long qlen); /* Number of pending CBs. */ - RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ - RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ - RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(const char *name); /* Name of RCU type. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_sched") -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_bh") -}; - #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include <linux/kernel_stat.h> @@ -75,96 +45,3 @@ void __init rcu_scheduler_starting(void) } #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ - -#ifdef CONFIG_RCU_TRACE - -static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) -{ - unsigned long flags; - - local_irq_save(flags); - rcp->qlen -= n; - local_irq_restore(flags); -} - -/* - * Dump statistics for TINY_RCU, such as they are. - */ -static int show_tiny_stats(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); - seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); - return 0; -} - -static int show_tiny_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_tiny_stats, NULL); -} - -static const struct file_operations show_tiny_stats_fops = { - .owner = THIS_MODULE, - .open = show_tiny_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutiny_trace_init(void) -{ - struct dentry *retval; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &show_tiny_stats_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutiny_trace_init); - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = READ_ONCE(rcp->jiffies_stall); - if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - WRITE_ONCE(rcp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - } else if (ULONG_CMP_GE(j, js)) { - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); - } -} - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e354e475e645..51d4c3acf32d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -168,35 +168,17 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ -#ifdef CONFIG_RCU_KTHREAD_PRIO -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. */ -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; -module_param(gp_preinit_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ -static const int gp_preinit_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT -static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY; -module_param(gp_init_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ -static const int gp_init_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY; -module_param(gp_cleanup_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ -static const int gp_cleanup_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ +static int gp_preinit_delay; +module_param(gp_preinit_delay, int, 0444); +static int gp_init_delay; +module_param(gp_init_delay, int, 0444); +static int gp_cleanup_delay; +module_param(gp_cleanup_delay, int, 0444); /* * Number of grace periods between delays, normalized by the duration of @@ -250,6 +232,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), @@ -265,6 +248,7 @@ void rcu_sched_qs(void) void rcu_bh_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), @@ -286,10 +270,6 @@ void rcu_bh_qs(void) static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; /* @@ -478,7 +458,7 @@ void rcu_note_context_switch(bool preempt) barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); - rcu_preempt_note_context_switch(); + rcu_preempt_note_context_switch(preempt); /* Load rcu_urgent_qs before other flags. */ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) goto out; @@ -534,9 +514,12 @@ void rcu_all_qs(void) } EXPORT_SYMBOL_GPL(rcu_all_qs); -static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static long qhimark = 10000; /* If this many pending, ignore blimit. */ -static long qlowmark = 100; /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +static long blimit = DEFAULT_RCU_BLIMIT; +#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +static long qhimark = DEFAULT_RCU_QHIMARK; +#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +static long qlowmark = DEFAULT_RCU_QLOMARK; module_param(blimit, long, 0444); module_param(qhimark, long, 0444); @@ -559,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj); +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -757,6 +737,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); return READ_ONCE(*fp); } @@ -768,6 +749,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { + RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) @@ -794,6 +776,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -864,7 +847,6 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -914,7 +896,6 @@ void rcu_irq_exit(void) trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); rdtp->dynticks_nesting--; } - rcu_sysidle_enter(1); } /* @@ -967,6 +948,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -995,7 +977,6 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -1047,7 +1028,6 @@ void rcu_irq_enter(void) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(oldval, true); - rcu_sysidle_exit(1); } /* @@ -1130,22 +1110,11 @@ void rcu_nmi_exit(void) } /** - * __rcu_is_watching - are RCU read-side critical sections safe? - * - * Return true if RCU is watching the running CPU, which means that - * this CPU can safely enter RCU read-side critical sections. Unlike - * rcu_is_watching(), the caller of __rcu_is_watching() must have at - * least disabled preemption. - */ -bool notrace __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - -/** * rcu_is_watching - see if RCU thinks that the current CPU is idle * - * If the current CPU is in its idle loop and is neither in an interrupt + * Return true if RCU is watching the running CPU, which means that this + * CPU can safely enter RCU read-side critical sections. In other words, + * if the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. */ bool notrace rcu_is_watching(void) @@ -1153,7 +1122,7 @@ bool notrace rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = __rcu_is_watching(); + ret = !rcu_dynticks_curr_cpu_in_eqs(); preempt_enable_notrace(); return ret; } @@ -1237,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); - rcu_sysidle_check_cpu(rdp, isidle, maxj); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, @@ -1258,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, * idle state since the last call to dyntick_save_progress_counter() * for this same CPU, or by virtue of having been offline. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; bool *rnhqp; @@ -1674,6 +1640,8 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { + lockdep_assert_held(&rnp->lock); + /* * If RCU is idle, we just wait for the next grace period. * But we can only be sure that RCU is idle if we are looking @@ -1719,6 +1687,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + lockdep_assert_held(&rnp->lock); + /* * Pick up grace-period number for new callbacks. If this * grace period is already marked as needed, return to the caller. @@ -1845,6 +1815,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; + lockdep_assert_held(&rnp->lock); + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; @@ -1883,6 +1855,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; @@ -1909,6 +1883,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; + lockdep_assert_held(&rnp->lock); + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && !unlikely(READ_ONCE(rdp->gpwrap))) { @@ -2115,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) */ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - bool isidle = false; - unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; if (first_time) { /* Collect dyntick-idle snapshots. */ - if (is_sysidle_rcu_state(rsp)) { - isidle = true; - maxj = jiffies - ULONG_MAX / 4; - } - force_qs_rnp(rsp, dyntick_save_progress_counter, - &isidle, &maxj); - rcu_sysidle_report_gp(rsp, isidle, maxj); + force_qs_rnp(rsp, dyntick_save_progress_counter); } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = true; - force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { @@ -2341,6 +2308,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2402,6 +2370,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { + lockdep_assert_held(&rcu_get_root(rsp)->lock); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2426,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; + lockdep_assert_held(&rnp->lock); + /* Walk up the rcu_node hierarchy. */ for (;;) { if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { @@ -2486,6 +2457,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; + lockdep_assert_held(&rnp->lock); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2599,6 +2571,8 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs do not have orphanable callbacks. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) return; @@ -2639,6 +2613,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs are handled specially. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) @@ -2705,6 +2681,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj) +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) { int cpu; unsigned long flags; @@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + if (f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } } @@ -3143,9 +3117,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); if (debug_rcu_head_queue(head)) { - /* Probable double call_rcu(), so leak the callback. */ + /* + * Probable double call_rcu(), so leak the callback. + * Use rcu:rcu_callback trace event to find the previous + * time callback was passed to __call_rcu(). + */ + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); - WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); return; } head->func = func; @@ -3194,8 +3173,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, local_irq_restore(flags); } -/* - * Queue an RCU-sched callback for invocation after a grace period. +/** + * call_rcu_sched() - Queue an RCU for invocation after sched grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_sched() assumes + * that the read-side critical sections end on enabling of preemption + * or on voluntary preemption. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption. + * + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { @@ -3203,8 +3198,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu_sched); -/* - * Queue an RCU callback for invocation after a quicker grace period. +/** + * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. + * OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { @@ -3280,12 +3293,6 @@ static inline int rcu_blocking_is_gp(void) * to have executed a full memory barrier during the execution of * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. */ void synchronize_sched(void) { @@ -3578,8 +3585,14 @@ static void rcu_barrier_func(void *type) struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); - atomic_inc(&rsp->barrier_cpu_count); - rsp->call(&rdp->barrier_head, rcu_barrier_callback); + rdp->barrier_head.func = rcu_barrier_callback; + debug_rcu_head_queue(&rdp->barrier_head); + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + atomic_inc(&rsp->barrier_cpu_count); + } else { + debug_rcu_head_unqueue(&rdp->barrier_head); + _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + } } /* @@ -3698,6 +3711,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; @@ -3753,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_sysidle_init_percpu_data(rdp->dynticks); rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ba38262c3554..9af0f31d6847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -45,14 +45,6 @@ struct rcu_dynticks { bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - long long dynticks_idle_nesting; - /* irq/process nesting level from idle. */ - atomic_t dynticks_idle; /* Even value for idle, else odd. */ - /* "Idle" excludes userspace execution. */ - unsigned long dynticks_idle_jiffies; - /* End of last non-NMI non-idle period. */ -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; @@ -160,19 +152,6 @@ struct rcu_node { /* Number of tasks boosted for expedited GP. */ unsigned long n_normal_boosts; /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notblocked; - /* Refused to boost: RCU RS CS still running. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -312,9 +291,9 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ -#define RCU_NOGP_WAKE_NOT 0 -#define RCU_NOGP_WAKE 1 -#define RCU_NOGP_WAKE_FORCE 2 +#define RCU_NOCB_WAKE_NOT 0 +#define RCU_NOCB_WAKE 1 +#define RCU_NOCB_WAKE_FORCE 2 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ @@ -477,7 +456,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -static void rcu_preempt_note_context_switch(void); +static void rcu_preempt_note_context_switch(bool preempt); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); @@ -529,15 +508,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(int irq); -static void rcu_sysidle_exit(int irq); -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj); -static bool is_sysidle_rcu_state(struct rcu_state *rsp); -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj); static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); @@ -551,75 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { } #endif /* #else #ifdef CONFIG_SRCU */ #endif /* #ifndef RCU_TREE_NONCORE */ - -#ifdef CONFIG_RCU_TRACE -/* Read out queue lengths for tracing. */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ -#ifdef CONFIG_RCU_NOCB_CPU - *ql = atomic_long_read(&rdp->nocb_q_count); - *qll = atomic_long_read(&rdp->nocb_q_count_lazy); -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - *ql = 0; - *qll = 0; -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -} -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * Wrappers for the rcu_node::lock acquire and release. - * - * Because the rcu_nodes form a tree, the tree traversal locking will observe - * different lock values, this in turn means that an UNLOCK of one level - * followed by a LOCK of another level does not imply a full memory barrier; - * and most importantly transitivity is lost. - * - * In order to restore full ordering between tree levels, augment the regular - * lock acquire functions with smp_mb__after_unlock_lock(). - * - * As ->lock of struct rcu_node is a __private field, therefore one should use - * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. - */ -static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} - -static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); -} - -static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} - -static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); -} - -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ -} while (0) - -static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) -{ - bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); - - if (locked) - smp_mb__after_unlock_lock(); - return locked; -} diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e513b4ab1197..dd21ca47e4b4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * * Caller must hold the rcu_state's exp_mutex. */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c9a48657512a..908b309d60d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ static void __init rcu_bootup_announce_oddness(void) { if (IS_ENABLED(CONFIG_RCU_TRACE)) - pr_info("\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", @@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); - if (IS_ENABLED(CONFIG_RCU_BOOST)) - pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); +#endif + if (blimit != DEFAULT_RCU_BLIMIT) + pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); + if (qhimark != DEFAULT_RCU_QHIMARK) + pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); + if (qlowmark != DEFAULT_RCU_QLOMARK) + pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (jiffies_till_first_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); + if (jiffies_till_next_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (rcu_kick_kthreads) + pr_info("\tKick kthreads if too-long grace period.\n"); + if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) + pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + if (gp_preinit_delay) + pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); + if (gp_init_delay) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); + if (gp_cleanup_delay) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) + pr_info("\tRCU debug extended QS entry/exit.\n"); + rcupdate_announce_bootup_oddness(); } #ifdef CONFIG_PREEMPT_RCU @@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; + lockdep_assert_held(&rnp->lock); + /* * Decide where to queue the newly blocked task. In theory, * this could be an if-statement. In practice, when I tried @@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ static void rcu_preempt_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), @@ -286,12 +313,14 @@ static void rcu_preempt_qs(void) * * Caller must disable interrupts. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { struct task_struct *t = current; struct rcu_data *rdp; struct rcu_node *rnp; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; @@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void) #endif /* #ifdef CONFIG_RCU_BOOST */ -/* - * Queue a preemptible-RCU callback for invocation after a grace period. +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu); * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. + * See the description of synchronize_sched() for more detailed + * information on memory-ordering guarantees. However, please note + * that -only- the memory-ordering guarantees apply. For example, + * synchronize_rcu() is -not- guaranteed to wait on things like code + * protected by preempt_disable(), instead, synchronize_rcu() is -only- + * guaranteed to wait on RCU read-side critical sections, that is, sections + * of code protected by rcu_read_lock(). */ void synchronize_rcu(void) { @@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void) * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { } @@ -835,33 +899,6 @@ void exit_rcu(void) #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_TRACE - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ - if (!rcu_preempt_has_tasks(rnp)) - rnp->n_balk_blkd_tasks++; - else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) - rnp->n_balk_exp_gp_tasks++; - else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) - rnp->n_balk_boost_tasks++; - else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) - rnp->n_balk_notblocked++; - else if (rnp->gp_tasks != NULL && - ULONG_CMP_LT(jiffies, rnp->boost_time)) - rnp->n_balk_notyet++; - else - rnp->n_balk_nos++; -} - -#else /* #ifdef CONFIG_RCU_TRACE */ - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - static void rcu_wake_cond(struct task_struct *t, int status) { /* @@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; + lockdep_assert_held(&rnp->lock); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { - rnp->n_balk_exp_gp_tasks++; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { - rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) - ? 0 : rcu_cpu_has_callbacks(NULL); + return rcu_cpu_has_callbacks(NULL); } /* @@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { - *nextevt = KTIME_MAX; - return 0; - } + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { @@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Kick the leader kthread for this NOCB group. @@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); } } @@ -1860,7 +1891,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -1874,7 +1905,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -2023,6 +2054,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; + smp_mb(); /* wakeup before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } @@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void) bool need_rcu_nocb_mask = true; struct rcu_state *rsp; -#ifdef CONFIG_RCU_NOCB_CPU_NONE - need_rcu_nocb_mask = false; -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) need_rcu_nocb_mask = true; @@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void) if (!have_rcu_nocb_mask) return; -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); @@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #endif /* #ifdef CONFIG_NO_HZ_FULL */ } - -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - -static int full_sysidle_state; /* Current system-idle state. */ -#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ -#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ -#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ -#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ -#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ - -/* - * Invoked to note exit from irq or task transition to idle. Note that - * usermode execution does -not- count as idle here! After all, we want - * to detect full-system idle states, not RCU quiescent states and grace - * periods. The caller must have disabled interrupts. - */ -static void rcu_sysidle_enter(int irq) -{ - unsigned long j; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for fully idle. */ - if (irq) { - rdtp->dynticks_idle_nesting--; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - if (rdtp->dynticks_idle_nesting != 0) - return; /* Still not fully idle. */ - } else { - if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_idle_nesting = 0; - } else { - rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - return; /* Still not fully idle. */ - } - } - - /* Record start of fully idle period. */ - j = jiffies; - WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); -} - -/* - * Unconditionally force exit from full system-idle state. This is - * invoked when a normal CPU exits idle, but must be called separately - * for the timekeeping CPU (tick_do_timer_cpu). The reason for this - * is that the timekeeping CPU is permitted to take scheduling-clock - * interrupts while the system is in system-idle state, and of course - * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock - * interrupt from any other type of interrupt. - */ -void rcu_sysidle_force_exit(void) -{ - int oldstate = READ_ONCE(full_sysidle_state); - int newoldstate; - - /* - * Each pass through the following loop attempts to exit full - * system-idle state. If contention proves to be a problem, - * a trylock-based contention tree could be used here. - */ - while (oldstate > RCU_SYSIDLE_SHORT) { - newoldstate = cmpxchg(&full_sysidle_state, - oldstate, RCU_SYSIDLE_NOT); - if (oldstate == newoldstate && - oldstate == RCU_SYSIDLE_FULL_NOTED) { - rcu_kick_nohz_cpu(tick_do_timer_cpu); - return; /* We cleared it, done! */ - } - oldstate = newoldstate; - } - smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ -} - -/* - * Invoked to note entry to irq or task transition from idle. Note that - * usermode execution does -not- count as idle here! The caller must - * have disabled interrupts. - */ -static void rcu_sysidle_exit(int irq) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for already non-idle. */ - if (irq) { - rdtp->dynticks_idle_nesting++; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - if (rdtp->dynticks_idle_nesting != 1) - return; /* Already non-idle. */ - } else { - /* - * Allow for irq misnesting. Yes, it really is possible - * to enter an irq handler then never leave it, and maybe - * also vice versa. Handle both possibilities. - */ - if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - return; /* Already non-idle. */ - } else { - rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; - } - } - - /* Record end of idle period. */ - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); - - /* - * If we are the timekeeping CPU, we are permitted to be non-idle - * during a system-idle state. This must be the case, because - * the timekeeping CPU has to take scheduling-clock interrupts - * during the time that the system is transitioning to full - * system-idle state. This means that the timekeeping CPU must - * invoke rcu_sysidle_force_exit() directly if it does anything - * more than take a scheduling-clock interrupt. - */ - if (smp_processor_id() == tick_do_timer_cpu) - return; - - /* Update system-idle state: We are clearly no longer fully idle! */ - rcu_sysidle_force_exit(); -} - -/* - * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts, - * and must be running on tick_do_timer_cpu. - */ -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ - int cur; - unsigned long j; - struct rcu_dynticks *rdtp = rdp->dynticks; - - /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ - if (!tick_nohz_full_enabled()) - return; - - /* - * If some other CPU has already reported non-idle, if this is - * not the flavor of RCU that tracks sysidle state, or if this - * is an offline or the timekeeping CPU, nothing to do. - */ - if (!*isidle || rdp->rsp != rcu_state_p || - cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) - return; - /* Verify affinity of current kthread. */ - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); - - /* Pick up current idle and NMI-nesting counter and check. */ - cur = atomic_read(&rdtp->dynticks_idle); - if (cur & 0x1) { - *isidle = false; /* We are not idle! */ - return; - } - smp_mb(); /* Read counters before timestamps. */ - - /* Pick up timestamps. */ - j = READ_ONCE(rdtp->dynticks_idle_jiffies); - /* If this CPU entered idle more recently, update maxj timestamp. */ - if (ULONG_CMP_LT(*maxj, j)) - *maxj = j; -} - -/* - * Is this the flavor of RCU that is handling full-system idle? - */ -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return rsp == rcu_state_p; -} - -/* - * Return a delay in jiffies based on the number of CPUs, rcu_node - * leaf fanout, and jiffies tick rate. The idea is to allow larger - * systems more time to transition to full-idle state in order to - * avoid the cache thrashing that otherwise occur on the state variable. - * Really small systems (less than a couple of tens of CPUs) should - * instead use a single global atomically incremented counter, and later - * versions of this will automatically reconfigure themselves accordingly. - */ -static unsigned long rcu_sysidle_delay(void) -{ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return 0; - return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); -} - -/* - * Advance the full-system-idle state. This is invoked when all of - * the non-timekeeping CPUs are idle. - */ -static void rcu_sysidle(unsigned long j) -{ - /* Check the current state. */ - switch (READ_ONCE(full_sysidle_state)) { - case RCU_SYSIDLE_NOT: - - /* First time all are idle, so note a short idle period. */ - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); - break; - - case RCU_SYSIDLE_SHORT: - - /* - * Idle for a bit, time to advance to next state? - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); - break; - - case RCU_SYSIDLE_LONG: - - /* - * Do an additional check pass before advancing to full. - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); - break; - - default: - break; - } -} - -/* - * Found a non-idle non-timekeeping CPU, so kick the system-idle state - * back to the beginning. - */ -static void rcu_sysidle_cancel(void) -{ - smp_mb(); - if (full_sysidle_state > RCU_SYSIDLE_SHORT) - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); -} - -/* - * Update the sysidle state based on the results of a force-quiescent-state - * scan of the CPUs' dyntick-idle state. - */ -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj, bool gpkt) -{ - if (rsp != rcu_state_p) - return; /* Wrong flavor, ignore. */ - if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return; /* Running state machine from timekeeping CPU. */ - if (isidle) - rcu_sysidle(maxj); /* More idle! */ - else - rcu_sysidle_cancel(); /* Idle is over. */ -} - -/* - * Wrapper for rcu_sysidle_report() when called from the grace-period - * kthread's context. - */ -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - rcu_sysidle_report(rsp, isidle, maxj, true); -} - -/* Callback and function for forcing an RCU grace period. */ -struct rcu_sysidle_head { - struct rcu_head rh; - int inuse; -}; - -static void rcu_sysidle_cb(struct rcu_head *rhp) -{ - struct rcu_sysidle_head *rshp; - - /* - * The following memory barrier is needed to replace the - * memory barriers that would normally be in the memory - * allocator. - */ - smp_mb(); /* grace period precedes setting inuse. */ - - rshp = container_of(rhp, struct rcu_sysidle_head, rh); - WRITE_ONCE(rshp->inuse, 0); -} - -/* - * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. This is not intended to be - * called unless tick_nohz_full_enabled(). - */ -bool rcu_sys_is_idle(void) -{ - static struct rcu_sysidle_head rsh; - int rss = READ_ONCE(full_sysidle_state); - - if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) - return false; - - /* Handle small-system case by doing a full scan of CPUs. */ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { - int oldrss = rss - 1; - - /* - * One pass to advance to each state up to _FULL. - * Give up if any pass fails to advance the state. - */ - while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { - int cpu; - bool isidle = true; - unsigned long maxj = jiffies - ULONG_MAX / 4; - struct rcu_data *rdp; - - /* Scan all the CPUs looking for nonidle CPUs. */ - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - rcu_sysidle_check_cpu(rdp, &isidle, &maxj); - if (!isidle) - break; - } - rcu_sysidle_report(rcu_state_p, isidle, maxj, false); - oldrss = rss; - rss = READ_ONCE(full_sysidle_state); - } - } - - /* If this is the first observation of an idle period, record it. */ - if (rss == RCU_SYSIDLE_FULL) { - rss = cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); - return rss == RCU_SYSIDLE_FULL; - } - - smp_mb(); /* ensure rss load happens before later caller actions. */ - - /* If already fully idle, tell the caller (in case of races). */ - if (rss == RCU_SYSIDLE_FULL_NOTED) - return true; - - /* - * If we aren't there yet, and a grace period is not in flight, - * initiate a grace period. Either way, tell the caller that - * we are not there yet. We use an xchg() rather than an assignment - * to make up for the memory barriers that would otherwise be - * provided by the memory allocator. - */ - if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_state_p) && - !rsh.inuse && xchg(&rsh.inuse, 1) == 0) - call_rcu(&rsh.rh, rcu_sysidle_cb); - return false; -} - -/* - * Initialize dynticks sysidle state for CPUs coming online. - */ -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ - rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; -} - -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -static void rcu_sysidle_enter(int irq) -{ -} - -static void rcu_sysidle_exit(int irq) -{ -} - -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ -} - -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return false; -} - -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ -} - -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ housekeeping_affine(current); -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c deleted file mode 100644 index 6cea17a1ea30..000000000000 --- a/kernel/rcu/tree_trace.c +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Read-Copy Update tracing for hierarchical implementation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright IBM Corporation, 2008 - * Author: Paul E. McKenney - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/rcupdate.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/completion.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/mutex.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> -#include <linux/prefetch.h> - -#define RCU_TREE_NONCORE -#include "tree.h" -#include "rcu.h" - -static int r_open(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - int ret = seq_open(file, op); - if (!ret) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = inode->i_private; - } - return ret; -} - -static void *r_start(struct seq_file *m, loff_t *pos) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - *pos = cpumask_next(*pos - 1, cpu_possible_mask); - if ((*pos) < nr_cpu_ids) - return per_cpu_ptr(rsp->rda, *pos); - return NULL; -} - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return r_start(m, pos); -} - -static void r_stop(struct seq_file *m, void *v) -{ -} - -static int show_rcubarrier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d bseq: %lu\n", - atomic_read(&rsp->barrier_cpu_count), - rsp->barrier_sequence); - return 0; -} - -static int rcubarrier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcubarrier, inode->i_private); -} - -static const struct file_operations rcubarrier_fops = { - .owner = THIS_MODULE, - .open = rcubarrier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static char convert_kthread_status(unsigned int kthread_status) -{ - if (kthread_status > RCU_KTHREAD_MAX) - return '?'; - return "SRWOY"[kthread_status]; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) -{ - long ql, qll; - - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' : ' ', - ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), - rdp->core_needs_qs); - seq_printf(m, " dt=%d/%llx/%d df=%lu", - rcu_dynticks_snap(rdp->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, " of=%lu", rdp->offline_fqs); - rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); - ql += rcu_segcblist_n_cbs(&rdp->cblist); - seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - qll, ql, - ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], - ".R"[!rcu_segcblist_segempty(&rdp->cblist, - RCU_NEXT_READY_TAIL)], - ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], - ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c ktl=%x", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_nocbs_invoked, - rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata(struct seq_file *m, void *v) -{ - print_one_rcu_data(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcudate_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcudata, -}; - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcudate_op); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcuexp(struct seq_file *m, void *v) -{ - int cpu; - struct rcu_state *rsp = (struct rcu_state *)m->private; - struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->exp_workdone0); - s1 += atomic_long_read(&rdp->exp_workdone1); - s2 += atomic_long_read(&rdp->exp_workdone2); - s3 += atomic_long_read(&rdp->exp_workdone3); - } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, - atomic_read(&rsp->expedited_need_qs), - rsp->expedited_sequence / 2); - return 0; -} - -static int rcuexp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuexp, inode->i_private); -} - -static const struct file_operations rcuexp_fops = { - .owner = THIS_MODULE, - .open = rcuexp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) -{ - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", - rnp->grplo, rnp->grphi, - "T."[list_empty(&rnp->blkd_tasks)], - "N."[!rnp->gp_tasks], - "E."[!rnp->exp_tasks], - "B."[!rnp->boost_tasks], - convert_kthread_status(rnp->boost_kthread_status), - rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts); - seq_printf(m, "j=%04x bt=%04x\n", - (int)(jiffies & 0xffff), - (int)(rnp->boost_time & 0xffff)); - seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - rnp->n_balk_blkd_tasks, - rnp->n_balk_exp_gp_tasks, - rnp->n_balk_boost_tasks, - rnp->n_balk_notblocked, - rnp->n_balk_notyet, - rnp->n_balk_nos); -} - -static int show_rcu_node_boost(struct seq_file *m, void *unused) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(&rcu_preempt_state, rnp) - print_one_rcu_node_boost(m, rnp); - return 0; -} - -static int rcu_node_boost_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_node_boost, NULL); -} - -static const struct file_operations rcu_node_boost_fops = { - .owner = THIS_MODULE, - .open = rcu_node_boost_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long gpnum; - int level = 0; - struct rcu_node *rnp; - - gpnum = rsp->gpnum; - seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", - ulong2long(rsp->completed), ulong2long(gpnum), - rsp->gp_state, - (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff)); - seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->n_force_qs, rsp->n_force_qs_ngp, - rsp->n_force_qs - rsp->n_force_qs_ngp, - READ_ONCE(rsp->n_force_qs_lh), - rsp->orphan_done.len_lazy, - rsp->orphan_done.len); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { - if (rnp->level != level) { - seq_puts(m, "\n"); - level = rnp->level; - } - seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, - ".G"[rnp->gp_tasks != NULL], - ".E"[rnp->exp_tasks != NULL], - ".T"[!list_empty(&rnp->blkd_tasks)], - rnp->grplo, rnp->grphi, rnp->grpnum); - } - seq_puts(m, "\n"); -} - -static int show_rcuhier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - print_one_rcu_state(m, rsp); - return 0; -} - -static int rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuhier, inode->i_private); -} - -static const struct file_operations rcuhier_fops = { - .owner = THIS_MODULE, - .open = rcuhier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long completed; - unsigned long gpnum; - unsigned long gpage; - unsigned long gpmax; - struct rcu_node *rnp = &rsp->node[0]; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - completed = READ_ONCE(rsp->completed); - gpnum = READ_ONCE(rsp->gpnum); - if (completed == gpnum) - gpage = 0; - else - gpage = jiffies - rsp->gp_start; - gpmax = rsp->gp_max; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", - ulong2long(completed), ulong2long(gpnum), gpage, gpmax); -} - -static int show_rcugp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - show_one_rcugp(m, rsp); - return 0; -} - -static int rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcugp, inode->i_private); -} - -static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .open = rcugp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cnp=%ld ", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->n_rcu_pending); - seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_core_needs_qs, - rdp->n_rp_report_qs, - rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", - rdp->n_rp_gp_completed, - rdp->n_rp_gp_started, - rdp->n_rp_nocb_defer_wakeup, - rdp->n_rp_need_nothing); -} - -static int show_rcu_pending(struct seq_file *m, void *v) -{ - print_one_rcu_pending(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcu_pending_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcu_pending, -}; - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcu_pending_op); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcutorture(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcutorture test sequence: %lu %s\n", - rcutorture_testseq >> 1, - (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); - seq_printf(m, "rcutorture update version number: %lu\n", - rcutorture_vernum); - return 0; -} - -static int rcutorture_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcutorture, NULL); -} - -static const struct file_operations rcutorture_fops = { - .owner = THIS_MODULE, - .open = rcutorture_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutree_trace_init(void) -{ - struct rcu_state *rsp; - struct dentry *retval; - struct dentry *rspdir; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - - for_each_rcu_flavor(rsp) { - rspdir = debugfs_create_dir(rsp->name, rcudir); - if (!rspdir) - goto free_out; - - retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuexp", 0444, - rspdir, rsp, &rcuexp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &rcu_pending_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcubarrier", 0444, - rspdir, rsp, &rcubarrier_fops); - if (!retval) - goto free_out; - -#ifdef CONFIG_RCU_BOOST - if (rsp == &rcu_preempt_state) { - retval = debugfs_create_file("rcuboost", 0444, - rspdir, NULL, &rcu_node_boost_fops); - if (!retval) - goto free_out; - } -#endif - - retval = debugfs_create_file("rcugp", 0444, - rspdir, rsp, &rcugp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuhier", 0444, - rspdir, rsp, &rcuhier_fops); - if (!retval) - goto free_out; - } - - retval = debugfs_create_file("rcutorture", 0444, rcudir, - NULL, &rcutorture_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutree_trace_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 273e869ca21d..00e77c470017 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,7 +62,9 @@ #define MODULE_PARAM_PREFIX "rcupdate." #ifndef CONFIG_TINY_RCU +extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); +extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); @@ -379,6 +381,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; + int j; /* Initialize and register callbacks for each flavor specified. */ for (i = 0; i < n; i++) { @@ -390,7 +393,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, } init_rcu_head_on_stack(&rs_array[i].head); init_completion(&rs_array[i].completion); - (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); } /* Wait for all callbacks to be invoked. */ @@ -399,7 +406,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, (crcu_array[i] == call_rcu || crcu_array[i] == call_rcu_bh)) continue; - wait_for_completion(&rs_array[i].completion); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + wait_for_completion(&rs_array[i].completion); destroy_rcu_head_on_stack(&rs_array[i].head); } } @@ -560,15 +571,30 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); DEFINE_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); static struct task_struct *rcu_tasks_kthread_ptr; -/* - * Post an RCU-tasks callback. First call must be from process context - * after the scheduler if fully operational. +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), entry into idle, or transition to usermode + * execution. As such, there are no read-side primitives analogous to + * rcu_read_lock() and rcu_read_unlock() because this primitive is intended + * to determine that all tasks have passed through a safe state, not so + * much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { @@ -851,6 +877,23 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. + */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#ifdef CONFIG_TASKS_RCU + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + else + pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PROVE_RCU /* @@ -935,3 +978,25 @@ late_initcall(rcu_verify_early_boot_tests); #else void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ + +#ifndef CONFIG_TINY_RCU + +/* + * Print any significant non-default boot-time settings. + */ +void __init rcupdate_announce_bootup_oddness(void) +{ + if (rcu_normal) + pr_info("\tNo expedited grace period (rcu_normal).\n"); + else if (rcu_normal_after_boot) + pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n"); + else if (rcu_expedited) + pr_info("\tAll grace periods are expedited (rcu_expedited).\n"); + if (rcu_cpu_stall_suppress) + pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n"); + if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT) + pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout); + rcu_tasks_bootup_oddness(); +} + +#endif /* #ifndef CONFIG_TINY_RCU */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 326d4f88e2b1..5b60f3a8343f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5874,15 +5874,9 @@ int sched_cpu_deactivate(unsigned int cpu) * users of this state to go away such that all new such users will * observe it. * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * * Do sync before park smpboot threads to take care the rcu boost case. */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); + synchronize_rcu_mult(call_rcu, call_rcu_sched); if (!sched_smp_initialized) return 0; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 4008d9f95dd7..ac09bc29eb08 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL Note the boot CPU will still be kept outside the range to handle the timekeeping duty. -config NO_HZ_FULL_SYSIDLE - bool "Detect full-system idle state for full dynticks system" - depends on NO_HZ_FULL - default n - help - At least one CPU must keep the scheduling-clock tick running for - timekeeping purposes whenever there is a non-idle CPU, where - "non-idle" also includes dynticks CPUs as long as they are - running non-idle tasks. Because the underlying adaptive-tick - support cannot distinguish between all CPUs being idle and - all CPUs each running a single task in dynticks mode, the - underlying support simply ensures that there is always a CPU - handling the scheduling-clock tick, whether or not all CPUs - are idle. This Kconfig option enables scalable detection of - the all-CPUs-idle state, thus allowing the scheduling-clock - tick to be disabled when all CPUs are idle. Note that scalable - detection of the all-CPUs-idle state means that larger systems - will be slower to declare the all-CPUs-idle state. - - Say Y if you would like to help debug all-CPUs-idle detection. - - Say N if you are unsure. - -config NO_HZ_FULL_SYSIDLE_SMALL - int "Number of CPUs above which large-system approach is used" - depends on NO_HZ_FULL_SYSIDLE - range 1 NR_CPUS - default 8 - help - The full-system idle detection mechanism takes a lazy approach - on large systems, as is required to attain decent scalability. - However, on smaller systems, scalability is not anywhere near as - large a concern as is energy efficiency. The sysidle subsystem - therefore uses a fast but non-scalable algorithm for small - systems and a lazier but scalable algorithm for large systems. - This Kconfig parameter defines the number of CPUs in the largest - system that will be considered to be "small". - - The default value will be fine in most cases. Battery-powered - systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger - numbers of CPUs, and (3) are suffering from battery-lifetime - problems due to long sysidle latencies might wish to experiment - with larger values for this Kconfig parameter. On the other - hand, they might be even better served by disabling NO_HZ_FULL - entirely, given that NO_HZ_FULL is intended for HPC and - real-time workloads that at present do not tend to be run on - battery-powered systems. - - Take the default if you are unsure. - config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |