diff options
Diffstat (limited to 'kernel')
104 files changed, 3072 insertions, 1337 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ea2ee1181921..f3a2abd6d1a1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1959,6 +1959,12 @@ void __audit_uring_exit(int success, long code) { struct audit_context *ctx = audit_context(); + if (ctx->dummy) { + if (ctx->context != AUDIT_CTX_URING) + return; + goto out; + } + if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d56ee177d5f8..2dfe1079f772 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -27,6 +27,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_RCU if PREEMPTION select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 9390bfd9f1cd..71a418858a5e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3390,8 +3390,11 @@ static struct notifier_block cpuset_track_online_nodes_nb = { */ void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_MEMORY]; + /* + * cpus_allowd/mems_allowed set to v2 values in the initial + * cpuset_bind() call will be reset to v1 values in another + * cpuset_bind() call when v1 cpuset is mounted. + */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config new file mode 100644 index 000000000000..dcd86f32f4ed --- /dev/null +++ b/kernel/configs/x86_debug.config @@ -0,0 +1,18 @@ +CONFIG_X86_DEBUG_FPU=y +CONFIG_LOCK_STAT=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_VMACACHE=y +CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_KMEMLEAK=y +CONFIG_DEBUG_PAGEALLOC=y +CONFIG_SLUB_DEBUG_ON=y +CONFIG_KMEMCHECK=y +CONFIG_DEBUG_OBJECTS=y +CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 +CONFIG_GCOV_KERNEL=y +CONFIG_LOCKDEP=y +CONFIG_PROVE_LOCKING=y +CONFIG_SCHEDSTATS=y +CONFIG_VMLINUX_VALIDATION=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/kernel/cpu.c b/kernel/cpu.c index d0a9aa0b42e8..bbad5e375d3b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -35,6 +35,7 @@ #include <linux/percpu-rwsem.h> #include <linux/cpuset.h> #include <linux/random.h> +#include <linux/cc_platform.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -716,14 +717,6 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, /* * The cpu hotplug threads manage the bringup and teardown of the cpus */ -static void cpuhp_create(unsigned int cpu) -{ - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - - init_completion(&st->done_up); - init_completion(&st->done_down); -} - static int cpuhp_should_run(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); @@ -883,15 +876,27 @@ static int cpuhp_kick_ap_work(unsigned int cpu) static struct smp_hotplug_thread cpuhp_threads = { .store = &cpuhp_state.thread, - .create = &cpuhp_create, .thread_should_run = cpuhp_should_run, .thread_fn = cpuhp_thread_fun, .thread_comm = "cpuhp/%u", .selfparking = true, }; +static __init void cpuhp_init_state(void) +{ + struct cpuhp_cpu_state *st; + int cpu; + + for_each_possible_cpu(cpu) { + st = per_cpu_ptr(&cpuhp_state, cpu); + init_completion(&st->done_up); + init_completion(&st->done_down); + } +} + void __init cpuhp_threads_init(void) { + cpuhp_init_state(); BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); kthread_unpark(this_cpu_read(cpuhp_state.thread)); } @@ -1186,6 +1191,12 @@ out: static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { + /* + * If the platform does not support hotplug, report it explicitly to + * differentiate it from a transient offlining failure. + */ + if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED)) + return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; return _cpu_down(cpu, 0, target); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 256cf6db573c..4d57c03714f4 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -243,9 +243,8 @@ static int __init __parse_crashkernel(char *cmdline, *crash_base = 0; ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - if (!ck_cmdline) - return -EINVAL; + return -ENOENT; ck_cmdline += strlen(name); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index da06a5553835..7beceb447211 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -53,6 +53,7 @@ #include <linux/vmacache.h> #include <linux/rcupdate.h> #include <linux/irq.h> +#include <linux/security.h> #include <asm/cacheflush.h> #include <asm/byteorder.h> @@ -752,6 +753,29 @@ cpu_master_loop: continue; kgdb_connected = 0; } else { + /* + * This is a brutal way to interfere with the debugger + * and prevent gdb being used to poke at kernel memory. + * This could cause trouble if lockdown is applied when + * there is already an active gdb session. For now the + * answer is simply "don't do that". Typically lockdown + * *will* be applied before the debug core gets started + * so only developers using kgdb for fairly advanced + * early kernel debug can be biten by this. Hopefully + * they are sophisticated enough to take care of + * themselves, especially with help from the lockdown + * message printed on the console! + */ + if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) { + if (IS_ENABLED(CONFIG_KGDB_KDB)) { + /* Switch back to kdb if possible... */ + dbg_kdb_mode = 1; + continue; + } else { + /* ... otherwise just bail */ + break; + } + } error = gdb_serial_stub(ks); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0852a537dad4..ead4da947127 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -45,6 +45,7 @@ #include <linux/proc_fs.h> #include <linux/uaccess.h> #include <linux/slab.h> +#include <linux/security.h> #include "kdb_private.h" #undef MODULE_PARAM_PREFIX @@ -166,10 +167,62 @@ struct task_struct *kdb_curr_task(int cpu) } /* - * Check whether the flags of the current command and the permissions - * of the kdb console has allow a command to be run. + * Update the permissions flags (kdb_cmd_enabled) to match the + * current lockdown state. + * + * Within this function the calls to security_locked_down() are "lazy". We + * avoid calling them if the current value of kdb_cmd_enabled already excludes + * flags that might be subject to lockdown. Additionally we deliberately check + * the lockdown flags independently (even though read lockdown implies write + * lockdown) since that results in both simpler code and clearer messages to + * the user on first-time debugger entry. + * + * The permission masks during a read+write lockdown permits the following + * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE). + * + * The INSPECT commands are not blocked during lockdown because they are + * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes + * forcing them to have no arguments) and lsmod. These commands do expose + * some kernel state but do not allow the developer seated at the console to + * choose what state is reported. SIGNAL and REBOOT should not be controversial, + * given these are allowed for root during lockdown already. + */ +static void kdb_check_for_lockdown(void) +{ + const int write_flags = KDB_ENABLE_MEM_WRITE | + KDB_ENABLE_REG_WRITE | + KDB_ENABLE_FLOW_CTRL; + const int read_flags = KDB_ENABLE_MEM_READ | + KDB_ENABLE_REG_READ; + + bool need_to_lockdown_write = false; + bool need_to_lockdown_read = false; + + if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags)) + need_to_lockdown_write = + security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL); + + if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags)) + need_to_lockdown_read = + security_locked_down(LOCKDOWN_DBG_READ_KERNEL); + + /* De-compose KDB_ENABLE_ALL if required */ + if (need_to_lockdown_write || need_to_lockdown_read) + if (kdb_cmd_enabled & KDB_ENABLE_ALL) + kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL; + + if (need_to_lockdown_write) + kdb_cmd_enabled &= ~write_flags; + + if (need_to_lockdown_read) + kdb_cmd_enabled &= ~read_flags; +} + +/* + * Check whether the flags of the current command, the permissions of the kdb + * console and the lockdown state allow a command to be run. */ -static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, +static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, bool no_args) { /* permissions comes from userspace so needs massaging slightly */ @@ -1180,6 +1233,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_curr_task(raw_smp_processor_id()); KDB_DEBUG_STATE("kdb_local 1", reason); + + kdb_check_for_lockdown(); + kdb_go_count = 0; if (reason == KDB_REASON_DEBUG) { /* special case below */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 93c3b86e781c..032f164abe7c 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -17,7 +17,7 @@ /* See comment for enter_from_user_mode() in entry-common.h */ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { - arch_check_user_regs(regs); + arch_enter_from_user_mode(regs); lockdep_hardirqs_off(CALLER_ADDR0); CT_WARN_ON(ct_state() != CONTEXT_USER); @@ -126,7 +126,7 @@ static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); instrumentation_end(); user_enter_irqoff(); @@ -416,7 +416,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) instrumentation_begin(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); instrumentation_end(); rcu_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); @@ -465,7 +465,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) ftrace_nmi_exit(); if (irq_state.lockdep) { trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); } instrumentation_end(); diff --git a/kernel/events/core.c b/kernel/events/core.c index 23bb19716ad3..950b25c3f210 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6247,7 +6247,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } @@ -6428,8 +6428,8 @@ static void perf_sigtrap(struct perf_event *event) if (current->flags & PF_EXITING) return; - force_sig_perf((void __user *)event->pending_addr, - event->attr.type, event->attr.sig_data); + send_sig_perf((void __user *)event->pending_addr, + event->attr.type, event->attr.sig_data); } static void perf_pending_event_disable(struct perf_event *event) @@ -12217,6 +12217,9 @@ SYSCALL_DEFINE5(perf_event_open, * Do not allow to attach to a group in a different task * or CPU context. If we're moving SW events, we'll fix * this up later, so allow that. + * + * Racy, not holding group_leader->ctx->mutex, see comment with + * perf_event_ctx_lock(). */ if (!move_group && group_leader->ctx != ctx) goto err_context; @@ -12282,6 +12285,7 @@ SYSCALL_DEFINE5(perf_event_open, } else { perf_event_ctx_unlock(group_leader, gctx); move_group = 0; + goto not_move_group; } } @@ -12298,7 +12302,17 @@ SYSCALL_DEFINE5(perf_event_open, } } else { mutex_lock(&ctx->mutex); + + /* + * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, + * see the group_leader && !move_group test earlier. + */ + if (group_leader && group_leader->ctx != ctx) { + err = -EINVAL; + goto err_locked; + } } +not_move_group: if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 082832738c8f..5150d5f84c03 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 52868716ec35..fb35b926024c 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -859,11 +859,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6418083901d4..a9bc3c98f76a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -787,10 +787,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, struct page *page; /* * Ensure that the page that has the original instruction is populated - * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), + * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ - if (mapping->a_ops->readpage) + if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); @@ -1143,7 +1143,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ - if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) + if (!inode->i_mapping->a_ops->read_folio && + !shmem_mapping(inode->i_mapping)) return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) diff --git a/kernel/fork.c b/kernel/fork.c index 9796897560ab..254ab63c1106 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm) mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); + mm_pasid_drop(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1045,6 +1046,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + +#ifdef CONFIG_CPU_SUP_INTEL + tsk->reported_split_lock = 0; +#endif + return tsk; free_stack: @@ -1190,7 +1196,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - mm_pasid_drop(mm); mmdrop(mm); } diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 183b28c32c83..ce2889f12375 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -1005,7 +1005,7 @@ retry_private: rt_mutex_init_waiter(&rt_waiter); /* - * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not * hold it while doing rt_mutex_start_proxy(), because then it will * include hb->lock in the blocking chain, even through we'll not in * fact hold it while blocking. This will lead it to report -EDEADLK diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f7655..02a65d554340 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { + printk_prefer_direct_enter(); + if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", @@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; + + printk_prefer_direct_exit(); } touch_nmi_watchdog(); @@ -204,12 +208,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); - if (hung_task_show_lock) + if (hung_task_show_lock) { + printk_prefer_direct_enter(); debug_show_all_locks(); + printk_prefer_direct_exit(); + } if (hung_task_show_all_bt) { hung_task_show_all_bt = false; + printk_prefer_direct_enter(); trigger_all_cpu_backtrace(); + printk_prefer_direct_exit(); } if (hung_task_call_panic) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index fdf170404650..d9a5c1d65a79 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -258,7 +258,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, nodemask_t nodemsk = NODE_MASK_NONE; struct node_vectors *node_vectors; - if (!cpumask_weight(cpu_mask)) + if (cpumask_empty(cpu_mask)) return 0; nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 54af0deb239b..e6b8e564b37f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1573,17 +1573,12 @@ static struct device *irq_get_parent_device(struct irq_data *data) int irq_chip_pm_get(struct irq_data *data) { struct device *dev = irq_get_parent_device(data); - int retval; + int retval = 0; - if (IS_ENABLED(CONFIG_PM) && dev) { - retval = pm_runtime_get_sync(dev); - if (retval < 0) { - pm_runtime_put_noidle(dev); - return retval; - } - } + if (IS_ENABLED(CONFIG_PM) && dev) + retval = pm_runtime_resume_and_get(dev); - return 0; + return retval; } /** diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 2b43f5f5033d..bc8e40cf2b65 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -58,6 +58,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND), + BIT_MASK_DESCR(IRQCHIP_IMMUTABLE), }; static void diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 99cbdf55a8bd..f09c60393e55 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -29,12 +29,14 @@ extern struct irqaction chained_action; * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity * IRQTF_FORCED_THREAD - irq action is force threaded + * IRQTF_READY - signals that irq thread is ready */ enum { IRQTF_RUNTHREAD, IRQTF_WARNED, IRQTF_AFFINITY, IRQTF_FORCED_THREAD, + IRQTF_READY, }; /* diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 0cd02efa3a74..dd76323ea3fd 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -181,7 +181,7 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, goto err_free_bitmap; work_ctx->irq_count = num_irqs; - init_irq_work(&work_ctx->work, irq_sim_handle_irq); + work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq); return work_ctx->domain; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 939d21cd55c3..d323b180b0f3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -407,6 +407,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_rcu_head(&desc->rcu); + init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); @@ -575,6 +576,7 @@ int __init early_irq_init(void) raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); mutex_init(&desc[i].request_mutex); + init_waitqueue_head(&desc[i].wait_for_threads); desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); @@ -699,7 +701,6 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe); */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { - WARN_ON_ONCE(!in_hardirq()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c03f71d5ec10..8c396319d5ac 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -222,11 +222,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, { struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); + const struct cpumask *prog_mask; int ret; + static DEFINE_RAW_SPINLOCK(tmp_mask_lock); + static struct cpumask tmp_mask; + if (!chip || !chip->irq_set_affinity) return -EINVAL; + raw_spin_lock(&tmp_mask_lock); /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with @@ -248,24 +253,34 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, */ if (irqd_affinity_is_managed(data) && housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { - const struct cpumask *hk_mask, *prog_mask; - - static DEFINE_RAW_SPINLOCK(tmp_mask_lock); - static struct cpumask tmp_mask; + const struct cpumask *hk_mask; hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); - raw_spin_lock(&tmp_mask_lock); cpumask_and(&tmp_mask, mask, hk_mask); if (!cpumask_intersects(&tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = &tmp_mask; - ret = chip->irq_set_affinity(data, prog_mask, force); - raw_spin_unlock(&tmp_mask_lock); } else { - ret = chip->irq_set_affinity(data, mask, force); + prog_mask = mask; } + + /* + * Make sure we only provide online CPUs to the irqchip, + * unless we are being asked to force the affinity (in which + * case we do as we are told). + */ + cpumask_and(&tmp_mask, prog_mask, cpu_online_mask); + if (!force && !cpumask_empty(&tmp_mask)) + ret = chip->irq_set_affinity(data, &tmp_mask, force); + else if (force) + ret = chip->irq_set_affinity(data, mask, force); + else + ret = -EINVAL; + + raw_spin_unlock(&tmp_mask_lock); + switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: @@ -1249,6 +1264,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) } /* + * Internal function to notify that a interrupt thread is ready. + */ +static void irq_thread_set_ready(struct irq_desc *desc, + struct irqaction *action) +{ + set_bit(IRQTF_READY, &action->thread_flags); + wake_up(&desc->wait_for_threads); +} + +/* + * Internal function to wake up a interrupt thread and wait until it is + * ready. + */ +static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, + struct irqaction *action) +{ + if (!action || !action->thread) + return; + + wake_up_process(action->thread); + wait_event(desc->wait_for_threads, + test_bit(IRQTF_READY, &action->thread_flags)); +} + +/* * Interrupt handler thread */ static int irq_thread(void *data) @@ -1259,6 +1299,8 @@ static int irq_thread(void *data) irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + irq_thread_set_ready(desc, action); + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, @@ -1683,8 +1725,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - init_waitqueue_head(&desc->wait_for_threads); - /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, @@ -1780,14 +1820,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq_setup_timings(desc, new); - /* - * Strictly no need to wake it up, but hung_task complains - * when no hard interrupt wakes the thread up. - */ - if (new->thread) - wake_up_process(new->thread); - if (new->secondary) - wake_up_process(new->secondary->thread); + wake_up_and_wait_for_irq_thread_ready(desc, new); + wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index bbfb26489aa1..1698e77645ac 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -286,7 +286,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu) { - unsigned int bit, cpu, end = m->alloc_end; + unsigned int bit, cpu, end; struct cpumap *cm; if (cpumask_empty(msk)) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2bdfce5edafd..a9ee535293eb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -818,6 +818,21 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag irqd_clr_can_reserve(irqd); if (vflags & VIRQ_NOMASK_QUIRK) irqd_set_msi_nomask_quirk(irqd); + + /* + * If the interrupt is managed but no CPU is available to + * service it, shut it down until better times. Note that + * we only do this on the !RESERVE path as x86 (the only + * architecture using this flag) deals with this in a + * different way by using a catch-all vector. + */ + if ((vflags & VIRQ_ACTIVATE) && + irqd_affinity_is_managed(irqd) && + !cpumask_intersects(irq_data_get_affinity_mask(irqd), + cpu_online_mask)) { + irqd_set_managed_shutdown(irqd); + return 0; + } } if (!(vflags & VIRQ_ACTIVATE)) diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index a36fca063a73..dcec1b743c69 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc) else nthreads *= 2; - if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + if (!preempt_model_preemptible() || + !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { /* * Without any preemption, keep 2 CPUs free for other tasks, one * of which is the main test case function checking for * completion or failure. */ - const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; + const long min_unused_cpus = preempt_model_none() ? 2 : 0; const long min_required_cpus = 2 + min_unused_cpus; if (num_online_cpus() < min_required_cpus) { @@ -1565,14 +1566,6 @@ static void test_exit(struct kunit *test) torture_cleanup_end(); } -static struct kunit_suite kcsan_test_suite = { - .name = "kcsan", - .test_cases = kcsan_test_cases, - .init = test_init, - .exit = test_exit, -}; -static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; - __no_kcsan static void register_tracepoints(struct tracepoint *tp, void *ignore) { @@ -1588,11 +1581,7 @@ static void unregister_tracepoints(struct tracepoint *tp, void *ignore) tracepoint_probe_unregister(tp, probe_console, NULL); } -/* - * We only want to do tracepoints setup and teardown once, therefore we have to - * customize the init and exit functions and cannot rely on kunit_test_suite(). - */ -static int __init kcsan_test_init(void) +static int kcsan_suite_init(struct kunit_suite *suite) { /* * Because we want to be able to build the test as a module, we need to @@ -1600,18 +1589,25 @@ static int __init kcsan_test_init(void) * won't work here. */ for_each_kernel_tracepoint(register_tracepoints, NULL); - return __kunit_test_suites_init(kcsan_test_suites); + return 0; } -static void kcsan_test_exit(void) +static void kcsan_suite_exit(struct kunit_suite *suite) { - __kunit_test_suites_exit(kcsan_test_suites); for_each_kernel_tracepoint(unregister_tracepoints, NULL); tracepoint_synchronize_unregister(); } -late_initcall_sync(kcsan_test_init); -module_exit(kcsan_test_exit); +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kcsan_suite_init, + .suite_exit = kcsan_suite_exit, +}; + +kunit_test_suites(&kcsan_test_suite); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 68480f731192..be4b54c2c615 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1078,7 +1078,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + elf_core_copy_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index dbe57df2e199..dd58c0be9ce2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2126,7 +2126,7 @@ static void kretprobe_rethook_handler(struct rethook_node *rh, void *data, struct kprobe_ctlblk *kcb; /* The data must NOT be null. This means rethook data structure is broken. */ - if (WARN_ON_ONCE(!data)) + if (WARN_ON_ONCE(!data) || !rp->handler) return; __this_cpu_write(current_kprobe, &rp->kp); diff --git a/kernel/kthread.c b/kernel/kthread.c index 50265f69a135..544fd4097406 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1522,5 +1522,4 @@ struct cgroup_subsys_state *kthread_blkcg(void) } return NULL; } -EXPORT_SYMBOL(kthread_blkcg); #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c06cab6546ed..a6e671b8608d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -60,7 +60,6 @@ #include "lockdep_internals.h" -#define CREATE_TRACE_POINTS #include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING @@ -1380,7 +1379,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, u16 distance, u8 dep, + u16 distance, u8 dep, const struct lock_trace *trace) { struct lock_list *entry; @@ -3133,19 +3132,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * to the previous lock's dependency list: */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), - &hlock_class(prev)->locks_after, - next->acquire_ip, distance, - calc_dep(prev, next), - *trace); + &hlock_class(prev)->locks_after, distance, + calc_dep(prev, next), *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), - &hlock_class(next)->locks_before, - next->acquire_ip, distance, - calc_depb(prev, next), - *trace); + &hlock_class(next)->locks_before, distance, + calc_depb(prev, next), *trace); if (!ret) return 0; @@ -4236,14 +4231,13 @@ static void __trace_hardirqs_on_caller(void) /** * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts - * @ip: Caller address * * Invoked before a possible transition to RCU idle from exit to user or * guest mode. This ensures that all RCU operations are done before RCU * stops watching. After the RCU transition lockdep_hardirqs_on() has to be * invoked to set the final state. */ -void lockdep_hardirqs_on_prepare(unsigned long ip) +void lockdep_hardirqs_on_prepare(void) { if (unlikely(!debug_locks)) return; @@ -4840,8 +4834,7 @@ EXPORT_SYMBOL_GPL(__lockdep_no_validate__); static void print_lock_nested_lock_not_held(struct task_struct *curr, - struct held_lock *hlock, - unsigned long ip) + struct held_lock *hlock) { if (!debug_locks_off()) return; @@ -5017,7 +5010,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); if (nest_lock && !__lock_is_held(nest_lock, -1)) { - print_lock_nested_lock_not_held(curr, hlock, ip); + print_lock_nested_lock_not_held(curr, hlock); return 0; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5e3585950ec8..d973fe6041bf 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -30,6 +30,9 @@ #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/lock.h> + #ifndef CONFIG_PREEMPT_RT #include "mutex.h" @@ -599,12 +602,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); + trace_contention_end(lock, 0); preempt_enable(); return 0; } @@ -641,6 +646,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } set_current_state(state); + trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { bool first; @@ -680,10 +686,16 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock_or_handoff(lock, first) || - (first && mutex_optimistic_spin(lock, ww_ctx, &waiter))) + if (__mutex_trylock_or_handoff(lock, first)) break; + if (first) { + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) + break; + trace_contention_begin(lock, LCB_F_MUTEX); + } + raw_spin_lock(&lock->wait_lock); } raw_spin_lock(&lock->wait_lock); @@ -707,6 +719,7 @@ acquired: skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); + trace_contention_end(lock, 0); if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); @@ -719,6 +732,7 @@ err: __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: + trace_contention_end(lock, ret); raw_spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index c9fdae94e098..5fe4c5495ba3 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -9,6 +9,7 @@ #include <linux/sched/task.h> #include <linux/sched/debug.h> #include <linux/errno.h> +#include <trace/events/lock.h> int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *key) @@ -171,9 +172,11 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) if (try) return false; + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); percpu_rwsem_wait(sem, /* .reader = */ true); preempt_disable(); + trace_contention_end(sem, 0); return true; } @@ -216,6 +219,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); @@ -237,6 +241,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) /* Wait for all active readers to complete. */ rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); + trace_contention_end(sem, 0); } EXPORT_SYMBOL_GPL(percpu_down_write); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index ec36b73f4733..2e1600906c9f 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -12,10 +12,11 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/spinlock.h> +#include <trace/events/lock.h> /** - * queued_read_lock_slowpath - acquire read lock of a queue rwlock - * @lock: Pointer to queue rwlock structure + * queued_read_lock_slowpath - acquire read lock of a queued rwlock + * @lock: Pointer to queued rwlock structure */ void queued_read_lock_slowpath(struct qrwlock *lock) { @@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock) } atomic_sub(_QR_BIAS, &lock->cnts); + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ); + /* * Put the reader into the wait queue */ @@ -51,17 +54,21 @@ void queued_read_lock_slowpath(struct qrwlock *lock) * Signal the next one in queue to become queue head */ arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queued_write_lock_slowpath - acquire write lock of a queue rwlock - * @lock : Pointer to queue rwlock structure + * queued_write_lock_slowpath - acquire write lock of a queued rwlock + * @lock : Pointer to queued rwlock structure */ void queued_write_lock_slowpath(struct qrwlock *lock) { int cnts; + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE); + /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index cbff6ba53d56..65a9a10caa6f 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -22,6 +22,7 @@ #include <linux/prefetch.h> #include <asm/byteorder.h> #include <asm/qspinlock.h> +#include <trace/events/lock.h> /* * Include queued spinlock statistics code @@ -401,6 +402,8 @@ pv_queue: idx = node->count++; tail = encode_tail(smp_processor_id(), idx); + trace_contention_begin(lock, LCB_F_SPIN); + /* * 4 nodes are allocated based on the assumption that there will * not be nested NMIs taking spinlocks. That may not be true in @@ -554,6 +557,8 @@ locked: pv_kick_node(lock, next); release: + trace_contention_end(lock, 0); + /* * release the node */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8555c4efe97c..7779ee8abc2a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -24,6 +24,8 @@ #include <linux/sched/wake_q.h> #include <linux/ww_mutex.h> +#include <trace/events/lock.h> + #include "rtmutex_common.h" #ifndef WW_RT @@ -1579,6 +1581,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, set_current_state(state); + trace_contention_begin(lock, LCB_F_RT); + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); if (likely(!ret)) ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); @@ -1601,6 +1605,9 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, * unconditionally. We might have to fix that up. */ fixup_rt_mutex_waiters(lock); + + trace_contention_end(lock, ret); + return ret; } @@ -1683,6 +1690,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) /* Save current state and set state to TASK_RTLOCK_WAIT */ current_save_and_set_rtlock_wait_state(); + trace_contention_begin(lock, LCB_F_RT); + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); for (;;) { @@ -1712,6 +1721,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) */ fixup_rt_mutex_waiters(lock); debug_rt_mutex_free_waiter(&waiter); + + trace_contention_end(lock, 0); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 6fd3162e4098..c201aadb9301 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -112,6 +112,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, * Reader2 to call up_read(), which might be unbound. */ + trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ); + /* * For rwlocks this returns 0 unconditionally, so the below * !ret conditionals are optimized out. @@ -130,6 +132,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, raw_spin_unlock_irq(&rtm->wait_lock); if (!ret) rwbase_rtmutex_unlock(rtm); + + trace_contention_end(rwb, ret); return ret; } @@ -247,11 +251,13 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, goto out_unlock; rwbase_set_and_save_current_state(state); + trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE); for (;;) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); + trace_contention_end(rwb, -EINTR); return -EINTR; } @@ -265,6 +271,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, set_current_state(state); } rwbase_restore_current_state(); + trace_contention_end(rwb, 0); out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index acde5d6f1254..9d1db4a54d34 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <trace/events/lock.h> #ifndef CONFIG_PREEMPT_RT #include "lock_events.h" @@ -375,16 +376,19 @@ rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) * * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of * this function. Modify with care. + * + * Return: true if wait_list isn't empty and false otherwise */ -static inline void +static inline bool rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { lockdep_assert_held(&sem->wait_lock); list_del(&waiter->list); if (likely(!list_empty(&sem->wait_list))) - return; + return true; atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); + return false; } /* @@ -559,6 +563,33 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, } /* + * Remove a waiter and try to wake up other waiters in the wait queue + * This function is called from the out_nolock path of both the reader and + * writer slowpaths with wait_lock held. It releases the wait_lock and + * optionally wake up waiters before it returns. + */ +static inline void +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, + struct wake_q_head *wake_q) + __releases(&sem->wait_lock) +{ + bool first = rwsem_first_waiter(sem) == waiter; + + wake_q_init(wake_q); + + /* + * If the wait_list isn't empty and the waiter to be deleted is + * the first waiter, we wake up the remaining waiters as they may + * be eligible to acquire or spin on the lock. + */ + if (rwsem_del_waiter(sem, waiter) && first) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + if (!wake_q_empty(wake_q)) + wake_up_q(wake_q); +} + +/* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. @@ -901,7 +932,7 @@ done: */ static inline void clear_nonspinnable(struct rw_semaphore *sem) { - if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) + if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))) atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner); } @@ -926,6 +957,31 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) #endif /* + * Prepare to wake up waiter(s) in the wait queue by putting them into the + * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely + * reader-owned, wake up read lock waiters in queue front or wake up any + * front waiter otherwise. + + * This is being called from both reader and writer slow paths. + */ +static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count, + struct wake_q_head *wake_q) +{ + enum rwsem_wake_type wake_type; + + if (count & RWSEM_WRITER_MASK) + return; + + if (count & RWSEM_READER_MASK) { + wake_type = RWSEM_WAKE_READERS; + } else { + wake_type = RWSEM_WAKE_ANY; + clear_nonspinnable(sem); + } + rwsem_mark_wake(sem, wake_type, wake_q); +} + +/* * Wait for the read lock to be granted */ static struct rw_semaphore __sched * @@ -935,7 +991,6 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); - bool wake = false; /* * To prevent a constant stream of readers from starving a sleeping @@ -977,12 +1032,11 @@ queue: if (list_empty(&sem->wait_list)) { /* * In case the wait queue is empty and the lock isn't owned - * by a writer or has the handoff bit set, this reader can - * exit the slowpath and return immediately as its - * RWSEM_READER_BIAS has already been set in the count. + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_READER_BIAS has already been set + * in the count. */ - if (!(atomic_long_read(&sem->count) & - (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { /* Provide lock ACQUIRE */ smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); @@ -997,22 +1051,13 @@ queue: /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (!(count & RWSEM_LOCK_MASK)) { - clear_nonspinnable(sem); - wake = true; - } - if (wake || (!(count & RWSEM_WRITER_MASK) && - (adjustment & RWSEM_FLAG_WAITERS))) - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - + rwsem_cond_wake_waiter(sem, count, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); + + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); + + trace_contention_begin(sem, LCB_F_READ); /* wait to be given the lock */ for (;;) { @@ -1035,13 +1080,14 @@ queue: __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); + trace_contention_end(sem, 0); return sem; out_nolock: - rwsem_del_waiter(sem, &waiter); - raw_spin_unlock_irq(&sem->wait_lock); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); + trace_contention_end(sem, -EINTR); return ERR_PTR(-EINTR); } @@ -1051,7 +1097,6 @@ out_nolock: static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - long count; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); @@ -1075,23 +1120,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* we're now waiting on the lock */ if (rwsem_first_waiter(sem) != &waiter) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and: - * 1) there are no active locks, wake the front - * queued process(es) as the handoff bit might be set. - * 2) there are no active writers and some readers, the lock - * must be read owned; so we try to wake any read lock - * waiters that were queued ahead of us. - */ - if (count & RWSEM_WRITER_MASK) - goto wait; - - rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) - ? RWSEM_WAKE_READERS - : RWSEM_WAKE_ANY, &wake_q); - + rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), + &wake_q); if (!wake_q_empty(&wake_q)) { /* * We want to minimize wait_lock hold time especially @@ -1099,16 +1129,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) */ raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); - wake_q_init(&wake_q); /* Used again, reinit */ raw_spin_lock_irq(&sem->wait_lock); } } else { atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } -wait: /* wait until we successfully acquire the lock */ set_current_state(state); + trace_contention_begin(sem, LCB_F_WRITE); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1148,17 +1178,15 @@ trylock_again: __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); + trace_contention_end(sem, 0); return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - rwsem_del_waiter(sem, &waiter); - if (!list_empty(&sem->wait_list)) - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); lockevent_inc(rwsem_wlock_fail); + trace_contention_end(sem, -EINTR); return ERR_PTR(-EINTR); } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 9ee381e4d2a4..f2654d2fe43a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -32,6 +32,7 @@ #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> +#include <trace/events/lock.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -205,7 +206,7 @@ struct semaphore_waiter { * constant, and thus optimised away by the compiler. Likewise the * 'timeout' parameter for the cases without timeouts. */ -static inline int __sched __down_common(struct semaphore *sem, long state, +static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { struct semaphore_waiter waiter; @@ -236,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state, return -EINTR; } +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + int ret; + + trace_contention_begin(sem, 0); + ret = ___down_common(sem, state, timeout); + trace_contention_end(sem, ret); + + return ret; +} + static noinline void __sched __down(struct semaphore *sem) { __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); diff --git a/kernel/panic.c b/kernel/panic.c index eb4dfb932c85..6737b2332275 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -48,7 +48,7 @@ unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = - IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; + IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -579,6 +579,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); + printk_prefer_direct_enter(); + if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -608,6 +610,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); + + printk_prefer_direct_exit(); } #ifndef __WARN_FLAGS diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 5899260a8bef..874ad834dc8d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +ifeq ($(CONFIG_DYNAMIC_DEBUG), y) +CFLAGS_swap.o := -DDEBUG +CFLAGS_snapshot.o := -DDEBUG +CFLAGS_energy_model.o := -DDEBUG +endif KASAN_SANITIZE_snapshot.o := n diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0153b0ca7b23..6c373f2960e7 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static int em_debug_units_show(struct seq_file *s, void *unused) +static int em_debug_flags_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? - "milliWatts" : "bogoWatts"; - seq_printf(s, "%s\n", units); + seq_printf(s, "%#lx\n", pd->flags); return 0; } -DEFINE_SHOW_ATTRIBUTE(em_debug_units); - -static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) -{ - struct em_perf_domain *pd = s->private; - int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; - - seq_printf(s, "%d\n", enabled); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); +DEFINE_SHOW_ATTRIBUTE(em_debug_flags); static void em_debug_create_pd(struct device *dev) { @@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); - debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); - debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, - &em_debug_skip_inefficiencies_fops); + debugfs_create_file("flags", 0444, d, dev->em_pd, + &em_debug_flags_fops); /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) @@ -121,7 +107,8 @@ static void em_debug_remove_pd(struct device *dev) {} #endif static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, - int nr_states, struct em_data_callback *cb) + int nr_states, struct em_data_callback *cb, + unsigned long flags) { unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; struct em_perf_state *table; @@ -139,7 +126,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); @@ -173,10 +160,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = nr_states - 1; i >= 0; i--) { - unsigned long power_res = em_scale_power(table[i].power); + unsigned long power_res, cost; + + if (flags & EM_PERF_DOMAIN_ARTIFICIAL) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + goto free_ps_table; + } + } else { + power_res = em_scale_power(table[i].power); + cost = div64_u64(fmax * power_res, table[i].frequency); + } + + table[i].cost = cost; - table[i].cost = div64_u64(fmax * power_res, - table[i].frequency); if (table[i].cost >= prev_cost) { table[i].flags = EM_PERF_STATE_INEFFICIENT; dev_dbg(dev, "EM: OPP:%lu is inefficient\n", @@ -197,7 +196,8 @@ free_ps_table: } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + unsigned long flags) { struct em_perf_domain *pd; struct device *cpu_dev; @@ -215,7 +215,7 @@ static int em_create_pd(struct device *dev, int nr_states, return -ENOMEM; } - ret = em_create_perf_table(dev, pd, nr_states, cb); + ret = em_create_perf_table(dev, pd, nr_states, cb, flags); if (ret) { kfree(pd); return ret; @@ -259,6 +259,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev) found++; } + cpufreq_cpu_put(policy); + if (!found) return; @@ -332,6 +334,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, bool milliwatts) { unsigned long cap, prev_cap = 0; + unsigned long flags = 0; int cpu, ret; if (!dev || !nr_states || !cb) @@ -378,12 +381,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } } - ret = em_create_pd(dev, nr_states, cb, cpus); + if (milliwatts) + flags |= EM_PERF_DOMAIN_MILLIWATTS; + else if (cb->get_cost) + flags |= EM_PERF_DOMAIN_ARTIFICIAL; + + ret = em_create_pd(dev, nr_states, cb, cpus, flags); if (ret) goto unlock; - if (milliwatts) - dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + dev->em_pd->flags |= flags; em_cpufreq_update_efficiencies(dev); diff --git a/kernel/power/main.c b/kernel/power/main.c index 7e646079fbeb..5242bf2ee469 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -545,35 +545,6 @@ static int __init pm_debug_messages_setup(char *str) } __setup("pm_debug_messages", pm_debug_messages_setup); -/** - * __pm_pr_dbg - Print a suspend debug message to the kernel log. - * @defer: Whether or not to use printk_deferred() to print the message. - * @fmt: Message format. - * - * The message will be emitted if enabled through the pm_debug_messages - * sysfs attribute. - */ -void __pm_pr_dbg(bool defer, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - if (!pm_debug_messages_on) - return; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - if (defer) - printk_deferred(KERN_DEBUG "PM: %pV", &vaf); - else - printk(KERN_DEBUG "PM: %pV", &vaf); - - va_end(args); -} - #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 11b570fcf049..3068601e585a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -6,9 +6,6 @@ * Originally from swsusp. */ - -#undef DEBUG - #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 330d49937692..2a406753af90 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -326,7 +326,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -/** +/* * Data types related to memory bitmaps. * * Memory bitmap is a structure consisting of many linked lists of @@ -427,6 +427,10 @@ struct memory_bitmap { /** * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages not used before hibernation (restore only) + * @ca: Pointer to a linked list of pages ("a chain") to allocate from + * @list: Radix Tree node to add. * * This function is used to allocate inner nodes as well as the * leave nodes of the radix tree. It also adds the node to the @@ -902,7 +906,7 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /** - * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * memory_bm_next_pfn - Find the next set bit in a memory bitmap. * @bm: Memory bitmap. * * Starting from the last returned position this function searches for the next @@ -1937,7 +1941,7 @@ static inline int get_highmem_buffer(int safe_needed) } /** - * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * alloc_highmem_pages - Allocate some highmem pages for the image. * * Try to allocate as many pages as needed, but if the number of free highmem * pages is less than that, allocate them all. @@ -2224,7 +2228,7 @@ static int check_header(struct swsusp_info *info) } /** - * load header - Check the image header and copy the data from it. + * load_header - Check the image header and copy the data from it. */ static int load_header(struct swsusp_info *info) { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index da03c15ecc89..a3e1035929b0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -224,6 +224,33 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, static int nr_ext_console_drivers; /* + * Used to synchronize printing kthreads against direct printing via + * console_trylock/console_unlock. + * + * Values: + * -1 = console kthreads atomically blocked (via global trylock) + * 0 = no kthread printing, console not locked (via trylock) + * >0 = kthread(s) actively printing + * + * Note: For synchronizing against direct printing via + * console_lock/console_unlock, see the @lock variable in + * struct console. + */ +static atomic_t console_kthreads_active = ATOMIC_INIT(0); + +#define console_kthreads_atomic_tryblock() \ + (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0) +#define console_kthreads_atomic_unblock() \ + atomic_cmpxchg(&console_kthreads_active, -1, 0) +#define console_kthreads_atomically_blocked() \ + (atomic_read(&console_kthreads_active) == -1) + +#define console_kthread_printing_tryenter() \ + atomic_inc_unless_negative(&console_kthreads_active) +#define console_kthread_printing_exit() \ + atomic_dec(&console_kthreads_active) + +/* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ @@ -271,19 +298,49 @@ static bool panic_in_progress(void) } /* - * This is used for debugging the mess that is the VT code by - * keeping track if we have the console semaphore held. It's - * definitely not the perfect debug tool (we don't know if _WE_ - * hold it and are racing, but it helps tracking those weird code - * paths in the console code where we end up in places I want - * locked without the console semaphore held). + * Tracks whether kthread printers are all blocked. A value of true implies + * that the console is locked via console_lock() or the console is suspended. + * Writing to this variable requires holding @console_sem. */ -static int console_locked, console_suspended; +static bool console_kthreads_blocked; + +/* + * Block all kthread printers from a schedulable context. + * + * Requires holding @console_sem. + */ +static void console_kthreads_block(void) +{ + struct console *con; + + for_each_console(con) { + mutex_lock(&con->lock); + con->blocked = true; + mutex_unlock(&con->lock); + } + + console_kthreads_blocked = true; +} /* - * If exclusive_console is non-NULL then only this console is to be printed to. + * Unblock all kthread printers from a schedulable context. + * + * Requires holding @console_sem. */ -static struct console *exclusive_console; +static void console_kthreads_unblock(void) +{ + struct console *con; + + for_each_console(con) { + mutex_lock(&con->lock); + con->blocked = false; + mutex_unlock(&con->lock); + } + + console_kthreads_blocked = false; +} + +static int console_suspended; /* * Array of consoles built from command line options (console=) @@ -366,7 +423,75 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); +/* + * A flag to signify if printk_activate_kthreads() has already started the + * kthread printers. If true, any later registered consoles must start their + * own kthread directly. The flag is write protected by the console_lock. + */ +static bool printk_kthreads_available; + #ifdef CONFIG_PRINTK +static atomic_t printk_prefer_direct = ATOMIC_INIT(0); + +/** + * printk_prefer_direct_enter - cause printk() calls to attempt direct + * printing to all enabled consoles + * + * Since it is not possible to call into the console printing code from any + * context, there is no guarantee that direct printing will occur. + * + * This globally effects all printk() callers. + * + * Context: Any context. + */ +void printk_prefer_direct_enter(void) +{ + atomic_inc(&printk_prefer_direct); +} + +/** + * printk_prefer_direct_exit - restore printk() behavior + * + * Context: Any context. + */ +void printk_prefer_direct_exit(void) +{ + WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0); +} + +/* + * Calling printk() always wakes kthread printers so that they can + * flush the new message to their respective consoles. Also, if direct + * printing is allowed, printk() tries to flush the messages directly. + * + * Direct printing is allowed in situations when the kthreads + * are not available or the system is in a problematic state. + * + * See the implementation about possible races. + */ +static inline bool allow_direct_printing(void) +{ + /* + * Checking kthread availability is a possible race because the + * kthread printers can become permanently disabled during runtime. + * However, doing that requires holding the console_lock, so any + * pending messages will be direct printed by console_unlock(). + */ + if (!printk_kthreads_available) + return true; + + /* + * Prefer direct printing when the system is in a problematic state. + * The context that sets this state will always see the updated value. + * The other contexts do not care. Anyway, direct printing is just a + * best effort. The direct output is only possible when console_lock + * is not already taken and no kthread printers are actively printing. + */ + return (system_state > SYSTEM_RUNNING || + oops_in_progress || + atomic_read(&printk_prefer_direct)); +} + DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -374,12 +499,6 @@ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; -/* All 3 protected by @console_sem. */ -/* the next printk record to write to the console */ -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; - struct latched_seq { seqcount_latch_t latch; u64 val[2]; @@ -405,6 +524,9 @@ static struct latched_seq clear_seq = { /* the maximum size of a formatted record (i.e. with prefix added per line) */ #define CONSOLE_LOG_MAX 1024 +/* the maximum size for a dropped text message */ +#define DROPPED_TEXT_MAX 64 + /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) @@ -746,8 +868,19 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, goto out; } + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with __wake_up_klogd:A. + */ ret = wait_event_interruptible(log_wait, - prb_read_valid(prb, atomic64_read(&user->seq), r)); + prb_read_valid(prb, + atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } @@ -1513,7 +1646,18 @@ static int syslog_print(char __user *buf, int size) seq = syslog_seq; mutex_unlock(&syslog_lock); - len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with __wake_up_klogd:A. + */ + len = wait_event_interruptible(log_wait, + prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) @@ -1911,47 +2055,24 @@ static int console_trylock_spinning(void) } /* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. + * Call the specified console driver, asking it to write out the specified + * text and length. If @dropped_text is non-NULL and any records have been + * dropped, a dropped message will be written out first. */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text) { - static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; - - trace_console_rcuidle(text, len); + size_t dropped_len; - if (!console_drivers) - return; - - if (console_dropped) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), + if (con->dropped && dropped_text) { + dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; + con->dropped); + con->dropped = 0; + con->write(con, dropped_text, dropped_len); } - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); - else { - if (dropped_len) - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } - } + con->write(con, text, len); } /* @@ -2026,8 +2147,10 @@ static u8 *__printk_recursion_counter(void) int printk_delay_msec __read_mostly; -static inline void printk_delay(void) +static inline void printk_delay(int level) { + boot_delay_msec(level); + if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; @@ -2041,7 +2164,7 @@ static inline void printk_delay(void) static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : - 0x80000000 + raw_smp_processor_id(); + 0x80000000 + smp_processor_id(); } /** @@ -2115,6 +2238,8 @@ static u16 printk_sprint(char *text, u16 size, int facility, } } + trace_console_rcuidle(text, text_len); + return text_len; } @@ -2123,7 +2248,6 @@ int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { - const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; @@ -2133,10 +2257,14 @@ int vprintk_store(int facility, int level, u8 *recursion_ptr; u16 reserve_size; va_list args2; + u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; + if (!printk_enter_irqsave(recursion_ptr, irqflags)) + return 0; + /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is @@ -2145,8 +2273,7 @@ int vprintk_store(int facility, int level, */ ts_nsec = local_clock(); - if (!printk_enter_irqsave(recursion_ptr, irqflags)) - return 0; + caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be @@ -2250,23 +2377,25 @@ asmlinkage int vprintk_emit(int facility, int level, in_sched = true; } - boot_delay_msec(level); - printk_delay(); + printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (!in_sched && allow_direct_printing()) { /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console + * The caller may be holding system-critical or + * timing-sensitive locks. Disable preemption during direct + * printing of all remaining records to all consoles so that + * this context can return as soon as possible. Hopefully + * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. + * semaphore. The release will print out buffers. With the + * spinning variant, this context tries to take over the + * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); @@ -2297,18 +2426,21 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); + +static void printk_start_kthread(struct console *con); + #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 +#define DROPPED_TEXT_MAX 0 #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 +#define prb_next_seq(rb) 0 static u64 syslog_seq; -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; static size_t record_print_text(const struct printk_record *r, bool syslog, bool time) @@ -2325,9 +2457,14 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text) +{ +} static bool suppress_message_printing(int level) { return false; } +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } +static void printk_start_kthread(struct console *con) { } +static bool allow_direct_printing(void) { return true; } #endif /* CONFIG_PRINTK */ @@ -2515,6 +2652,7 @@ void suspend_console(void) if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_flush(1000, true); console_lock(); console_suspended = 1; up_console_sem(); @@ -2527,6 +2665,7 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); + pr_flush(1000, true); } /** @@ -2544,6 +2683,14 @@ static int console_cpu_notify(unsigned int cpu) /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); + else { + /* + * If a new CPU comes online, the conditions for + * printer_should_wake() may have changed for some + * kthread printer with !CON_ANYTIME. + */ + wake_up_klogd(); + } } return 0; } @@ -2563,7 +2710,7 @@ void console_lock(void) down_console_sem(); if (console_suspended) return; - console_locked = 1; + console_kthreads_block(); console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); @@ -2584,33 +2731,32 @@ int console_trylock(void) up_console_sem(); return 0; } - console_locked = 1; + if (!console_kthreads_atomic_tryblock()) { + up_console_sem(); + return 0; + } console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); -int is_console_locked(void) -{ - return console_locked; -} -EXPORT_SYMBOL(is_console_locked); - /* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. + * This is used to help to make sure that certain paths within the VT code are + * running with the console lock held. It is definitely not the perfect debug + * tool (it is not known if the VT code is the task holding the console lock), + * but it helps tracking those weird code paths in the console code such as + * when the console is suspended: where the console is not locked but no + * console printing may occur. + * + * Note: This returns true when the console is suspended but is not locked. + * This is intentional because the VT code must consider that situation + * the same as if the console was locked. */ -static int have_callable_console(void) +int is_console_locked(void) { - struct console *con; - - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; - - return 0; + return (console_kthreads_blocked || atomic_read(&console_kthreads_active)); } +EXPORT_SYMBOL(is_console_locked); /* * Return true when this CPU should unlock console_sem without pushing all @@ -2631,132 +2777,125 @@ static bool abandon_console_lock_in_panic(void) return atomic_read(&panic_cpu) != raw_smp_processor_id(); } +static inline bool __console_is_usable(short flags) +{ + if (!(flags & CON_ENABLED)) + return false; + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && + !(flags & CON_ANYTIME)) + return false; + + return true; +} + /* - * Can we actually use the console at this time on this cpu? + * Check if the given console is currently capable and allowed to print + * records. * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. + * Requires holding the console_lock. */ -static inline int can_use_console(void) +static inline bool console_is_usable(struct console *con) { - return cpu_online(raw_smp_processor_id()) || have_callable_console(); + if (!con->write) + return false; + + return __console_is_usable(con->flags); } -/** - * console_unlock - unlock the console system +static void __console_unlock(void) +{ + /* + * Depending on whether console_lock() or console_trylock() was used, + * appropriately allow the kthread printers to continue. + */ + if (console_kthreads_blocked) + console_kthreads_unblock(); + else + console_kthreads_atomic_unblock(); + + /* + * New records may have arrived while the console was locked. + * Wake the kthread printers to print them. + */ + wake_up_klogd(); + + up_console_sem(); +} + +/* + * Print one record for the given console. The record printed is whatever + * record is the next available record for the given console. * - * Releases the console_lock which the caller holds on the console system - * and the console driver list. + * @text is a buffer of size CONSOLE_LOG_MAX. * - * While the console_lock was held, console output may have been buffered - * by printk(). If this is the case, console_unlock(); emits - * the output prior to releasing the lock. + * If extended messages should be printed, @ext_text is a buffer of size + * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL. * - * If there is output waiting, we wake /dev/kmsg and syslog() users. + * If dropped messages should be printed, @dropped_text is a buffer of size + * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL. * - * console_unlock(); may be called from any context. + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. A NULL pointer may be provided + * to disable allowing the console_lock to be taken over by a printk waiter. + * + * Returns false if the given console has no next record to print, otherwise + * true. + * + * Requires the console_lock if @handover is non-NULL. + * Requires con->lock otherwise. */ -void console_unlock(void) +static bool __console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; - static int panic_console_dropped; - unsigned long flags; - bool do_cond_resched, retry; + static atomic_t panic_console_dropped = ATOMIC_INIT(0); struct printk_info info; struct printk_record r; - u64 __maybe_unused next_seq; - - if (console_suspended) { - up_console_sem(); - return; - } - - prb_rec_init_rd(&r, &info, text, sizeof(text)); - - /* - * Console drivers are called with interrupts disabled, so - * @console_may_schedule should be cleared before; however, we may - * end up dumping a lot of lines, for example, if called from - * console registration path, and should invoke cond_resched() - * between lines if allowable. Not doing so can cause a very long - * scheduling stall on a slow console leading to RCU stall and - * softlockup warnings which exacerbate the issue with more - * messages practically incapacitating the system. - * - * console_trylock() is not able to detect the preemptive - * context reliably. Therefore the value must be stored before - * and cleared after the "again" goto label. - */ - do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; - - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME - * console. - */ - if (!can_use_console()) { - console_locked = 0; - up_console_sem(); - return; - } + unsigned long flags; + char *write_text; + size_t len; - for (;;) { - size_t ext_len = 0; - int handover; - size_t len; + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); -skip: - if (!prb_read_valid(prb, console_seq, &r)) - break; + if (handover) + *handover = false; - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; - if (panic_in_progress() && panic_console_dropped++ > 10) { - suppress_panic_printk = 1; - pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); - } - } + if (!prb_read_valid(prb, con->seq, &r)) + return false; - if (suppress_message_printing(r.info->level)) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. - */ - console_seq++; - goto skip; + if (con->seq != r.info->seq) { + con->dropped += r.info->seq - con->seq; + con->seq = r.info->seq; + if (panic_in_progress() && + atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) { + suppress_panic_printk = 1; + pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); } + } - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } + /* Skip record that has level above the console loglevel. */ + if (suppress_message_printing(r.info->level)) { + con->seq++; + goto skip; + } - /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. - */ - if (nr_ext_console_drivers) { - ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), - r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, - &r.text_buf[0], - r.info->text_len, - &r.info->dev_info); - } - len = record_print_text(&r, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; + if (ext_text) { + write_text = ext_text; + len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info); + len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len, + &r.text_buf[0], r.info->text_len, &r.info->dev_info); + } else { + write_text = text; + len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + } + if (handover) { /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to @@ -2770,38 +2909,187 @@ skip: printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(ext_text, ext_len, text, len); - start_critical_timings(); + /* don't trace irqsoff print latency */ + stop_critical_timings(); + } - handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - if (handover) - return; + call_console_driver(con, write_text, len, dropped_text); - /* Allow panic_cpu to take over the consoles safely */ - if (abandon_console_lock_in_panic()) - break; + con->seq++; + + if (handover) { + start_critical_timings(); + *handover = console_lock_spinning_disable_and_check(); + printk_safe_exit_irqrestore(flags); + } +skip: + return true; +} - if (do_cond_resched) - cond_resched(); +/* + * Print a record for a given console, but allow another printk() caller to + * take over the console_lock and continue printing. + * + * Requires the console_lock, but depending on @handover after the call, the + * caller may no longer have the console_lock. + * + * See __console_emit_next_record() for argument and return details. + */ +static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) +{ + /* + * Handovers are only supported if threaded printers are atomically + * blocked. The context taking over the console_lock may be atomic. + */ + if (!console_kthreads_atomically_blocked()) { + *handover = false; + handover = NULL; } - /* Get consistent value of the next-to-be-used sequence number. */ - next_seq = console_seq; + return __console_emit_next_record(con, text, ext_text, dropped_text, handover); +} - console_locked = 0; - up_console_sem(); +/* + * Print out all remaining records to all consoles. + * + * @do_cond_resched is set by the caller. It can be true only in schedulable + * context. + * + * @next_seq is set to the sequence number after the last available record. + * The value is valid only when this function returns true. It means that all + * usable consoles are completely flushed. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns true when there was at least one usable console and all messages + * were flushed to all usable consoles. A returned false informs the caller + * that everything was not flushed (either there were no usable consoles or + * another context has taken over printing or it is a panic situation and this + * is not the panic CPU or direct printing is not preferred). Regardless the + * reason, the caller should assume it is not useful to immediately try again. + * + * Requires the console_lock. + */ +static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +{ + static char dropped_text[DROPPED_TEXT_MAX]; + static char ext_text[CONSOLE_EXT_LOG_MAX]; + static char text[CONSOLE_LOG_MAX]; + bool any_usable = false; + struct console *con; + bool any_progress; + + *next_seq = 0; + *handover = false; + + do { + /* Let the kthread printers do the work if they can. */ + if (!allow_direct_printing()) + return false; + + any_progress = false; + + for_each_console(con) { + bool progress; + + if (!console_is_usable(con)) + continue; + any_usable = true; + + if (con->flags & CON_EXTENDED) { + /* Extended consoles do not print "dropped messages". */ + progress = console_emit_next_record_transferable(con, &text[0], + &ext_text[0], NULL, handover); + } else { + progress = console_emit_next_record_transferable(con, &text[0], + NULL, &dropped_text[0], handover); + } + if (*handover) + return false; + + /* Track the next of the highest seq flushed. */ + if (con->seq > *next_seq) + *next_seq = con->seq; + + if (!progress) + continue; + any_progress = true; + + /* Allow panic_cpu to take over the consoles safely. */ + if (abandon_console_lock_in_panic()) + return false; + + if (do_cond_resched) + cond_resched(); + } + } while (any_progress); + + return any_usable; +} + +/** + * console_unlock - unlock the console system + * + * Releases the console_lock which the caller holds on the console system + * and the console driver list. + * + * While the console_lock was held, console output may have been buffered + * by printk(). If this is the case, console_unlock(); emits + * the output prior to releasing the lock. + * + * console_unlock(); may be called from any context. + */ +void console_unlock(void) +{ + bool do_cond_resched; + bool handover; + bool flushed; + u64 next_seq; + + if (console_suspended) { + up_console_sem(); + return; + } /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. + * Console drivers are called with interrupts disabled, so + * @console_may_schedule should be cleared before; however, we may + * end up dumping a lot of lines, for example, if called from + * console registration path, and should invoke cond_resched() + * between lines if allowable. Not doing so can cause a very long + * scheduling stall on a slow console leading to RCU stall and + * softlockup warnings which exacerbate the issue with more + * messages practically incapacitating the system. Therefore, create + * a local to use for the printing loop. */ - retry = prb_read_valid(prb, next_seq, NULL); - if (retry && !abandon_console_lock_in_panic() && console_trylock()) - goto again; + do_cond_resched = console_may_schedule; + + do { + console_may_schedule = 0; + + flushed = console_flush_all(do_cond_resched, &next_seq, &handover); + if (!handover) + __console_unlock(); + + /* + * Abort if there was a failure to flush all messages to all + * usable consoles. Either it is not possible to flush (in + * which case it would be an infinite loop of retrying) or + * another context has taken over printing. + */ + if (!flushed) + break; + + /* + * Some context may have added new records after + * console_flush_all() but before unlocking the console. + * Re-check if there is a new record to flush. If the trylock + * fails, another context is already handling the printing. + */ + } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } EXPORT_SYMBOL(console_unlock); @@ -2832,15 +3120,21 @@ void console_unblank(void) if (oops_in_progress) { if (down_trylock_console_sem() != 0) return; + if (!console_kthreads_atomic_tryblock()) { + up_console_sem(); + return; + } } else console_lock(); - console_locked = 1; console_may_schedule = 0; for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); console_unlock(); + + if (!oops_in_progress) + pr_flush(1000, true); } /** @@ -2861,8 +3155,14 @@ void console_flush_on_panic(enum con_flush_mode mode) console_trylock(); console_may_schedule = 0; - if (mode == CONSOLE_REPLAY_ALL) - console_seq = prb_first_valid_seq(prb); + if (mode == CONSOLE_REPLAY_ALL) { + struct console *c; + u64 seq; + + seq = prb_first_valid_seq(prb); + for_each_console(c) + c->seq = seq; + } console_unlock(); } @@ -2893,6 +3193,7 @@ struct tty_driver *console_device(int *index) */ void console_stop(struct console *console) { + __pr_flush(console, 1000, true); console_lock(); console->flags &= ~CON_ENABLED; console_unlock(); @@ -2904,6 +3205,7 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -2990,6 +3292,11 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } +#define con_printk(lvl, con, fmt, ...) \ + printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \ + (con->flags & CON_BOOT) ? "boot" : "", \ + con->name, con->index, ##__VA_ARGS__) + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -3097,27 +3404,24 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; - if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - * - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - * - * Set exclusive_console with disabled interrupts to reduce - * race window with eventual console_flush_on_panic() that - * ignores console_lock. - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; + newcon->dropped = 0; + newcon->thread = NULL; + newcon->blocked = true; + mutex_init(&newcon->lock); + if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); - console_seq = syslog_seq; + newcon->seq = syslog_seq; mutex_unlock(&syslog_lock); + } else { + /* Begin with next message. */ + newcon->seq = prb_next_seq(prb); } + + if (printk_kthreads_available) + printk_start_kthread(newcon); + console_unlock(); console_sysfs_notify(); @@ -3128,9 +3432,7 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - pr_info("%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); + con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_enabled && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { @@ -3146,12 +3448,11 @@ EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { + struct task_struct *thd; struct console *con; int res; - pr_info("%sconsole [%s%d] disabled\n", - (console->flags & CON_BOOT) ? "boot" : "" , - console->name, console->index); + con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) @@ -3188,7 +3489,20 @@ int unregister_console(struct console *console) console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; + + /* + * console->thread can only be cleared under the console lock. But + * stopping the thread must be done without the console lock. The + * task that clears @thread is the task that stops the kthread. + */ + thd = console->thread; + console->thread = NULL; + console_unlock(); + + if (thd) + kthread_stop(thd); + console_sysfs_notify(); if (console->exit) @@ -3284,12 +3598,294 @@ static int __init printk_late_init(void) } late_initcall(printk_late_init); +static int __init printk_activate_kthreads(void) +{ + struct console *con; + + console_lock(); + printk_kthreads_available = true; + for_each_console(con) + printk_start_kthread(con); + console_unlock(); + + return 0; +} +early_initcall(printk_activate_kthreads); + #if defined CONFIG_PRINTK +/* If @con is specified, only wait for that console. Otherwise wait for all. */ +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) +{ + int remaining = timeout_ms; + struct console *c; + u64 last_diff = 0; + u64 printk_seq; + u64 diff; + u64 seq; + + might_sleep(); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + + console_lock(); + for_each_console(c) { + if (con && con != c) + continue; + if (!console_is_usable(c)) + continue; + printk_seq = c->seq; + if (printk_seq < seq) + diff += seq - printk_seq; + } + console_unlock(); + + if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; + + if (diff == 0 || remaining == 0) + break; + + if (remaining < 0) { + /* no timeout limit */ + msleep(100); + } else if (remaining < 100) { + msleep(remaining); + remaining = 0; + } else { + msleep(100); + remaining -= 100; + } + + last_diff = diff; + } + + return (diff == 0); +} + +/** + * pr_flush() - Wait for printing threads to catch up. + * + * @timeout_ms: The maximum time (in ms) to wait. + * @reset_on_progress: Reset the timeout if forward progress is seen. + * + * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 + * represents infinite waiting. + * + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * + * Context: Process context. May sleep while acquiring console lock. + * Return: true if all enabled printers are caught up. + */ +bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return __pr_flush(NULL, timeout_ms, reset_on_progress); +} +EXPORT_SYMBOL(pr_flush); + +static void __printk_fallback_preferred_direct(void) +{ + printk_prefer_direct_enter(); + pr_err("falling back to preferred direct printing\n"); + printk_kthreads_available = false; +} + +/* + * Enter preferred direct printing, but never exit. Mark console threads as + * unavailable. The system is then forever in preferred direct printing and + * any printing threads will exit. + * + * Must *not* be called under console_lock. Use + * __printk_fallback_preferred_direct() if already holding console_lock. + */ +static void printk_fallback_preferred_direct(void) +{ + console_lock(); + __printk_fallback_preferred_direct(); + console_unlock(); +} + +/* + * Print a record for a given console, not allowing another printk() caller + * to take over. This is appropriate for contexts that do not have the + * console_lock. + * + * See __console_emit_next_record() for argument and return details. + */ +static bool console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text) +{ + return __console_emit_next_record(con, text, ext_text, dropped_text, NULL); +} + +static bool printer_should_wake(struct console *con, u64 seq) +{ + short flags; + + if (kthread_should_stop() || !printk_kthreads_available) + return true; + + if (con->blocked || + console_kthreads_atomically_blocked()) { + return false; + } + + /* + * This is an unsafe read from con->flags, but a false positive is + * not a problem. Worst case it would allow the printer to wake up + * although it is disabled. But the printer will notice that when + * attempting to print and instead go back to sleep. + */ + flags = data_race(READ_ONCE(con->flags)); + + if (!__console_is_usable(flags)) + return false; + + return prb_read_valid(prb, seq, NULL); +} + +static int printk_kthread_func(void *data) +{ + struct console *con = data; + char *dropped_text = NULL; + char *ext_text = NULL; + u64 seq = 0; + char *text; + int error; + + text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!text) { + con_printk(KERN_ERR, con, "failed to allocate text buffer\n"); + printk_fallback_preferred_direct(); + goto out; + } + + if (con->flags & CON_EXTENDED) { + ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); + if (!ext_text) { + con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n"); + printk_fallback_preferred_direct(); + goto out; + } + } else { + dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL); + if (!dropped_text) { + con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n"); + printk_fallback_preferred_direct(); + goto out; + } + } + + con_printk(KERN_INFO, con, "printing thread started\n"); + + for (;;) { + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with __wake_up_klogd:A. + */ + error = wait_event_interruptible(log_wait, + printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */ + + if (kthread_should_stop() || !printk_kthreads_available) + break; + + if (error) + continue; + + error = mutex_lock_interruptible(&con->lock); + if (error) + continue; + + if (con->blocked || + !console_kthread_printing_tryenter()) { + /* Another context has locked the console_lock. */ + mutex_unlock(&con->lock); + continue; + } + + /* + * Although this context has not locked the console_lock, it + * is known that the console_lock is not locked and it is not + * possible for any other context to lock the console_lock. + * Therefore it is safe to read con->flags. + */ + + if (!__console_is_usable(con->flags)) { + console_kthread_printing_exit(); + mutex_unlock(&con->lock); + continue; + } + + /* + * Even though the printk kthread is always preemptible, it is + * still not allowed to call cond_resched() from within + * console drivers. The task may become non-preemptible in the + * console driver call chain. For example, vt_console_print() + * takes a spinlock and then can call into fbcon_redraw(), + * which can conditionally invoke cond_resched(). + */ + console_may_schedule = 0; + console_emit_next_record(con, text, ext_text, dropped_text); + + seq = con->seq; + + console_kthread_printing_exit(); + + mutex_unlock(&con->lock); + } + + con_printk(KERN_INFO, con, "printing thread stopped\n"); +out: + kfree(dropped_text); + kfree(ext_text); + kfree(text); + + console_lock(); + /* + * If this kthread is being stopped by another task, con->thread will + * already be NULL. That is fine. The important thing is that it is + * NULL after the kthread exits. + */ + con->thread = NULL; + console_unlock(); + + return 0; +} + +/* Must be called under console_lock. */ +static void printk_start_kthread(struct console *con) +{ + /* + * Do not start a kthread if there is no write() callback. The + * kthreads assume the write() callback exists. + */ + if (!con->write) + return; + + con->thread = kthread_run(printk_kthread_func, con, + "pr/%s%d", con->name, con->index); + if (IS_ERR(con->thread)) { + con->thread = NULL; + con_printk(KERN_ERR, con, "unable to start printing thread\n"); + __printk_fallback_preferred_direct(); + return; + } +} + /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_OUTPUT 0x02 +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_DIRECT_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); @@ -3297,41 +3893,71 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_OUTPUT) { + if (pending & PRINTK_PENDING_DIRECT_OUTPUT) { + printk_prefer_direct_enter(); + /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); + + printk_prefer_direct_exit(); } if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + wake_up_interruptible_all(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); -void wake_up_klogd(void) +static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the wait queue is empty. + * + * The full memory barrier within wq_has_sleeper() pairs with the full + * memory barrier within set_current_state() of + * prepare_to_wait_event(), which is called after ___wait_event() adds + * the waiter but before it has checked the wait condition. + * + * This pairs with devkmsg_read:A, syslog_print:A, and + * printk_kthread_func:A. + */ + if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ + (val & PRINTK_PENDING_DIRECT_OUTPUT)) { + this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } +void wake_up_klogd(void) +{ + __wake_up_klogd(PRINTK_PENDING_WAKEUP); +} + void defer_console_output(void) { - if (!printk_percpu_data_ready()) - return; + /* + * New messages may have been added directly to the ringbuffer + * using vprintk_store(), so wake any waiters as well. + */ + int val = PRINTK_PENDING_WAKEUP; - preempt_disable(); - this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); - irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); - preempt_enable(); + /* + * Make sure that some context will print the messages when direct + * printing is allowed. This happens in situations when the kthreads + * may not be as reliable or perhaps unusable. + */ + if (allow_direct_printing()) + val |= PRINTK_PENDING_DIRECT_OUTPUT; + + __wake_up_klogd(val); } void printk_trigger_flush(void) @@ -3667,26 +4293,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif #ifdef CONFIG_SMP -static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); -static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); +static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); +static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); /** - * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant - * spinning lock is not owned by any CPU. + * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant + * spinning lock is not owned by any CPU. * * Context: Any context. */ -void __printk_wait_on_cpu_lock(void) +void __printk_cpu_sync_wait(void) { do { cpu_relax(); - } while (atomic_read(&printk_cpulock_owner) != -1); + } while (atomic_read(&printk_cpu_sync_owner) != -1); } -EXPORT_SYMBOL(__printk_wait_on_cpu_lock); +EXPORT_SYMBOL(__printk_cpu_sync_wait); /** - * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant - * spinning lock. + * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant + * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the @@ -3695,7 +4321,7 @@ EXPORT_SYMBOL(__printk_wait_on_cpu_lock); * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ -int __printk_cpu_trylock(void) +int __printk_cpu_sync_try_get(void) { int cpu; int old; @@ -3705,79 +4331,80 @@ int __printk_cpu_trylock(void) /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with - * __printk_cpu_unlock:B. + * __printk_cpu_sync_put:B. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then - * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_put:A can never read from + * __printk_cpu_sync_try_get:B. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of this CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of this CPU */ - old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1, - cpu); /* LMM(__printk_cpu_trylock:A) */ + old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, + cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing - * data: LMM(__printk_cpu_trylock:B) + * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ - atomic_inc(&printk_cpulock_nested); + atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } -EXPORT_SYMBOL(__printk_cpu_trylock); +EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** - * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock. + * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ -void __printk_cpu_unlock(void) +void __printk_cpu_sync_put(void) { - if (atomic_read(&printk_cpulock_nested)) { - atomic_dec(&printk_cpulock_nested); + if (atomic_read(&printk_cpu_sync_nested)) { + atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: - * LMM(__printk_cpu_unlock:A) + * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs - * with __printk_cpu_trylock:A. + * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, - * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of the next CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of the next CPU */ - atomic_set_release(&printk_cpulock_owner, - -1); /* LMM(__printk_cpu_unlock:B) */ + atomic_set_release(&printk_cpu_sync_owner, + -1); /* LMM(__printk_cpu_sync_put:B) */ } -EXPORT_SYMBOL(__printk_cpu_unlock); +EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */ diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..1c630e573548 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -77,31 +77,56 @@ config TASKS_RCU_GENERIC This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. +config FORCE_TASKS_RCU + bool "Force selection of TASKS_RCU" + depends on RCU_EXPERT + select TASKS_RCU + default n + help + This option force-enables a task-based RCU implementation + that uses only voluntary context switch (not preemption!), + idle, and user-mode execution as quiescent states. Not for + manual selection in most cases. + config TASKS_RCU - def_bool PREEMPTION + bool + default n + select IRQ_WORK + +config FORCE_TASKS_RUDE_RCU + bool "Force selection of Tasks Rude RCU" + depends on RCU_EXPERT + select TASKS_RUDE_RCU + default n help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. Not for manual selection. + This option force-enables a task-based RCU implementation + that uses only context switch (including preemption) and + user-mode execution as quiescent states. It forces IPIs and + context switches on all online CPUs, including idle ones, + so use with caution. Not for manual selection in most cases. config TASKS_RUDE_RCU - def_bool 0 + bool + default n + select IRQ_WORK + +config FORCE_TASKS_TRACE_RCU + bool "Force selection of Tasks Trace RCU" + depends on RCU_EXPERT + select TASKS_TRACE_RCU + default n help This option enables a task-based RCU implementation that uses - only context switch (including preemption) and user-mode - execution as quiescent states. It forces IPIs and context - switches on all online CPUs, including idle ones, so use - with caution. + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the + CPU hotplug code paths. It can force IPIs on online CPUs, + including idle ones, so use with caution. Not for manual + selection in most cases. config TASKS_TRACE_RCU - def_bool 0 + bool + default n select IRQ_WORK - help - This option enables a task-based RCU implementation that uses - explicit rcu_read_lock_trace() read-side markers, and allows - these readers to appear in the idle loop as well as on the CPU - hotplug code paths. It can force IPIs on online CPUs, including - idle ones, so use with caution. config RCU_STALL_COMMON def_bool TREE_RCU @@ -195,6 +220,20 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_EXP_KTHREAD + bool "Perform RCU expedited work in a real-time kthread" + depends on RCU_BOOST && RCU_EXPERT + default !PREEMPT_RT && NR_CPUS <= 32 + help + Use this option to further reduce the latencies of expedited + grace periods at the expense of being more disruptive. + + This option is disabled by default on PREEMPT_RT=y kernels which + disable expedited grace periods after boot by unconditionally + setting rcupdate.rcu_normal_after_boot=1. + + Accept the default if unsure. + config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU @@ -225,7 +264,7 @@ config RCU_NOCB_CPU config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT + depends on RCU_EXPERT && TASKS_TRACE_RCU default PREEMPT_RT || NR_CPUS < 8 help Use this option to further reduce the number of IPIs sent diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4fd64999300f..9b64e55d4f61 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -28,9 +28,6 @@ config RCU_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -47,9 +44,6 @@ config RCU_TORTURE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests @@ -66,9 +60,6 @@ config RCU_REF_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance tests @@ -91,6 +82,20 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. +config RCU_EXP_CPU_STALL_TIMEOUT + int "Expedited RCU CPU stall timeout in milliseconds" + depends on RCU_STALL_COMMON + range 0 21000 + default 20 if ANDROID + default 0 if !ANDROID + help + If a given expedited RCU grace period extends more than the + specified number of milliseconds, a CPU stall warning is printed. + If the RCU grace period persists, additional CPU stall warnings + are printed at more widely spaced intervals. A value of zero + says to use the RCU_CPU_STALL_TIMEOUT value converted from + seconds to milliseconds. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..152492d52715 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -210,7 +210,9 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; +extern int rcu_exp_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +int rcu_exp_jiffies_till_stall_check(void); static inline bool rcu_stall_is_suppressed(void) { @@ -523,6 +525,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } +static inline void rcu_gp_slow_register(atomic_t *rgssp) { } +static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); @@ -534,14 +538,19 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +#ifdef CONFIG_RCU_EXP_KTHREAD +extern struct kthread_worker *rcu_exp_gp_kworker; +extern struct kthread_worker *rcu_exp_par_gp_kworker; +#else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; +#endif /* CONFIG_RCU_EXP_KTHREAD */ +void rcu_gp_slow_register(atomic_t *rgssp); +void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU -bool rcu_is_nocb_cpu(int cpu); void rcu_bind_current_to_nocb(void); #else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 81145c3ece25..c54ea2b6a36b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -505,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. + * Callbacks moved, so there might be an empty RCU_WAIT_TAIL + * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the + * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap + * created by the now-ready-to-invoke segments. */ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 5e4f1f83d38e..277a5bfb37d4 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -268,6 +268,8 @@ static struct rcu_scale_ops srcud_ops = { .name = "srcud" }; +#ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks scalability testing. */ @@ -295,6 +297,16 @@ static struct rcu_scale_ops tasks_ops = { .name = "tasks" }; +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for RCU-tasks-trace scalability testing. */ @@ -324,6 +336,14 @@ static struct rcu_scale_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -797,7 +817,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 55d049c39608..7120165a9342 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -738,6 +738,50 @@ static struct rcu_torture_ops busted_srcud_ops = { }; /* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + +#ifdef CONFIG_TASKS_RCU + +/* * Definitions for RCU-tasks torture testing. */ @@ -780,47 +824,16 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -/* - * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. - */ +#define TASKS_OPS &tasks_ops, -static void synchronize_rcu_trivial(void) -{ - int cpu; +#else // #ifdef CONFIG_TASKS_RCU - for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); - WARN_ON_ONCE(raw_smp_processor_id() != cpu); - } -} +#define TASKS_OPS -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) -{ - preempt_disable(); - return 0; -} +#endif // #else #ifdef CONFIG_TASKS_RCU -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) -{ - preempt_enable(); -} -static struct rcu_torture_ops trivial_ops = { - .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock_trivial, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock_trivial, - .readlock_held = torture_readlock_not_held, - .get_gp_seq = rcu_no_completed, - .sync = synchronize_rcu_trivial, - .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, - .irq_capable = 1, - .name = "trivial" -}; +#ifdef CONFIG_TASKS_RUDE_RCU /* * Definitions for rude RCU-tasks torture testing. @@ -851,6 +864,17 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU + + +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for tracing RCU-tasks torture testing. */ @@ -893,6 +917,15 @@ static struct rcu_torture_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU + + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1178,7 +1211,7 @@ rcu_torture_writer(void *arg) " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, - "rcu_torture_writer: No update-side primitives.\n")) { + "%s: No update-side primitives.\n", __func__)) { /* * No updates primitives, so don't try updating. * The resulting test won't be testing much, hence the @@ -1186,6 +1219,7 @@ rcu_torture_writer(void *arg) */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); + return 0; } do { @@ -1322,6 +1356,17 @@ rcu_torture_fakewriter(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, MAX_NICE); + if (WARN_ONCE(nsynctypes == 0, + "%s: No update-side primitives.\n", __func__)) { + /* + * No updates primitives, so don't try updating. + * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ + torture_kthread_stopping("rcu_torture_fakewriter"); + return 0; + } + do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && @@ -2916,10 +2961,12 @@ rcu_torture_cleanup(void) pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); } + rcu_gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); + rcu_gp_slow_unregister(NULL); return; } @@ -3016,6 +3063,7 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); + rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -3096,9 +3144,9 @@ rcu_torture_init(void) int flags = 0; unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, - &tasks_tracing_ops, &trivial_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, + TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS + &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -3320,6 +3368,7 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); torture_init_end(); + rcu_gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 5489ff7f478e..909644abee67 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -207,6 +207,8 @@ static struct ref_scale_ops srcu_ops = { .name = "srcu" }; +#ifdef CONFIG_TASKS_RCU + // Definitions for RCU Tasks ref scale testing: Empty read markers. // These definitions also work for RCU Rude readers. static void rcu_tasks_ref_scale_read_section(const int nloops) @@ -232,6 +234,16 @@ static struct ref_scale_ops rcu_tasks_ops = { .name = "rcu-tasks" }; +#define RCU_TASKS_OPS &rcu_tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define RCU_TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for RCU Tasks Trace ref scale testing. static void rcu_trace_ref_scale_read_section(const int nloops) { @@ -261,6 +273,14 @@ static struct ref_scale_ops rcu_trace_ops = { .name = "rcu-trace" }; +#define RCU_TRACE_OPS &rcu_trace_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define RCU_TRACE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for reference count static atomic_t refcnt; @@ -790,7 +810,7 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, + &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6833d8887181..50ba70f019de 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -24,6 +24,7 @@ #include <linux/smp.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/srcu.h> #include "rcu.h" @@ -38,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* + * Control conversion to SRCU_SIZE_BIG: + * 0: Don't convert at all. + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 3: Decide at boot time based on system shape (default). + * 0x1x: Convert when excessive contention encountered. + */ +#define SRCU_SIZING_NONE 0 +#define SRCU_SIZING_INIT 1 +#define SRCU_SIZING_TORTURE 2 +#define SRCU_SIZING_AUTO 3 +#define SRCU_SIZING_CONTEND 0x10 +#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x) +#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE)) +#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) +#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) +#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) +static int convert_to_big = SRCU_SIZING_AUTO; +module_param(convert_to_big, int, 0444); + +/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */ +static int big_cpu_lim __read_mostly = 128; +module_param(big_cpu_lim, int, 0444); + +/* Contention events per jiffy to initiate transition to big. */ +static int small_contention_lim __read_mostly = 100; +module_param(small_contention_lim, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -48,39 +78,90 @@ static void process_srcu(struct work_struct *work); static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irq_rcu_node(p) \ +#define spin_unlock_irq_rcu_node(p) \ spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ +#define spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* - * Initialize SRCU combining tree. Note that statically allocated + * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *ssp) +static void init_srcu_struct_data(struct srcu_struct *ssp) +{ + int cpu; + struct srcu_data *sdp; + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->mynode = NULL; + sdp->cpu = cpu; + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); + sdp->ssp = ssp; + } +} + +/* Invalid seq state, used during snp node initialization */ +#define SRCU_SNP_INIT_SEQ 0x2 + +/* + * Check whether sequence number corresponding to snp node, + * is invalid. + */ +static inline bool srcu_invl_snp_seq(unsigned long s) +{ + return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; +} + +/* + * Allocated and initialize SRCU combining tree. Returns @true if + * allocation succeeded and @false otherwise. + */ +static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) { int cpu; int i; @@ -92,6 +173,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); + ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); + if (!ssp->node) + return false; /* Work out the overall tree geometry. */ ssp->level[0] = &ssp->node[0]; @@ -105,10 +189,10 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { - snp->srcu_have_cbs[i] = 0; + snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; snp->srcu_data_have_cbs[i] = 0; } - snp->srcu_gp_seq_needed_exp = 0; + snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; if (snp == &ssp->node[0]) { @@ -129,39 +213,31 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); - rcu_segcblist_init(&sdp->srcu_cblist); - sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) snp->grplo = cpu; snp->grphi = cpu; } - sdp->cpu = cpu; - INIT_WORK(&sdp->work, srcu_invoke_callbacks); - timer_setup(&sdp->delay_work, srcu_delay_timer, 0); - sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + return true; } /* * Initialize non-compile-time initialized fields, including the - * associated srcu_node and srcu_data structures. The is_static - * parameter is passed through to init_srcu_struct_nodes(), and - * also tells us that ->sda has already been wired up to srcu_data. + * associated srcu_node and srcu_data structures. The is_static parameter + * tells us that ->sda has already been wired up to srcu_data. */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { + ssp->srcu_size_state = SRCU_SIZE_SMALL; + ssp->node = NULL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -170,13 +246,25 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_barrier_mutex); atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->work, process_srcu); + ssp->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) return -ENOMEM; - init_srcu_struct_nodes(ssp); + init_srcu_struct_data(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + return -ENOMEM; + } + } else { + WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + } + } smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -214,6 +302,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* + * Initiate a transition to SRCU_SIZE_BIG with lock held. + */ +static void __srcu_transition_to_big(struct srcu_struct *ssp) +{ + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); +} + +/* + * Initiate an idempotent transition to SRCU_SIZE_BIG. + */ +static void srcu_transition_to_big(struct srcu_struct *ssp) +{ + unsigned long flags; + + /* Double-checked locking on ->srcu_size-state. */ + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + return; + spin_lock_irqsave_rcu_node(ssp, flags); + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp, flags); + return; + } + __srcu_transition_to_big(ssp); + spin_unlock_irqrestore_rcu_node(ssp, flags); +} + +/* + * Check to see if the just-encountered contention event justifies + * a transition to SRCU_SIZE_BIG. + */ +static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) +{ + unsigned long j; + + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + return; + j = jiffies; + if (ssp->srcu_size_jiffies != j) { + ssp->srcu_size_jiffies = j; + ssp->srcu_n_lock_retries = 0; + } + if (++ssp->srcu_n_lock_retries <= small_contention_lim) + return; + __srcu_transition_to_big(ssp); +} + +/* + * Acquire the specified srcu_data structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +{ + struct srcu_struct *ssp = sdp->ssp; + + if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); + spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(sdp, *flags); +} + +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); +} + +/* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added @@ -343,7 +511,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. +#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. /* * Return grace-period delay, zero if there are expedited grace @@ -351,10 +522,18 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), - READ_ONCE(ssp->srcu_gp_seq_needed_exp))) - return 0; - return SRCU_INTERVAL; + unsigned long jbase = SRCU_INTERVAL; + + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + jbase = 0; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) + jbase += jiffies - READ_ONCE(ssp->srcu_gp_start); + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) + jbase = 1; + } + return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } /** @@ -382,13 +561,20 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Forgot srcu_barrier(), so just leak it! */ } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), + rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(ssp->sda); - ssp->sda = NULL; + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } + kfree(ssp->node); + ssp->node = NULL; + ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -434,9 +620,13 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(ssp->sda); + struct srcu_data *sdp; int state; + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ @@ -445,6 +635,8 @@ static void srcu_gp_start(struct srcu_struct *ssp) (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ + WRITE_ONCE(ssp->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); state = rcu_seq_state(ssp->srcu_gp_seq); @@ -517,7 +709,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) int idx; unsigned long mask; struct srcu_data *sdp; + unsigned long sgsne; struct srcu_node *snp; + int ss_state; /* Prevent more than one additional grace period. */ mutex_lock(&ssp->srcu_cb_mutex); @@ -526,7 +720,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(ssp); + cbdelay = !!srcu_get_delay(ssp); WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -537,38 +731,45 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); - cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; - if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; - snp->srcu_have_cbs[idx] = gpseq; - rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); - mask = snp->srcu_data_have_cbs[idx]; - snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); - if (cbs) - srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); - - /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check) && last_lvl) - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed + 100)) - sdp->srcu_gp_seq_needed = gpseq; - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed_exp + 100)) - sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); - } + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_BARRIER) { + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); + } else { + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + srcu_for_each_node_breadth_first(ssp, snp) { + spin_lock_irq_rcu_node(snp); + cbs = false; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + if (last_lvl) + cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + sgsne = snp->srcu_gp_seq_needed_exp; + if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); + if (ss_state < SRCU_SIZE_BIG) + mask = ~0; + else + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq_rcu_node(snp); + if (cbs) + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); + } } + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irqrestore_rcu_node(sdp, flags); + } + /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&ssp->srcu_cb_mutex); @@ -583,6 +784,14 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { spin_unlock_irq_rcu_node(ssp); } + + /* Transition to big if needed. */ + if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { + if (ss_state == SRCU_SIZE_ALLOC) + init_srcu_struct_nodes(ssp, GFP_KERNEL); + else + smp_store_release(&ssp->srcu_size_state, ss_state + 1); + } } /* @@ -596,20 +805,24 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp unsigned long s) { unsigned long flags; + unsigned long sgsne; - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) || - ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) - return; - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + if (snp) + for (; snp != NULL; snp = snp->srcu_parent) { + sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || + (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) + return; + spin_lock_irqsave_rcu_node(snp, flags); + sgsne = snp->srcu_gp_seq_needed_exp; + if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { + spin_unlock_irqrestore_rcu_node(snp, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); - return; } - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); @@ -630,39 +843,47 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); - struct srcu_node *snp = sdp->mynode; + unsigned long sgsne; + struct srcu_node *snp; + struct srcu_node *snp_leaf; unsigned long snp_seq; - /* Each pass through the loop does one level of the srcu_node tree. */ - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) - return; /* GP already done and CBs recorded. */ - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + /* Ensure that snp node tree is fully initialized before traversing it */ + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + snp_leaf = NULL; + else + snp_leaf = sdp->mynode; + + if (snp_leaf) + /* Each pass through the loop does one level of the srcu_node tree. */ + for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; - if (snp == sdp->mynode && snp_seq == s) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); - if (snp == sdp->mynode && snp_seq != s) { - srcu_schedule_cbs_sdp(sdp, do_norm - ? SRCU_INTERVAL - : 0); + if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { + if (snp == snp_leaf && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + spin_unlock_irqrestore_rcu_node(snp, flags); + if (snp == snp_leaf && snp_seq != s) { + srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(ssp, snp, s); return; } - if (!do_norm) - srcu_funnel_exp_start(ssp, snp, s); - return; + snp->srcu_have_cbs[idx] = s; + if (snp == snp_leaf) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + sgsne = snp->srcu_gp_seq_needed_exp; + if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(snp, flags); } - snp->srcu_have_cbs[idx] = s; - if (snp == sdp->mynode) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -678,9 +899,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); srcu_gp_start(ssp); + + // And how can that list_add() in the "else" clause + // possibly be safe for concurrent execution? Well, + // it isn't. And it does not have to be. After all, it + // can only be executed during early boot when there is only + // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) queue_delayed_work(rcu_gp_wq, &ssp->work, - srcu_get_delay(ssp)); + !!srcu_get_delay(ssp)); else if (list_empty(&ssp->work.work.entry)) list_add(&ssp->work.work.entry, &srcu_boot_list); } @@ -814,11 +1041,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, bool needgp = false; unsigned long s; struct srcu_data *sdp; + struct srcu_node *sdp_mynode; + int ss_state; check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_CALL) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, @@ -834,10 +1067,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, needexp = true; } spin_unlock_irqrestore_rcu_node(sdp, flags); + + /* Ensure that snp node tree is fully initialized before traversing it */ + if (ss_state < SRCU_SIZE_WAIT_BARRIER) + sdp_mynode = NULL; + else + sdp_mynode = sdp->mynode; + if (needgp) srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_funnel_exp_start(ssp, sdp_mynode, s); srcu_read_unlock(ssp, idx); return s; } @@ -1097,6 +1337,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp) complete(&ssp->srcu_barrier_completion); } +/* + * Enqueue an srcu_barrier() callback on the specified srcu_data + * structure's ->cblist. but only if that ->cblist already has at least one + * callback enqueued. Note that if a CPU already has callbacks enqueue, + * it must have already registered the need for a future grace period, + * so all we need do is enqueue a callback that will use the same grace + * period as the last callback already in the queue. + */ +static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) +{ + spin_lock_irq_rcu_node(sdp); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); + } + spin_unlock_irq_rcu_node(sdp); +} + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @ssp: srcu_struct on which to wait for in-flight callbacks. @@ -1104,7 +1366,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; - struct srcu_data *sdp; + int idx; unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); check_init_srcu_struct(ssp); @@ -1120,27 +1382,13 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); - /* - * Each pass through this loop enqueues a callback, but only - * on CPUs already having callbacks enqueued. Note that if - * a CPU already has callbacks enqueue, it must have already - * registered the need for a future grace period, so all we - * need do is enqueue a callback that will use the same - * grace period as the last callback already in the queue. - */ - for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); - sdp->srcu_barrier_head.func = srcu_barrier_cb; - debug_rcu_head_queue(&sdp->srcu_barrier_head); - if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head)) { - debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); - } - spin_unlock_irq_rcu_node(sdp); - } + idx = srcu_read_lock(ssp); + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); + else + for_each_possible_cpu(cpu) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); + srcu_read_unlock(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) @@ -1214,6 +1462,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) srcu_flip(ssp); spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_n_exp_nodelay = 0; spin_unlock_irq_rcu_node(ssp); } @@ -1228,6 +1477,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } + ssp->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1318,12 +1568,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { + unsigned long curdelay; + unsigned long j; struct srcu_struct *ssp; ssp = container_of(work, struct srcu_struct, work.work); srcu_advance_state(ssp); - srcu_reschedule(ssp, srcu_get_delay(ssp)); + curdelay = srcu_get_delay(ssp); + if (curdelay) { + WRITE_ONCE(ssp->reschedule_count, 0); + } else { + j = jiffies; + if (READ_ONCE(ssp->reschedule_jiffies) == j) { + WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); + if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY) + curdelay = 1; + } else { + WRITE_ONCE(ssp->reschedule_count, 1); + WRITE_ONCE(ssp->reschedule_jiffies, j); + } + } + srcu_reschedule(ssp, curdelay); } void srcutorture_get_gp_data(enum rcutorture_type test_type, @@ -1337,43 +1603,69 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +static const char * const srcu_size_state_name[] = { + "SRCU_SIZE_SMALL", + "SRCU_SIZE_ALLOC", + "SRCU_SIZE_WAIT_BARRIER", + "SRCU_SIZE_WAIT_CALL", + "SRCU_SIZE_WAIT_CBS1", + "SRCU_SIZE_WAIT_CBS2", + "SRCU_SIZE_WAIT_CBS3", + "SRCU_SIZE_WAIT_CBS4", + "SRCU_SIZE_BIG", + "SRCU_SIZE_???", +}; + void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; + int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *sdp; - - sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(sdp->srcu_unlock_count[!idx]); - u1 = data_race(sdp->srcu_unlock_count[idx]); - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = data_race(sdp->srcu_lock_count[!idx]); - l1 = data_race(sdp->srcu_lock_count[idx]); - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %c)", - cpu, c0, c1, - "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); - s0 += c0; - s1 += c1; + if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) + ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; + pr_alert("%s%s Tree SRCU g%ld state %d (%s)", + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + srcu_size_state_name[ss_state_idx]); + if (!ssp->sda) { + // Called after cleanup_srcu_struct(), perhaps. + pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n"); + } else { + pr_cont(" per-CPU(idx=%d):", idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *sdp; + + sdp = per_cpu_ptr(ssp->sda, cpu); + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); } - pr_cont(" T(%ld,%ld)\n", s0, s1); + if (SRCU_SIZING_IS_TORTURE()) + srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); @@ -1390,6 +1682,17 @@ void __init srcu_init(void) { struct srcu_struct *ssp; + /* Decide on srcu_struct-size strategy. */ + if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { + if (nr_cpu_ids >= big_cpu_lim) { + convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. + pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); + } else { + convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; + pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); + } + } + /* * Once that is set, call_srcu() can follow the normal path and * queue delayed work. This must follow RCU workqueues creation @@ -1400,6 +1703,8 @@ void __init srcu_init(void) ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); list_del_init(&ssp->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) + ssp->srcu_size_state = SRCU_SIZE_ALLOC; queue_work(rcu_gp_wq, &ssp->work.work); } } diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 33d896d85902..5cefc702158f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * a slowpath during the update. After this function returns, all * subsequent calls to rcu_sync_is_idle() will return false, which * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. + * rcu_sync_exit() re-enables reader fastpaths. * * When called in isolation, rcu_sync_enter() must wait for a grace * period, however, closely spaced calls to rcu_sync_enter() can diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 99cf3a13954c..3925e32159b5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -46,7 +46,7 @@ struct rcu_tasks_percpu { /** * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. - * @cbs_wq: Wait queue allowing new callback to get kthread's attention. + * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. @@ -77,7 +77,7 @@ struct rcu_tasks_percpu { * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct wait_queue_head cbs_wq; + struct rcuwait cbs_wait; raw_spinlock_t cbs_gbl_lock; int gp_state; int gp_sleep; @@ -113,11 +113,11 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); #define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ - .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ + .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \ }; \ static struct rcu_tasks rt_name = \ { \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ .gp_func = gp, \ .call_func = call, \ @@ -143,6 +143,11 @@ module_param(rcu_task_ipi_delay, int, 0644); #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +#define RCU_TASK_STALL_INFO (HZ * 10) +static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO; +module_param(rcu_task_stall_info, int, 0644); +static int rcu_task_stall_info_mult __read_mostly = 3; +module_param(rcu_task_stall_info_mult, int, 0444); static int rcu_task_enqueue_lim __read_mostly = -1; module_param(rcu_task_enqueue_lim, int, 0444); @@ -261,14 +266,16 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); rtp = rtpcp->rtpp; - wake_up(&rtp->cbs_wq); + rcuwait_wake_up(&rtp->cbs_wait); } // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { + int chosen_cpu; unsigned long flags; + int ideal_cpu; unsigned long j; bool needadjust = false; bool needwake; @@ -278,8 +285,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->func = func; local_irq_save(flags); rcu_read_lock(); - rtpcp = per_cpu_ptr(rtp->rtpcpu, - smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); + ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); + chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. j = jiffies; @@ -460,7 +468,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu } } - if (rcu_segcblist_empty(&rtpcp->cblist)) + if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); @@ -509,7 +517,9 @@ static int __noreturn rcu_tasks_kthread(void *arg) set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* If there were none, wait a bit and start over. */ - wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); if (needgpcb & 0x2) { // Wait for one grace period. @@ -548,8 +558,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) static void __init rcu_tasks_bootup_oddness(void) { #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + int rtsimc; + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + rtsimc = clamp(rcu_task_stall_info_mult, 1, 10); + if (rtsimc != rcu_task_stall_info_mult) { + pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc); + rcu_task_stall_info_mult = rtsimc; + } #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RCU pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); @@ -568,7 +585,17 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... + int cpu; + bool havecbs = false; + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + havecbs = true; + break; + } + } pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), @@ -576,7 +603,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], + ".C"[havecbs], s); } #endif // #ifndef CONFIG_TINY_RCU @@ -592,10 +619,15 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t); /* Wait for one RCU-tasks grace period. */ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) { - struct task_struct *g, *t; - unsigned long lastreport; - LIST_HEAD(holdouts); + struct task_struct *g; int fract; + LIST_HEAD(holdouts); + unsigned long j; + unsigned long lastinfo; + unsigned long lastreport; + bool reported = false; + int rtsi; + struct task_struct *t; set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); rtp->pregp_func(); @@ -621,30 +653,50 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * is empty, we are done. */ lastreport = jiffies; + lastinfo = lastreport; + rtsi = READ_ONCE(rcu_task_stall_info); // Start off with initial wait and slowly back off to 1 HZ wait. fract = rtp->init_fract; while (!list_empty(&holdouts)) { + ktime_t exp; bool firstreport; bool needreport; int rtst; - /* Slowly back off waiting for holdouts */ + // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_idle(fract); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + schedule_timeout_idle(fract); + } else { + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + } if (fract < HZ) fract++; rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); - if (needreport) + if (needreport) { lastreport = jiffies; + reported = true; + } firstreport = true; WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); rtp->holdouts_func(&holdouts, needreport, &firstreport); + + // Print pre-stall informational messages if needed. + j = jiffies; + if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) { + lastinfo = j; + rtsi = rtsi * rcu_task_stall_info_mult; + pr_info("%s: %s grace period %lu is %lu jiffies old.\n", + __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start); + } } set_tasks_gp_state(rtp, RTGS_POST_GP); @@ -950,6 +1002,9 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { + if (num_online_cpus() <= 1) + return; // Fastpath for only one CPU. + rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..c25ba442044a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1679,6 +1679,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); + if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -1705,11 +1707,37 @@ static void note_gp_changes(struct rcu_data *rdp) rcu_gp_kthread_wake(); } +static atomic_t *rcu_gp_slow_suppress; + +/* Register a counter to suppress debugging grace-period delays. */ +void rcu_gp_slow_register(atomic_t *rgssp) +{ + WARN_ON_ONCE(rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, rgssp); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_register); + +/* Unregister a counter, with NULL for not caring which. */ +void rcu_gp_slow_unregister(atomic_t *rgssp) +{ + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, NULL); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister); + +static bool rcu_gp_slow_is_suppressed(void) +{ + atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress); + + return rgssp && atomic_read(rgssp); +} + static void rcu_gp_slow(int delay) { - if (delay > 0 && - !(rcu_seq_ctr(rcu_state.gp_seq) % - (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + if (!rcu_gp_slow_is_suppressed() && delay > 0 && + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_idle(delay); } @@ -2096,14 +2124,29 @@ static noinline void rcu_gp_cleanup(void) /* Advance CBs to reduce false positives below. */ offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { + + // We get here if a grace period was needed (“needgp”) + // and the above call to rcu_accelerate_cbs() did not set + // the RCU_GP_FLAG_INIT bit in ->gp_state (which records + // the need for another grace period). The purpose + // of the “offloaded” check is to avoid invoking + // rcu_accelerate_cbs() on an offloaded CPU because we do not + // hold the ->nocb_lock needed to safely access an offloaded + // ->cblist. We do not want to acquire that lock because + // it can be heavily contended during callback floods. + WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); - trace_rcu_grace_period(rcu_state.name, - rcu_state.gp_seq, - TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); } else { - WRITE_ONCE(rcu_state.gp_flags, - rcu_state.gp_flags & RCU_GP_FLAG_INIT); + + // We get here either if there is no need for an + // additional grace period or if rcu_accelerate_cbs() has + // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. + // So all we need to do is to clear all of the other + // ->gp_flags bits. + + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); @@ -2609,6 +2652,13 @@ static void rcu_do_batch(struct rcu_data *rdp) */ void rcu_sched_clock_irq(int user) { + unsigned long j; + + if (IS_ENABLED(CONFIG_PROVE_RCU)) { + j = jiffies; + WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock))); + __this_cpu_write(rcu_data.last_sched_clock, j); + } trace_rcu_utilization(TPS("Start scheduler-tick")); lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2624,6 +2674,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + if (user) + rcu_tasks_classic_qs(current, false); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); @@ -3717,7 +3769,9 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPTION)) + // Invoking preempt_model_*() too early gets a splat. + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE || + preempt_model_full() || preempt_model_rt()) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); @@ -4179,6 +4233,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); } @@ -4471,6 +4526,51 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + kthread_destroy_worker(rcu_exp_gp_kworker); + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + ¶m); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Spawn the kthreads that handle RCU's grace periods. */ @@ -4480,6 +4580,7 @@ static int __init rcu_spawn_gp_kthread(void) struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); @@ -4497,9 +4598,17 @@ static int __init rcu_spawn_gp_kthread(void) smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); - rcu_spawn_nocb_kthreads(); - rcu_spawn_boost_kthreads(); + /* This is a pre-SMP initcall, we expect a single CPU */ + WARN_ON(num_online_cpus() > 1); + /* + * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu() + * due to rcu_scheduler_fully_active. + */ + rcu_spawn_cpu_nocb_kthread(smp_processor_id()); + rcu_spawn_one_boost_kthread(rdp->mynode); rcu_spawn_core_kthreads(); + /* Create kthread worker for expedited GPs */ + rcu_start_exp_gp_kworkers(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4745,7 +4854,6 @@ static void __init rcu_dump_rcu_node_tree(void) } struct workqueue_struct *rcu_gp_wq; -struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { @@ -4782,7 +4890,7 @@ static void __init kfree_rcu_batch_init(void) void __init rcu_init(void) { - int cpu; + int cpu = smp_processor_id(); rcu_early_boot_tests(); @@ -4802,17 +4910,15 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) { - rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); - rcutree_online_cpu(cpu); - } + WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. + rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..2ccf5845957d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -10,6 +10,7 @@ */ #include <linux/cache.h> +#include <linux/kthread.h> #include <linux/spinlock.h> #include <linux/rtmutex.h> #include <linux/threads.h> @@ -23,7 +24,11 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { unsigned long rew_s; +#ifdef CONFIG_RCU_EXP_KTHREAD + struct kthread_work rew_work; +#else struct work_struct rew_work; +#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ @@ -254,6 +259,7 @@ struct rcu_data { unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ short rcu_onl_gp_flags; /* ->gp_flags at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ + unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ int cpu; }; @@ -364,6 +370,7 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ + int nocb_is_setup; /* nocb is setup from boot */ }; /* Values for rcu_state structure's gp_flags field. */ @@ -421,7 +428,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); -static void __init rcu_spawn_boost_kthreads(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); @@ -439,7 +445,6 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); -static void __init rcu_spawn_nocb_kthreads(void); static void show_rcu_nocb_state(struct rcu_data *rdp); static void rcu_nocb_lock(struct rcu_data *rdp); static void rcu_nocb_unlock(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 60197ea24ceb..0f70f62039a9 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -334,15 +334,13 @@ fastpath: * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) { int cpu; unsigned long flags; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -417,13 +415,119 @@ retry_ipi: rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } +static void rcu_exp_sel_wait_wake(unsigned long s); + +#ifdef CONFIG_RCU_EXP_KTHREAD +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_par_gp_kworker); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* + * Use rcu_exp_par_gp_kworker, because flushing a work item from + * another work item on the same kthread worker can result in + * deadlock. + */ + kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + kthread_flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp); + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_par_gp_wq); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); + + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi - rnp->grplo)) + cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ + destroy_work_on_stack(&rew->rew_work); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ static void sync_rcu_exp_select_cpus(void) { - int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); @@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!READ_ONCE(rcu_par_gp_wq) || + if (!rcu_gp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { - /* No workqueues yet or last leaf, do direct call. */ + /* No worker started yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + sync_rcu_exp_select_cpus_queue_work(rnp); rnp->exp_need_flush = true; } - /* Wait for workqueue jobs (if any) to complete. */ + /* Wait for jobs (if any) to complete. */ rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) - flush_work(&rnp->rew.rew_work); + sync_rcu_exp_select_cpus_flush_work(rnp); } /* @@ -496,7 +593,7 @@ static void synchronize_rcu_expedited_wait(void) struct rcu_node *rnp_root = rcu_get_root(); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); - jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_stall = rcu_exp_jiffies_till_stall_check(); jiffies_start = jiffies; if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) @@ -571,7 +668,7 @@ static void synchronize_rcu_expedited_wait(void) dump_cpu_task(cpu); } } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; } } @@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - #ifdef CONFIG_PREEMPT_RCU /* @@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void) } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); + synchronize_rcu_expedited_queue_work(&rew); } /* Wait for expedited grace period to complete. */ rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ + smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); if (likely(!boottime)) - destroy_work_on_stack(&rew.rew_work); + synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 636d0546a4e9..46694e13398a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,9 +60,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ - -static bool rcu_nocb_is_setup; - static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); @@ -72,7 +69,7 @@ static int __init rcu_nocb_setup(char *str) cpumask_setall(rcu_nocb_mask); } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; return 1; } __setup("rcu_nocbs", rcu_nocb_setup); @@ -215,14 +212,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - static bool __wake_nocb_gp(struct rcu_data *rdp_gp, struct rcu_data *rdp, bool force, unsigned long flags) @@ -1180,10 +1169,10 @@ void __init rcu_init_nohz(void) return; } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; } - if (!rcu_nocb_is_setup) + if (!rcu_state.nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1241,7 +1230,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) struct task_struct *t; struct sched_param sp; - if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup) return; /* If there already is an rcuo kthread, then nothing to do. */ @@ -1277,22 +1266,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - if (rcu_nocb_is_setup) { - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); - } -} - /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ static int rcu_nocb_gp_stride = -1; module_param(rcu_nocb_gp_stride, int, 0444); @@ -1549,10 +1522,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) { } -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - static void show_rcu_nocb_state(struct rcu_data *rdp) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8360d86db1c0..c8ba0fe17267 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -486,6 +486,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { @@ -660,7 +661,13 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + IS_ENABLED(CONFIG_PREEMPT_RT)) + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( + rcu_preempt_deferred_qs_handler); + else + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -1124,7 +1131,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { raw_lockdep_assert_held_rcu_node(rnp); - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + if (!rnp->boost_kthread_task || + (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1226,18 +1234,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -/* - * Spawn boost kthreads -- called as soon as the scheduler is running. - */ -static void __init rcu_spawn_boost_kthreads(void) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) - if (rcu_rnp_online_cpus(rnp)) - rcu_spawn_one_boost_kthread(rnp); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1263,10 +1259,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static void __init rcu_spawn_boost_kthreads(void) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0c5d8516516a..4995c078cff9 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -25,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly; #define RCU_STALL_MIGHT_DIV 8 #define RCU_STALL_MIGHT_MIN (2 * HZ) +int rcu_exp_jiffies_till_stall_check(void) +{ + int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout); + int exp_stall_delay_delta = 0; + int till_stall_check; + + // Zero says to use rcu_cpu_stall_timeout, but in milliseconds. + if (!cpu_stall_timeout) + cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check()); + + // Limit check must be consistent with the Kconfig limits for + // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. + // The minimum clamped value is "2UL", because at least one full + // tick has to be guaranteed. + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); + + if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) + WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); + +#ifdef CONFIG_PROVE_RCU + /* Add extra ~25% out of till_stall_check. */ + exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1; +#endif + + return till_stall_check + exp_stall_delay_delta; +} +EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check); + /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) { @@ -565,9 +593,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); if (ndetected) { rcu_dump_cpu_stacks(); @@ -619,6 +647,7 @@ static void print_cpu_stall(unsigned long gps) * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ + printk_prefer_direct_enter(); trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); @@ -626,9 +655,9 @@ static void print_cpu_stall(unsigned long gps) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", + pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); @@ -656,6 +685,7 @@ static void print_cpu_stall(unsigned long gps) */ set_tsk_need_resched(current); set_preempt_need_resched(); + printk_prefer_direct_exit(); } static void check_cpu_stall(struct rcu_data *rdp) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 180ff9c41fa8..fc7fef575606 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -506,6 +506,8 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); +int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; +module_param(rcu_exp_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall diff --git a/kernel/reboot.c b/kernel/reboot.c index 6bcc5d6a6572..4177645e74d6 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -447,9 +447,11 @@ static int __orderly_reboot(void) ret = run_cmd(reboot_cmd); if (ret) { + printk_prefer_direct_enter(); pr_warn("Failed to start orderly reboot: forcing the issue\n"); emergency_sync(); kernel_restart(NULL); + printk_prefer_direct_exit(); } return ret; @@ -462,6 +464,7 @@ static int __orderly_poweroff(bool force) ret = run_cmd(poweroff_cmd); if (ret && force) { + printk_prefer_direct_enter(); pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* @@ -471,6 +474,7 @@ static int __orderly_poweroff(bool force) */ emergency_sync(); kernel_power_off(); + printk_prefer_direct_exit(); } return ret; @@ -528,6 +532,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); */ static void hw_failure_emergency_poweroff_func(struct work_struct *work) { + printk_prefer_direct_enter(); + /* * We have reached here after the emergency shutdown waiting period has * expired. This means orderly_poweroff has not been able to shut off @@ -544,6 +550,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work) */ pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); emergency_restart(); + + printk_prefer_direct_exit(); } static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, @@ -582,11 +590,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) { static atomic_t allow_proceed = ATOMIC_INIT(1); + printk_prefer_direct_enter(); + pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); /* Shutdown should be initiated only once. */ if (!atomic_dec_and_test(&allow_proceed)) - return; + goto out; /* * Queue a backup emergency shutdown in the event of @@ -594,6 +604,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) */ hw_failure_emergency_poweroff(ms_until_forced); orderly_poweroff(true); +out: + printk_prefer_direct_exit(); } EXPORT_SYMBOL_GPL(hw_protection_shutdown); diff --git a/kernel/scftorture.c b/kernel/scftorture.c index dcb0410950e4..5d113aa59e77 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -267,9 +267,10 @@ static void scf_handler(void *scfc_in) } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { - if (!(r & 0xffc0)) + if (!(r & 0xffc0)) { udelay(r & 0x3f); - goto out; + goto out; + } } if (r & 0xfff) goto out; diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index e0104b45029a..d9dc9ab3773f 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -15,6 +15,7 @@ /* Headers: */ #include <linux/sched/clock.h> #include <linux/sched/cputime.h> +#include <linux/sched/hotplug.h> #include <linux/sched/posix-timers.h> #include <linux/sched/rt.h> @@ -31,6 +32,7 @@ #include <uapi/linux/sched/types.h> #include "sched.h" +#include "smp.h" #include "autogroup.h" #include "stats.h" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index eec0849b2aae..99bdd96f454f 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -14,6 +14,7 @@ #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <linux/sched/loadavg.h> +#include <linux/sched/nohz.h> #include <linux/sched/mm.h> #include <linux/sched/rseq_api.h> #include <linux/sched/task_stack.h> diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index d9272d9061a3..e374c0c923da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -287,7 +287,7 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; @@ -349,7 +349,7 @@ again: val = remote_clock; } - if (cmpxchg64(ptr, old_val, val) != old_val) + if (!try_cmpxchg64(ptr, &old_val, val)) goto again; return val; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51efaabac3e4..a247f8d9d417 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,7 +26,10 @@ #include <linux/topology.h> #include <linux/sched/clock.h> #include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> #include <linux/sched/debug.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/init.h> #include <linux/sched/isolation.h> #include <linux/sched/loadavg.h> #include <linux/sched/mm.h> @@ -610,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) swap(rq1, rq2); raw_spin_rq_lock(rq1); - if (__rq_lockp(rq1) == __rq_lockp(rq2)) - return; + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(rq1, rq2); } #endif @@ -2190,7 +2193,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - else if (p->sched_class > rq->curr->sched_class) + else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); /* @@ -2408,7 +2411,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -5689,7 +5692,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ - if (likely(prev->sched_class <= &fair_sched_class && + if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); @@ -6382,7 +6385,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -8415,6 +8418,18 @@ static void __init preempt_dynamic_init(void) } } +#define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ + return preempt_dynamic_mode == preempt_dynamic_##mode; \ + } \ + EXPORT_SYMBOL_GPL(preempt_model_##mode) + +PREEMPT_MODEL_ACCESSOR(none); +PREEMPT_MODEL_ACCESSOR(voluntary); +PREEMPT_MODEL_ACCESSOR(full); + #else /* !CONFIG_PREEMPT_DYNAMIC */ static inline void preempt_dynamic_init(void) { } @@ -9457,11 +9472,11 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class + 1 != &fair_sched_class || - &fair_sched_class + 1 != &rt_sched_class || - &rt_sched_class + 1 != &dl_sched_class); + BUG_ON(&idle_sched_class != &fair_sched_class + 1 || + &fair_sched_class != &rt_sched_class + 1 || + &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class + 1 != &stop_sched_class); + BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif wait_bit_init(); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb4255ae0b2c..936817ae142f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) return (dl_se->runtime <= 0); } -extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); - /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is @@ -1832,6 +1830,7 @@ out: static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { + struct rq_flags rf; struct rq *rq; if (READ_ONCE(p->__state) != TASK_WAKING) @@ -1843,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_rq_lock(rq); + rq_lock(rq, &rf); if (p->dl.dl_non_contending) { update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); @@ -1859,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused put_task_struct(p); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_rq_unlock(rq); + rq_unlock(rq, &rf); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) @@ -2319,13 +2318,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); - - /* - * Update the later_rq clock here, because the clock is used - * by the cpufreq_update_util() inside __add_running_bw(). - */ - update_rq_clock(later_rq); - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); + activate_task(later_rq, next_task, 0); ret = 1; resched_curr(later_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bd299d67ab..906b2c7c48d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -36,6 +36,7 @@ #include <linux/sched/cond_resched.h> #include <linux/sched/cputime.h> #include <linux/sched/isolation.h> +#include <linux/sched/nohz.h> #include <linux/cpuidle.h> #include <linux/interrupt.h> @@ -313,19 +314,6 @@ const struct sched_class fair_sched_class; #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -493,12 +481,6 @@ static int se_is_idle(struct sched_entity *se) #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; @@ -3829,11 +3811,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; @@ -4846,11 +4828,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } @@ -4864,7 +4846,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@ -5308,7 +5290,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -6544,108 +6526,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } /* - * cpu_util_without: compute cpu utilization without any contributions from *p - * @cpu: the CPU which utilization is requested - * @p: the task which utilization should be discounted - * - * The utilization of a CPU is defined by the utilization of tasks currently - * enqueued on that CPU as well as tasks which are currently sleeping after an - * execution on that CPU. - * - * This method returns the utilization of the specified CPU by discounting the - * utilization of the specified task, whenever the task is currently - * contributing to the CPU utilization. - */ -static unsigned long cpu_util_without(int cpu, struct task_struct *p) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util_cfs(cpu); - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); - - /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. - */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); - - /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. - */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); - - util = max(util, estimated); - } - - /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * cpu_util. - */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - -/* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu + * (@dst_cpu = -1) or migrated to @dst_cpu. */ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. */ if (task_cpu(p) == cpu && dst_cpu != cpu) lsub_positive(&util, task_util(p)); @@ -6653,16 +6546,40 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) util += task_util(p); if (sched_feat(UTIL_EST)) { + unsigned long util_est; + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util after the task has been enqueued. + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. */ if (dst_cpu == cpu) util_est += _task_util_est(p); + else if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); util = max(util, util_est); } @@ -6671,6 +6588,28 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) } /* + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. + * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. + */ +static unsigned long cpu_util_without(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_util_cfs(cpu); + + return cpu_util_next(cpu, p, -1); +} + +/* * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization * landscape of @pd's CPUs after the task migration, and uses the Energy Model @@ -9460,8 +9399,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / - sds->total_capacity; /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks. @@ -9470,6 +9407,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = 0; return; } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* @@ -9495,7 +9435,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load @@ -11881,101 +11821,3 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } - -/* - * Helper functions to facilitate extracting info from tracepoints. - */ - -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; - } - - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); - -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); - -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); - -int sched_trace_rq_cpu_capacity(struct rq *rq) -{ - return rq ? -#ifdef CONFIG_SMP - rq->cpu_capacity -#else - SCHED_CAPACITY_SCALE -#endif - : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rd_span); - -int sched_trace_rq_nr_running(struct rq *rq) -{ - return rq ? rq->nr_running : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ecb0d7052877..328cccbee444 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -102,7 +102,7 @@ void __cpuidle default_idle_call(void) * last -- this is very similar to the entry code. */ trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(_THIS_IP_); + lockdep_hardirqs_on_prepare(); rcu_idle_enter(); lockdep_hardirqs_on(_THIS_IP_); @@ -327,7 +327,7 @@ static void do_idle(void) * RCU relies on this call to be done outside of an RCU read-side * critical section. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); schedule_idle(); if (unlikely(klp_patch_pending(current))) diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index c336f5f481bc..4ff2ed4f8fa1 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq) static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a4fa3aadfcba..a337f3e35997 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) mutex_unlock(&group->avgs_lock); for (full = 0; full < 2; full++) { - unsigned long avg[3]; - u64 total; + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -1117,7 +1120,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->state = state; t->threshold = threshold_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC; - window_reset(&t->win, 0, 0, 0); + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a32c46889af8..7891c0f0e1ff 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + struct rq_flags rf; int skip; /* @@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (skip) continue; - raw_spin_rq_lock(rq); + rq_lock(rq, &rf); update_rq_clock(rq); if (rt_rq->rt_time) { @@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_rq_unlock(rq); + rq_unlock(rq, &rf); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8dccb34eb190..2ce18584dca3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -603,8 +603,8 @@ struct cfs_rq { s64 runtime_remaining; u64 throttled_clock; - u64 throttled_clock_task; - u64 throttled_clock_task_time; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; int throttled; int throttle_count; struct list_head throttled_list; @@ -1827,12 +1827,7 @@ static inline void dirty_sched_domain_sysctl(int cpu) #endif extern int sched_update_scaling(void); - -extern void flush_smp_call_function_from_idle(void); - -#else /* !CONFIG_SMP: */ -static inline void flush_smp_call_function_from_idle(void) { } -#endif +#endif /* CONFIG_SMP */ #include "stats.h" @@ -2182,6 +2177,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) * * include/asm-generic/vmlinux.lds.h * + * *CAREFUL* they are laid out in *REVERSE* order!!! + * * Also enforce alignment on the instance, not the type, to guarantee layout. */ #define DEFINE_SCHED_CLASS(name) \ @@ -2190,17 +2187,16 @@ const struct sched_class name##_sched_class \ __section("__" #name "_sched_class") /* Defined in include/asm-generic/vmlinux.lds.h */ -extern struct sched_class __begin_sched_classes[]; -extern struct sched_class __end_sched_classes[]; - -#define sched_class_highest (__end_sched_classes - 1) -#define sched_class_lowest (__begin_sched_classes - 1) +extern struct sched_class __sched_class_highest[]; +extern struct sched_class __sched_class_lowest[]; #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class--) + for (class = (_from); class < (_to); class++) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, sched_class_lowest) + for_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -2309,6 +2305,7 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); @@ -2478,6 +2475,24 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to + * acquire rq lock instead of rq_lock(). So at the end of these two functions + * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of + * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. + */ +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) +{ + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ +#ifdef CONFIG_SMP + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); +#endif +} +#else +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} +#endif #ifdef CONFIG_SMP @@ -2543,14 +2558,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - if (__rq_lockp(this_rq) == __rq_lockp(busiest)) - return 0; - - if (likely(raw_spin_rq_trylock(busiest))) + if (__rq_lockp(this_rq) == __rq_lockp(busiest) || + likely(raw_spin_rq_trylock(busiest))) { + double_rq_clock_clear_update(this_rq, busiest); return 0; + } if (rq_order_less(this_rq, busiest)) { raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(this_rq, busiest); return 0; } @@ -2644,6 +2660,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) BUG_ON(rq1 != rq2); raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ + double_rq_clock_clear_update(rq1, rq2); } /* diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 9620e323162c..2eb23dd0f285 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -7,3 +7,9 @@ extern void sched_ttwu_pending(void *arg); extern void send_call_function_single_ipi(int cpu); + +#ifdef CONFIG_SMP +extern void flush_smp_call_function_queue(void); +#else +static inline void flush_smp_call_function_queue(void) { } +#endif diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b5ac87f6dbd4..e9852d1b4a5e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -200,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) * the filter can be freed. * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged + * @wait_killable_recv: Put notifying process in killable state once the + * notification is received by the userspace listener. * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information @@ -220,6 +222,7 @@ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; + bool wait_killable_recv; struct action_cache cache; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -893,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags, if (flags & SECCOMP_FILTER_FLAG_LOG) filter->log = true; + /* Set wait killable flag, if present. */ + if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) + filter->wait_killable_recv = true; + /* * If there is an existing filter, make it the prev and don't drop its * task reference. @@ -1080,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn complete(&addfd->completion); } +static bool should_sleep_killable(struct seccomp_filter *match, + struct seccomp_knotif *n) +{ + return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -1100,7 +1113,7 @@ static int seccomp_do_user_notification(int this_syscall, n.data = sd; n.id = seccomp_next_notify_id(match); init_completion(&n.ready); - list_add(&n.list, &match->notif->notifications); + list_add_tail(&n.list, &match->notif->notifications); INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); @@ -1110,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall, * This is where we wait for a reply from userspace. */ do { + bool wait_killable = should_sleep_killable(match, &n); + mutex_unlock(&match->notify_lock); - err = wait_for_completion_interruptible(&n.ready); + if (wait_killable) + err = wait_for_completion_killable(&n.ready); + else + err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); - if (err != 0) + + if (err != 0) { + /* + * Check to see if the notifcation got picked up and + * whether we should switch to wait killable. + */ + if (!wait_killable && should_sleep_killable(match, &n)) + continue; + goto interrupted; + } addfd = list_first_entry_or_null(&n.addfd, struct seccomp_kaddfd, list); @@ -1484,6 +1511,9 @@ out: mutex_lock(&filter->notify_lock); knotif = find_notification(filter, unotif.id); if (knotif) { + /* Reset the process to make sure it's not stuck */ + if (should_sleep_killable(filter, knotif)) + complete(&knotif->ready); knotif->state = SECCOMP_NOTIFY_INIT; up(&filter->notif->request); } @@ -1829,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags, ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0)) return -EINVAL; + /* + * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense + * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag. + */ + if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) && + ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0)) + return -EINVAL; + /* Prepare the new filter before holding any locks. */ prepared = seccomp_prepare_user_filter(filter); if (IS_ERR(prepared)) diff --git a/kernel/signal.c b/kernel/signal.c index 30cd1ca43bcd..e43bc2a692f5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1805,7 +1805,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif -int force_sig_perf(void __user *addr, u32 type, u64 sig_data) +int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; @@ -1817,7 +1817,18 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) info.si_perf_data = sig_data; info.si_perf_type = type; - return force_sig_info(&info); + /* + * Signals generated by perf events should not terminate the whole + * process if SIGTRAP is blocked, however, delivering the signal + * asynchronously is better than not delivering at all. But tell user + * space if the signal was asynchronous, so it can clearly be + * distinguished from normal synchronous ones. + */ + info.si_perf_flags = sigismember(¤t->blocked, info.si_signo) ? + TRAP_PERF_FLAG_ASYNC : + 0; + + return send_sig_info(info.si_signo, &info, current); } /** @@ -3432,6 +3443,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3509,6 +3521,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4722,6 +4735,7 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); + CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); diff --git a/kernel/smp.c b/kernel/smp.c index 65a630f62363..dd215f439426 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); -static void flush_smp_call_function_queue(bool warn_cpu_offline); +static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { @@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu) * ensure that the outgoing CPU doesn't go offline with work * still pending. */ - flush_smp_call_function_queue(false); + __flush_smp_call_function_queue(false); irq_work_run(); return 0; } @@ -183,7 +183,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); -#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ +module_param(csd_lock_timeout, ulong, 0444); + static atomic_t csd_bug_count = ATOMIC_INIT(0); static u64 cfd_seq; @@ -329,6 +331,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); + unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) @@ -341,7 +344,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * ts2 = sched_clock(); ts_delta = ts2 - *ts1; - if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; firsttime = !*bug_id; @@ -541,11 +544,11 @@ void generic_smp_call_function_single_interrupt(void) { cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_GOTIPI); - flush_smp_call_function_queue(true); + __flush_smp_call_function_queue(true); } /** - * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. @@ -558,7 +561,7 @@ void generic_smp_call_function_single_interrupt(void) * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ -static void flush_smp_call_function_queue(bool warn_cpu_offline) +static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; @@ -681,8 +684,22 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_processor_id(), CFD_SEQ_HDLEND); } -void flush_smp_call_function_from_idle(void) + +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * from task context (idle, migration thread) + * + * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it + * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by + * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to + * handle queued SMP function calls before scheduling. + * + * The migration thread has to ensure that an eventually pending wakeup has + * been handled before it migrates a task. + */ +void flush_smp_call_function_queue(void) { + unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) @@ -691,9 +708,11 @@ void flush_smp_call_function_from_idle(void) cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); local_irq_save(flags); - flush_smp_call_function_queue(true); + /* Get the already pending soft interrupts for RT enabled kernels */ + was_pending = local_softirq_pending(); + __flush_smp_call_function_queue(true); if (local_softirq_pending()) - do_softirq(); + do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f6bc0bc8a2aa..b9f54544e749 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -392,6 +392,13 @@ int cpu_check_up_prepare(int cpu) */ return -EAGAIN; + case CPU_UP_PREPARE: + /* + * Timeout while waiting for the CPU to show up. Allow to try + * again later. + */ + return 0; + default: /* Should not happen. Famous last words. */ diff --git a/kernel/softirq.c b/kernel/softirq.c index fac801815554..9f0aef8aa9ff 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -294,6 +294,19 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +/* + * flush_smp_call_function_queue() can raise a soft interrupt in a function + * call. On RT kernels this is undesired and the only known functionality + * in the block layer which does this is disabled on RT. If soft interrupts + * get raised which haven't been raised before the flush, warn so it can be + * investigated. + */ +void do_softirq_post_smp_call_flush(unsigned int was_pending) +{ + if (WARN_ON_ONCE(was_pending != local_softirq_pending())) + invoke_softirq(); +} + #else /* CONFIG_PREEMPT_RT */ /* diff --git a/kernel/stackleak.c b/kernel/stackleak.c index ddb5a7f48d69..c2c33d2202e9 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -70,59 +70,81 @@ late_initcall(stackleak_sysctls_init); #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void noinstr stackleak_erase(void) +static __always_inline void __stackleak_erase(bool on_task_stack) { - /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ - unsigned long kstack_ptr = current->lowest_stack; - unsigned long boundary = (unsigned long)end_of_stack(current); - unsigned int poison_count = 0; - const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); - - if (skip_erasing()) - return; - - /* Check that 'lowest_stack' value is sane */ - if (unlikely(kstack_ptr - boundary >= THREAD_SIZE)) - kstack_ptr = boundary; + const unsigned long task_stack_low = stackleak_task_low_bound(current); + const unsigned long task_stack_high = stackleak_task_high_bound(current); + unsigned long erase_low, erase_high; - /* Search for the poison value in the kernel stack */ - while (kstack_ptr > boundary && poison_count <= depth) { - if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON) - poison_count++; - else - poison_count = 0; - - kstack_ptr -= sizeof(unsigned long); - } - - /* - * One 'long int' at the bottom of the thread stack is reserved and - * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y). - */ - if (kstack_ptr == boundary) - kstack_ptr += sizeof(unsigned long); + erase_low = stackleak_find_top_of_poison(task_stack_low, + current->lowest_stack); #ifdef CONFIG_STACKLEAK_METRICS - current->prev_lowest_stack = kstack_ptr; + current->prev_lowest_stack = erase_low; #endif /* - * Now write the poison value to the kernel stack. Start from - * 'kstack_ptr' and move up till the new 'boundary'. We assume that - * the stack pointer doesn't change when we write poison. + * Write poison to the task's stack between 'erase_low' and + * 'erase_high'. + * + * If we're running on a different stack (e.g. an entry trampoline + * stack) we can erase everything below the pt_regs at the top of the + * task stack. + * + * If we're running on the task stack itself, we must not clobber any + * stack used by this function and its caller. We assume that this + * function has a fixed-size stack frame, and the current stack pointer + * doesn't change while we write poison. */ - if (on_thread_stack()) - boundary = current_stack_pointer; + if (on_task_stack) + erase_high = current_stack_pointer; else - boundary = current_top_of_stack(); + erase_high = task_stack_high; - while (kstack_ptr < boundary) { - *(unsigned long *)kstack_ptr = STACKLEAK_POISON; - kstack_ptr += sizeof(unsigned long); + while (erase_low < erase_high) { + *(unsigned long *)erase_low = STACKLEAK_POISON; + erase_low += sizeof(unsigned long); } /* Reset the 'lowest_stack' value for the next syscall */ - current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; + current->lowest_stack = task_stack_high; +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can be called from the task stack or an entry stack when the task stack is + * no longer in use. + */ +asmlinkage void noinstr stackleak_erase(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(on_thread_stack()); +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can only be called from the task stack. + */ +asmlinkage void noinstr stackleak_erase_on_task_stack(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(true); +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can only be called from a stack other than the task stack. + */ +asmlinkage void noinstr stackleak_erase_off_task_stack(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(false); } void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) @@ -139,8 +161,7 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) /* 'lowest_stack' should be aligned on the register width boundary */ sp = ALIGN(sp, sizeof(unsigned long)); if (sp < current->lowest_stack && - sp >= (unsigned long)task_stack_page(current) + - sizeof(unsigned long)) { + sp >= stackleak_task_low_bound(current)) { current->lowest_stack = sp; } } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index cbc30271ea4d..cedb17ba158a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -535,8 +535,6 @@ void stop_machine_park(int cpu) kthread_park(stopper->thread); } -extern void sched_set_stop_task(int cpu, struct task_struct *stop); - static void cpu_stop_create(unsigned int cpu) { sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); @@ -633,6 +631,27 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) } EXPORT_SYMBOL_GPL(stop_machine); +#ifdef CONFIG_SCHED_SMT +int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + + struct multi_stop_data msdata = { + .fn = fn, + .data = data, + .num_threads = cpumask_weight(smt_mask), + .active_cpus = smt_mask, + }; + + lockdep_assert_cpus_held(); + + /* Set the initial state and stop all online cpus. */ + set_state(&msdata, MULTI_STOP_PREPARE); + return stop_cpus(smt_mask, multi_cpu_stop, &msdata); +} +EXPORT_SYMBOL_GPL(stop_core_cpuslocked); +#endif + /** * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU * @fn: the function to run diff --git a/kernel/sys.c b/kernel/sys.c index 374f83e95239..b911fa6d81ab 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -117,6 +117,12 @@ #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif +#ifndef SME_SET_VL +# define SME_SET_VL(a) (-EINVAL) +#endif +#ifndef SME_GET_VL +# define SME_GET_VL() (-EINVAL) +#endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif @@ -2541,6 +2547,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SVE_GET_VL: error = SVE_GET_VL(); break; + case PR_SME_SET_VL: + error = SME_SET_VL(arg2); + break; + case PR_SME_GET_VL: + error = SME_GET_VL(); + break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 830aaf8ca08e..5b7b1a82ae6a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2288,17 +2288,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = timer_migration_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_BPF_SYSCALL { .procname = "unprivileged_bpf_disabled", diff --git a/kernel/task_work.c b/kernel/task_work.c index c59e1a49bc40..dff75bcde151 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the - * it will interrupt the targeted task and run the task_work. @TWA_RESUME - * work is run only when the task exits the kernel and returns to user mode, - * or before entering guest mode. Fails if the @task is exiting/exited and thus - * it can't process this @work. Otherwise @work->func() will be called when the - * @task goes through one of the aforementioned transitions, or exits. + * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * + * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted + * task and run the task_work, regardless of whether the task is currently + * running in the kernel or userspace. + * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a + * reschedule IPI to force the targeted task to reschedule and run task_work. + * This can be advantageous if there's no strict requirement that the + * task_work be run as soon as possible, just whenever the task enters the + * kernel anyway. + * @TWA_RESUME work is run only when the task exits the kernel and returns to + * user mode, or before entering guest mode. + * + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task goes through one of + * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism @@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL: set_notify_signal(task); break; + case TWA_SIGNAL_NO_IPI: + __set_notify_signal(task); + break; default: WARN_ON_ONCE(1); break; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 003ccf338d20..5d85014d59b5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -690,7 +690,7 @@ static ssize_t unbind_device_store(struct device *dev, { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); - struct clock_event_device *ce; + struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; @@ -698,9 +698,10 @@ static ssize_t unbind_device_store(struct device *dev, ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); - list_for_each_entry(ce, &clockevent_devices, list) { - if (!strcmp(ce->name, name)) { - ret = __clockevents_try_unbind(ce, dev->id); + list_for_each_entry(iter, &clockevent_devices, list) { + if (!strcmp(iter->name, name)) { + ret = __clockevents_try_unbind(iter, dev->id); + ce = iter; break; } } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 95d7ca35bdf2..cee5da1e54c4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -343,7 +343,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cpus_read_lock(); preempt_disable(); clocksource_verify_choose_cpus(); - if (cpumask_weight(&cpus_chosen) == 0) { + if (cpumask_empty(&cpus_chosen)) { preempt_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index b1b9b12899f5..8464c5acc913 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -8,6 +8,7 @@ #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/kernel.h> +#include <linux/math.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/sched/clock.h> @@ -199,15 +200,13 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) r = rate; if (r >= 4000000) { - r /= 1000000; + r = DIV_ROUND_CLOSEST(r, 1000000); r_unit = 'M'; + } else if (r >= 4000) { + r = DIV_ROUND_CLOSEST(r, 1000); + r_unit = 'k'; } else { - if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else { - r_unit = ' '; - } + r_unit = ' '; } /* Calculate the ns resolution of this counter */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d257721c68b8..58a11f859ac7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -928,6 +928,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); + else + tick_program_event(KTIME_MAX, 1); return; } @@ -1364,9 +1366,15 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); - /* No need to reprogram if we are running tickless */ - if (unlikely(ts->tick_stopped)) + if (unlikely(ts->tick_stopped)) { + /* + * The clockevent device is not reprogrammed, so change the + * clock event device to ONESHOT_STOPPED to avoid spurious + * interrupts on devices which might not be truly one shot. + */ + tick_program_event(KTIME_MAX, 1); return; + } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dcdcb85121e4..8e4b3c32fcf9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -17,6 +17,7 @@ #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> +#include <linux/timex.h> #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> @@ -429,6 +430,14 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, memcpy(base + 1, base, sizeof(*base)); } +static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) +{ + u64 delta, cycles = tk_clock_read(tkr); + + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); +} + static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; @@ -439,12 +448,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); + now += fast_tk_get_delta_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -482,7 +486,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) * of the following timestamps. Callers need to be aware of that and * deal with it. */ -u64 ktime_get_mono_fast_ns(void) +u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } @@ -494,7 +498,7 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ -u64 ktime_get_raw_fast_ns(void) +u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } @@ -528,10 +532,27 @@ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; - return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); + return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); +/** + * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. + * + * The same limitations as described for ktime_get_boot_fast_ns() apply. The + * mono time and the TAI offset are not read atomically which may yield wrong + * readouts. However, an update of the TAI offset is an rare event e.g., caused + * by settime or adjtimex with an offset. The user of this function has to deal + * with the possibility of wrong timestamps in post processing. + */ +u64 notrace ktime_get_tai_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); +} +EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); + static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; @@ -543,10 +564,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) tkr = tkf->base + (seq & 0x01); basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - - delta = timekeeping_delta_to_ns(tkr, - clocksource_delta(tk_clock_read(tkr), - tkr->cycle_last, tkr->mask)); + delta = fast_tk_get_delta_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) @@ -2380,6 +2398,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return 0; } +/** + * random_get_entropy_fallback - Returns the raw clock source value, + * used by random.c for platforms with no valid random_get_entropy(). + */ +unsigned long random_get_entropy_fallback(void) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (unlikely(timekeeping_suspended || !clock)) + return 0; + return clock->read(clock); +} +EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9dd2a39cb3b0..717fcb9fb14a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -44,6 +44,7 @@ #include <linux/slab.h> #include <linux/compat.h> #include <linux/random.h> +#include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -223,7 +224,7 @@ static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP -unsigned int sysctl_timer_migration = 1; +static unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); @@ -234,7 +235,42 @@ static void timers_update_migration(void) else static_branch_disable(&timers_migration_enabled); } -#else + +#ifdef CONFIG_SYSCTL +static int timer_migration_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&timer_keys_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); + return ret; +} + +static struct ctl_table timer_sysctl[] = { + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init timer_sysctl_init(void) +{ + register_sysctl("kernel", timer_sysctl); + return 0; +} +device_initcall(timer_sysctl_init); +#endif /* CONFIG_SYSCTL */ +#else /* CONFIG_SMP */ static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ @@ -251,19 +287,6 @@ void timers_update_nohz(void) schedule_work(&timer_update_work); } -int timer_migration_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - mutex_lock(&timer_keys_mutex); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!ret && write) - timers_update_migration(); - mutex_unlock(&timer_keys_mutex); - return ret; -} - static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); @@ -502,7 +525,7 @@ static inline unsigned calc_index(unsigned long expires, unsigned lvl, * * Round up with level granularity to prevent this. */ - expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + expires = (expires >> LVL_SHIFT(lvl)) + 1; *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } @@ -615,9 +638,39 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer static const struct debug_obj_descr timer_debug_descr; +struct timer_hint { + void (*function)(struct timer_list *t); + long offset; +}; + +#define TIMER_HINT(fn, container, timr, hintfn) \ + { \ + .function = fn, \ + .offset = offsetof(container, hintfn) - \ + offsetof(container, timr) \ + } + +static const struct timer_hint timer_hints[] = { + TIMER_HINT(delayed_work_timer_fn, + struct delayed_work, timer, work.func), + TIMER_HINT(kthread_delayed_work_timer_fn, + struct kthread_delayed_work, timer, work.func), +}; + static void *timer_debug_hint(void *addr) { - return ((struct timer_list *) addr)->function; + struct timer_list *timer = addr; + int i; + + for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { + if (timer_hints[i].function == timer->function) { + void (**fn)(void) = addr + timer_hints[i].offset; + + return *fn; + } + } + + return timer->function; } static bool timer_is_static_object(void *addr) @@ -1780,8 +1833,6 @@ void update_process_times(int user_tick) { struct task_struct *p = current; - PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); - /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); @@ -1953,6 +2004,7 @@ int timers_prepare_cpu(unsigned int cpu) base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2c43e327a619..debbbb083286 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -144,6 +144,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK + select TASKS_RCU if PREEMPTION config GENERIC_TRACER bool @@ -728,6 +729,7 @@ config FTRACE_MCOUNT_USE_OBJTOOL depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on FTRACE_MCOUNT_RECORD + select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT def_bool y diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4d5629196d01..10a32b0f2deb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, - const char *fmt, ...) +void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...) { int n; va_list args; unsigned long flags; char *buf; + u64 cgid = 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) @@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_id(blkcg->css.cgroup) : 1); -#else - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); + if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + cgid = cgroup_id(css->cgroup); + else + cgid = 1; #endif + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(__trace_note_message); +EXPORT_SYMBOL_GPL(__blk_trace_note_message); static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) @@ -411,7 +411,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, NULL, "%s", msg); + __blk_trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; @@ -783,6 +783,7 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { + struct cgroup_subsys_state *blkcg_css; struct blk_trace *bt; /* We don't use the 'bt' value here except as an optimization... */ @@ -790,9 +791,10 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; - if (!bio->bi_blkg) + blkcg_css = bio_blkcg_css(bio); + if (!blkcg_css) return 0; - return cgroup_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(blkcg_css->cgroup); } #else static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8f4fb328133a..3fd5284f6487 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -30,6 +30,24 @@ int ftrace_graph_active; /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; +/* + * archs can override this function if they must do something + * to enable hook for graph tracer. + */ +int __weak ftrace_enable_ftrace_graph_caller(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * to disable hook for graph tracer. + */ +int __weak ftrace_disable_ftrace_graph_caller(void) +{ + return 0; +} + /** * ftrace_graph_stop - set to permanently disable function graph tracing * @@ -404,9 +422,9 @@ free: static void ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { unsigned long long timestamp; int index; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..af899b058c8d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7420,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) static void ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *pid_list; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f4de111fa18f..124f1897fd56 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4289,17 +4289,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, -#if defined(CONFIG_PREEMPT_NONE) - "server", -#elif defined(CONFIG_PREEMPT_VOLUNTARY) - "desktop", -#elif defined(CONFIG_PREEMPT) - "preempt", -#elif defined(CONFIG_PREEMPT_RT) - "preempt_rt", -#else + preempt_model_none() ? "server" : + preempt_model_voluntary() ? "desktop" : + preempt_model_full() ? "preempt" : + preempt_model_rt() ? "preempt_rt" : "unknown", -#endif /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e11e167b7809..f97de82d1342 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -773,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; @@ -799,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e9ae1f33a7f0..afb92e2f0aea 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1168,9 +1168,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) */ static void trace_sched_switch_callback(void *data, bool preempt, - unsigned int prev_state, struct task_struct *p, - struct task_struct *n) + struct task_struct *n, + unsigned int prev_state) { struct osnoise_variables *osn_var = this_cpu_osn_var(); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index f4938040c228..95b58bd757ce 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -46,7 +46,7 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); @@ -94,7 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 45796d8bd4b2..c9ffdcfe622e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) { int flags; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 46429f9a96fa..330aee1c1a49 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, static void notrace probe_wakeup_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) { struct trace_array_cpu *data; u64 T0, T1, delta; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9166220457bc..40024e03d422 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); + printk_prefer_direct_enter(); + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); + + printk_prefer_direct_exit(); } return HRTIMER_RESTART; diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 247bf0b1582c..701f35f0e2d4 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; + printk_prefer_direct_enter(); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); @@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event, if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); + printk_prefer_direct_exit(); + __this_cpu_write(hard_watchdog_warn, true); return; } |