summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks9
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/backtracetest.c11
-rw-r--r--kernel/bpf/core.c1
-rw-r--r--kernel/bpf/verifier.c76
-rw-r--r--kernel/cgroup/cpuset.c11
-rw-r--r--kernel/cpu.c91
-rw-r--r--kernel/dma/debug.c14
-rw-r--r--kernel/events/core.c12
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/events/uprobes.c8
-rw-r--r--kernel/fork.c25
-rw-r--r--kernel/futex.c188
-rw-r--r--kernel/iomem.c4
-rw-r--r--kernel/irq/devres.c3
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/timings.c522
-rw-r--r--kernel/irq_work.c75
-rw-r--r--kernel/jump_label.c63
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/latencytop.c29
-rw-r--r--kernel/livepatch/transition.c22
-rw-r--r--kernel/locking/Makefile5
-rw-r--r--kernel/locking/lock_events.c179
-rw-r--r--kernel/locking/lock_events.h59
-rw-r--r--kernel/locking/lock_events_list.h67
-rw-r--r--kernel/locking/lockdep.c357
-rw-r--r--kernel/locking/lockdep_internals.h34
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/percpu-rwsem.c2
-rw-r--r--kernel/locking/qspinlock.c8
-rw-r--r--kernel/locking/qspinlock_paravirt.h19
-rw-r--r--kernel/locking/qspinlock_stat.h242
-rw-r--r--kernel/locking/rwsem-spinlock.c339
-rw-r--r--kernel/locking/rwsem-xadd.c204
-rw-r--r--kernel/locking/rwsem.c25
-rw-r--r--kernel/locking/rwsem.h174
-rw-r--r--kernel/locking/spinlock.c7
-rw-r--r--kernel/locking/spinlock_debug.c6
-rw-r--r--kernel/module.c82
-rw-r--r--kernel/panic.c7
-rw-r--r--kernel/power/Kconfig9
-rw-r--r--kernel/power/hibernate.c12
-rw-r--r--kernel/power/snapshot.c5
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/rcuperf.c5
-rw-r--r--kernel/rcu/rcutorture.c21
-rw-r--r--kernel/rcu/srcutiny.c9
-rw-r--r--kernel/rcu/srcutree.c32
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c508
-rw-r--r--kernel/rcu/tree.h14
-rw-r--r--kernel/rcu/tree_exp.h36
-rw-r--r--kernel/rcu/tree_plugin.h257
-rw-r--r--kernel/rcu/tree_stall.h709
-rw-r--r--kernel/rcu/update.c59
-rw-r--r--kernel/resource.c11
-rw-r--r--kernel/rseq.c9
-rw-r--r--kernel/sched/core.c128
-rw-r--r--kernel/sched/cpufreq.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c1
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c25
-rw-r--r--kernel/sched/isolation.c18
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h18
-rw-r--r--kernel/sched/topology.c31
-rw-r--r--kernel/seccomp.c17
-rw-r--r--kernel/softirq.c51
-rw-r--r--kernel/stacktrace.c333
-rw-r--r--kernel/time/clockevents.c18
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/sched_clock.c4
-rw-r--r--kernel/time/tick-broadcast.c48
-rw-r--r--kernel/time/tick-common.c52
-rw-r--r--kernel/time/tick-internal.h10
-rw-r--r--kernel/time/tick-sched.c37
-rw-r--r--kernel/time/tick-sched.h13
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/time/timekeeping.c24
-rw-r--r--kernel/time/timer.c30
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/bpf_trace.c8
-rw-r--r--kernel/trace/trace.c105
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_branch.c4
-rw-r--r--kernel/trace/trace_events_hist.c14
-rw-r--r--kernel/trace/trace_stack.c85
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c54
-rw-r--r--kernel/workqueue_internal.h5
92 files changed, 3280 insertions, 2575 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index fbba478ae522..bf770d7556f7 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER
config RWSEM_SPIN_ON_OWNER
def_bool y
- depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
config LOCK_SPIN_ON_OWNER
def_bool y
@@ -251,3 +251,10 @@ config ARCH_USE_QUEUED_RWLOCKS
config QUEUED_RWLOCKS
def_bool y if ARCH_USE_QUEUED_RWLOCKS
depends on SMP
+
+config ARCH_HAS_MMIOWB
+ bool
+
+config MMIOWB
+ def_bool y if ARCH_HAS_MMIOWB
+ depends on SMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c57e78817da..62471e75a2b0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 1323360d90e3..a563c8fdad0d 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -48,19 +48,14 @@ static void backtrace_test_irq(void)
#ifdef CONFIG_STACKTRACE
static void backtrace_test_saved(void)
{
- struct stack_trace trace;
unsigned long entries[8];
+ unsigned int nr_entries;
pr_info("Testing a saved backtrace.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
- trace.nr_entries = 0;
- trace.max_entries = ARRAY_SIZE(entries);
- trace.entries = entries;
- trace.skip = 0;
-
- save_stack_trace(&trace);
- print_stack_trace(&trace, 0);
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ stack_trace_print(entries, nr_entries, 0);
}
#else
static void backtrace_test_saved(void)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ff09d32a8a1b..c605397c79f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
- bpf_jit_binary_unlock_ro(hdr);
bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c5a41f7f338..09d5d972c9ff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4138,15 +4138,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static void __find_good_pkt_pointers(struct bpf_func_state *state,
+ struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type, u16 new_range)
+{
+ struct bpf_reg_state *reg;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ reg = &state->regs[i];
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }
+
+ bpf_for_each_spilled_reg(i, state, reg) {
+ if (!reg)
+ continue;
+ if (reg->type == type && reg->id == dst_reg->id)
+ reg->range = max(reg->range, new_range);
+ }
+}
+
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type,
bool range_right_open)
{
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs, *reg;
u16 new_range;
- int i, j;
+ int i;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -4211,20 +4231,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == type && regs[i].id == dst_reg->id)
- /* keep the maximum range already checked */
- regs[i].range = max(regs[i].range, new_range);
-
- for (j = 0; j <= vstate->curframe; j++) {
- state = vstate->frame[j];
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
- }
+ for (i = 0; i <= vstate->curframe; i++)
+ __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
+ new_range);
}
/* compute branch direction of the expression "if (reg opcode val) goto target;"
@@ -4698,6 +4707,22 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
}
+static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
+ bool is_null)
+{
+ struct bpf_reg_state *reg;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
+
+ bpf_for_each_spilled_reg(i, state, reg) {
+ if (!reg)
+ continue;
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }
+}
+
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -4705,10 +4730,10 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *reg, *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i, j;
+ int i;
if (ref_obj_id && ref_obj_id == id && is_null)
/* regs[regno] is in the " == NULL" branch.
@@ -4717,17 +4742,8 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
*/
WARN_ON_ONCE(release_reference_state(state, id));
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &regs[i], id, is_null);
-
- for (j = 0; j <= vstate->curframe; j++) {
- state = vstate->frame[j];
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
- }
+ for (i = 0; i <= vstate->curframe; i++)
+ __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4834c4214e9c..6a1942ed781c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -740,11 +740,10 @@ static inline int nr_cpusets(void)
* Must be called with cpuset_mutex held.
*
* The three key local variables below are:
- * q - a linked-list queue of cpuset pointers, used to implement a
- * top-down scan of all cpusets. This scan loads a pointer
- * to each cpuset marked is_sched_load_balance into the
- * array 'csa'. For our purposes, rebuilding the schedulers
- * sched domains, we can ignore !is_sched_load_balance cpusets.
+ * cp - cpuset pointer, used (together with pos_css) to perform a
+ * top-down scan of all cpusets. For our purposes, rebuilding
+ * the schedulers sched domains, we can ignore !is_sched_load_
+ * balance cpusets.
* csa - (for CpuSet Array) Array of pointers to all the cpusets
* that need to be load balanced, for convenient iterative
* access by the subsequent code that finds the best partition,
@@ -775,7 +774,7 @@ static inline int nr_cpusets(void)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- struct cpuset *cp; /* scans q */
+ struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6754f3ecfd94..f2ef10460698 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -9,6 +9,7 @@
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
@@ -860,6 +861,8 @@ static int take_cpu_down(void *_param)
/* Give up timekeeping duties */
tick_handover_do_timer();
+ /* Remove CPU from timer broadcasting */
+ tick_offline_cpu(cpu);
/* Park the stopper thread */
stop_machine_park(cpu);
return 0;
@@ -1199,8 +1202,15 @@ int freeze_secondary_cpus(int primary)
int cpu, error = 0;
cpu_maps_update_begin();
- if (!cpu_online(primary))
+ if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
+ if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
+ primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+ } else {
+ if (!cpu_online(primary))
+ primary = cpumask_first(cpu_online_mask);
+ }
+
/*
* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -2033,19 +2043,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
-static const char *smt_states[] = {
- [CPU_SMT_ENABLED] = "on",
- [CPU_SMT_DISABLED] = "off",
- [CPU_SMT_FORCE_DISABLED] = "forceoff",
- [CPU_SMT_NOT_SUPPORTED] = "notsupported",
-};
-
-static ssize_t
-show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
-}
-
static void cpuhp_offline_cpu_device(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
@@ -2116,9 +2113,10 @@ static int cpuhp_smt_enable(void)
return ret;
}
+
static ssize_t
-store_smt_control(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
int ctrlval, ret;
@@ -2156,14 +2154,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr,
unlock_device_hotplug();
return ret ? ret : count;
}
+
+#else /* !CONFIG_HOTPLUG_SMT */
+static ssize_t
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_HOTPLUG_SMT */
+
+static const char *smt_states[] = {
+ [CPU_SMT_ENABLED] = "on",
+ [CPU_SMT_DISABLED] = "off",
+ [CPU_SMT_FORCE_DISABLED] = "forceoff",
+ [CPU_SMT_NOT_SUPPORTED] = "notsupported",
+ [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
+};
+
+static ssize_t
+show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ const char *state = smt_states[cpu_smt_control];
+
+ return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
+}
+
+static ssize_t
+store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ return __store_smt_control(dev, attr, buf, count);
+}
static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
static ssize_t
show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
{
- bool active = topology_max_smt_threads() > 1;
-
- return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
+ return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
}
static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
@@ -2179,21 +2207,17 @@ static const struct attribute_group cpuhp_smt_attr_group = {
NULL
};
-static int __init cpu_smt_state_init(void)
+static int __init cpu_smt_sysfs_init(void)
{
return sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpuhp_smt_attr_group);
}
-#else
-static inline int cpu_smt_state_init(void) { return 0; }
-#endif
-
static int __init cpuhp_sysfs_init(void)
{
int cpu, ret;
- ret = cpu_smt_state_init();
+ ret = cpu_smt_sysfs_init();
if (ret)
return ret;
@@ -2214,7 +2238,7 @@ static int __init cpuhp_sysfs_init(void)
return 0;
}
device_initcall(cpuhp_sysfs_init);
-#endif
+#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
/*
* cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -2304,3 +2328,18 @@ void __init boot_cpu_hotplug_init(void)
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
+
+enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+
+static int __init mitigations_parse_cmdline(char *arg)
+{
+ if (!strcmp(arg, "off"))
+ cpu_mitigations = CPU_MITIGATIONS_OFF;
+ else if (!strcmp(arg, "auto"))
+ cpu_mitigations = CPU_MITIGATIONS_AUTO;
+ else if (!strcmp(arg, "auto,nosmt"))
+ cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
+
+ return 0;
+}
+early_param("mitigations", mitigations_parse_cmdline);
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index a218e43cc382..badd77670d00 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -89,8 +89,8 @@ struct dma_debug_entry {
int sg_mapped_ents;
enum map_err_types map_err_type;
#ifdef CONFIG_STACKTRACE
- struct stack_trace stacktrace;
- unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
+ unsigned int stack_len;
+ unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
#endif
};
@@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
#ifdef CONFIG_STACKTRACE
if (entry) {
pr_warning("Mapped at:\n");
- print_stack_trace(&entry->stacktrace, 0);
+ stack_trace_print(entry->stack_entries, entry->stack_len, 0);
}
#endif
}
@@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void)
spin_unlock_irqrestore(&free_entries_lock, flags);
#ifdef CONFIG_STACKTRACE
- entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
- entry->stacktrace.entries = entry->st_entries;
- entry->stacktrace.skip = 1;
- save_stack_trace(&entry->stacktrace);
+ entry->stack_len = stack_trace_save(entry->stack_entries,
+ ARRAY_SIZE(entry->stack_entries),
+ 1);
#endif
-
return entry;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dc7dead2d2cc..abbd4b3b96c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2478,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
perf_pmu_enable(cpuctx->ctx.pmu);
}
+void perf_pmu_resched(struct pmu *pmu)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ perf_ctx_unlock(cpuctx, task_ctx);
+}
+
/*
* Cross CPU call to install and enable a performance event
*
@@ -11917,7 +11927,7 @@ static void __init perf_event_init_all_cpus(void)
}
}
-void perf_swevent_init_cpu(unsigned int cpu)
+static void perf_swevent_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 5eedb49a65ea..674b35383491 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -610,8 +610,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
* PMU requests more than one contiguous chunks of memory
* for SW double buffering
*/
- if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
- !overwrite) {
+ if (!overwrite) {
if (!max_order)
return -EINVAL;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c5cde87329c7..e6a0d6be87e3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
.priority = INT_MAX-1, /* notified after kprobes, kgdb */
};
-static int __init init_uprobes(void)
+void __init uprobes_init(void)
{
int i;
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- if (percpu_init_rwsem(&dup_mmap_sem))
- return -ENOMEM;
+ BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
- return register_die_notifier(&uprobe_exception_nb);
+ BUG_ON(register_die_notifier(&uprobe_exception_nb));
}
-__initcall(init_uprobes);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..fbe9dfcd8680 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -815,6 +815,7 @@ void __init fork_init(void)
#endif
lockdep_init_task(&init_task);
+ uprobes_init();
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk);
}
-/*
- * Allocate a new mm structure and copy contents from the
- * mm structure of the passed in task structure.
+/**
+ * dup_mm() - duplicates an existing mm structure
+ * @tsk: the task_struct with which the new mm will be associated.
+ * @oldmm: the mm to duplicate.
+ *
+ * Allocates a new mm structure and duplicates the provided @oldmm structure
+ * content into it.
+ *
+ * Return: the duplicated mm or NULL on failure.
*/
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk,
+ struct mm_struct *oldmm)
{
- struct mm_struct *mm, *oldmm = current->mm;
+ struct mm_struct *mm;
int err;
mm = allocate_mm();
@@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
}
retval = -ENOMEM;
- mm = dup_mm(tsk);
+ mm = dup_mm(tsk, current->mm);
if (!mm)
goto fail_nomem;
@@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)
return task;
}
+struct mm_struct *copy_init_mm(void)
+{
+ return dup_mm(NULL, &init_mm);
+}
+
/*
* Ok, this is the main fork-routine.
*
diff --git a/kernel/futex.c b/kernel/futex.c
index 9e40cf7be606..6262f1534ac9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
+ int err;
u32 uninitialized_var(curval);
if (unlikely(should_fail_futex(true)))
return -EFAULT;
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
- return -EFAULT;
+ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+ if (unlikely(err))
+ return err;
/* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
@@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
if (unlikely(should_fail_futex(true)))
ret = -EFAULT;
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
- ret = -EFAULT;
-
- } else if (curval != uval) {
+ ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+ if (!ret && (curval != uval)) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
* try the TID->0 transition) raced with a waiter setting the
@@ -1700,32 +1700,32 @@ retry_private:
double_lock_hb(hb1, hb2);
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
-
double_unlock_hb(hb1, hb2);
-#ifndef CONFIG_MMU
- /*
- * we don't get EFAULT from MMU faults if we don't have an MMU,
- * but we might get them from range checking
- */
- ret = op_ret;
- goto out_put_keys;
-#endif
-
- if (unlikely(op_ret != -EFAULT)) {
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
ret = op_ret;
goto out_put_keys;
}
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
- goto out_put_keys;
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ goto out_put_keys;
+ }
- if (!(flags & FLAGS_SHARED))
+ if (!(flags & FLAGS_SHARED)) {
+ cond_resched();
goto retry_private;
+ }
put_futex_key(&key2);
put_futex_key(&key1);
+ cond_resched();
goto retry;
}
@@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
u32 uval, uninitialized_var(curval), newval;
struct task_struct *oldowner, *newowner;
u32 newtid;
- int ret;
+ int ret, err = 0;
lockdep_assert_held(q->lock_ptr);
@@ -2421,14 +2421,17 @@ retry:
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
- if (get_futex_value_locked(&uval, uaddr))
- goto handle_fault;
+ err = get_futex_value_locked(&uval, uaddr);
+ if (err)
+ goto handle_err;
for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
- goto handle_fault;
+ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+ if (err)
+ goto handle_err;
+
if (curval == uval)
break;
uval = curval;
@@ -2456,23 +2459,37 @@ retry:
return 0;
/*
- * To handle the page fault we need to drop the locks here. That gives
- * the other task (either the highest priority waiter itself or the
- * task which stole the rtmutex) the chance to try the fixup of the
- * pi_state. So once we are back from handling the fault we need to
- * check the pi_state after reacquiring the locks and before trying to
- * do another fixup. When the fixup has been done already we simply
- * return.
+ * In order to reschedule or handle a page fault, we need to drop the
+ * locks here. In the case of a fault, this gives the other task
+ * (either the highest priority waiter itself or the task which stole
+ * the rtmutex) the chance to try the fixup of the pi_state. So once we
+ * are back from handling the fault we need to check the pi_state after
+ * reacquiring the locks and before trying to do another fixup. When
+ * the fixup has been done already we simply return.
*
* Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
* drop hb->lock since the caller owns the hb -> futex_q relation.
* Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
-handle_fault:
+handle_err:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
- ret = fault_in_user_writeable(uaddr);
+ switch (err) {
+ case -EFAULT:
+ ret = fault_in_user_writeable(uaddr);
+ break;
+
+ case -EAGAIN:
+ cond_resched();
+ ret = 0;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ ret = err;
+ break;
+ }
spin_lock(q->lock_ptr);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
@@ -3041,10 +3058,8 @@ retry:
* A unconditional UNLOCK_PI op raced against a waiter
* setting the FUTEX_WAITERS bit. Try again.
*/
- if (ret == -EAGAIN) {
- put_futex_key(&key);
- goto retry;
- }
+ if (ret == -EAGAIN)
+ goto pi_retry;
/*
* wake_futex_pi has detected invalid state. Tell user
* space.
@@ -3059,9 +3074,19 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
spin_unlock(&hb->lock);
- goto pi_faulted;
+ switch (ret) {
+ case -EFAULT:
+ goto pi_faulted;
+
+ case -EAGAIN:
+ goto pi_retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ goto out_putkey;
+ }
}
/*
@@ -3075,6 +3100,11 @@ out_putkey:
put_futex_key(&key);
return ret;
+pi_retry:
+ put_futex_key(&key);
+ cond_resched();
+ goto retry;
+
pi_faulted:
put_futex_key(&key);
@@ -3435,6 +3465,7 @@ err_unlock:
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
u32 uval, uninitialized_var(nval), mval;
+ int err;
/* Futex address must be 32bit aligned */
if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
@@ -3444,42 +3475,57 @@ retry:
if (get_user(uval, uaddr))
return -1;
- if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
- /*
- * Ok, this dying thread is truly holding a futex
- * of interest. Set the OWNER_DIED bit atomically
- * via cmpxchg, and if the value had FUTEX_WAITERS
- * set, wake up a waiter (if any). (We have to do a
- * futex_wake() even if OWNER_DIED is already set -
- * to handle the rare but possible case of recursive
- * thread-death.) The rest of the cleanup is done in
- * userspace.
- */
- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- /*
- * We are not holding a lock here, but we want to have
- * the pagefault_disable/enable() protection because
- * we want to handle the fault gracefully. If the
- * access fails we try to fault in the futex with R/W
- * verification via get_user_pages. get_user() above
- * does not guarantee R/W access. If that fails we
- * give up and leave the futex locked.
- */
- if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ return 0;
+
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
+ switch (err) {
+ case -EFAULT:
if (fault_in_user_writeable(uaddr))
return -1;
goto retry;
- }
- if (nval != uval)
+
+ case -EAGAIN:
+ cond_resched();
goto retry;
- /*
- * Wake robust non-PI futexes here. The wakeup of
- * PI futexes happens in exit_pi_state():
- */
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ default:
+ WARN_ON_ONCE(1);
+ return err;
+ }
}
+
+ if (nval != uval)
+ goto retry;
+
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi && (uval & FUTEX_WAITERS))
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+
return 0;
}
diff --git a/kernel/iomem.c b/kernel/iomem.c
index f7525e14ebc6..93c264444510 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size,
*
* MEMREMAP_WB - matches the default mapping for System RAM on
* the architecture. This is usually a read-allocate write-back cache.
- * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
* memremap() will bypass establishing a new mapping and instead return
* a pointer into the direct map.
*
@@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
/* Try all mapping types requested until one returns non-NULL */
if (flags & MEMREMAP_WB) {
/*
- * MEMREMAP_WB is special in that it can be satisifed
+ * MEMREMAP_WB is special in that it can be satisfied
* from the direct map. Some archs depend on the
* capability of memremap() to autodetect cases where
* the requested range is potentially in System RAM.
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index f808c6a97dcc..f6e5515ee077 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -220,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
irq_flow_handler_t handler)
{
struct irq_chip_generic *gc;
- unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
- gc = devm_kzalloc(dev, sz, GFP_KERNEL);
+ gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
if (gc)
irq_init_generic_chip(gc, name, num_ct,
irq_base, reg_base, handler);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1401afa0d58a..53a081392115 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
desc->affinity_notify = notify;
raw_spin_unlock_irqrestore(&desc->lock, flags);
- if (old_notify)
+ if (old_notify) {
+ cancel_work_sync(&old_notify->work);
kref_put(&old_notify->kref, old_notify->release);
+ }
return 0;
}
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 1e4cb63a5c82..90c735da15d0 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -9,6 +9,7 @@
#include <linux/idr.h>
#include <linux/irq.h>
#include <linux/math64.h>
+#include <linux/log2.h>
#include <trace/events/irq.h>
@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
DEFINE_PER_CPU(struct irq_timings, irq_timings);
-struct irqt_stat {
- u64 next_evt;
- u64 last_ts;
- u64 variance;
- u32 avg;
- u32 nr_samples;
- int anomalies;
- int valid;
-};
-
static DEFINE_IDR(irqt_stats);
void irq_timings_enable(void)
@@ -40,75 +31,360 @@ void irq_timings_disable(void)
static_branch_disable(&irq_timing_enabled);
}
-/**
- * irqs_update - update the irq timing statistics with a new timestamp
+/*
+ * The main goal of this algorithm is to predict the next interrupt
+ * occurrence on the current CPU.
+ *
+ * Currently, the interrupt timings are stored in a circular array
+ * buffer every time there is an interrupt, as a tuple: the interrupt
+ * number and the associated timestamp when the event occurred <irq,
+ * timestamp>.
+ *
+ * For every interrupt occurring in a short period of time, we can
+ * measure the elapsed time between the occurrences for the same
+ * interrupt and we end up with a suite of intervals. Experience
+ * showed that the interrupts often follow a periodic
+ * pattern.
+ *
+ * The objective of the algorithm is to find out this periodic pattern
+ * in the fastest way and use its period to predict the next irq event.
+ *
+ * When the next interrupt event is requested, we are in the situation
+ * where the interrupts are disabled and the circular buffer
+ * containing the timings is filled with the events which happened
+ * after the previous next-interrupt-event request.
+ *
+ * At this point, we read the circular buffer and we fill the irq
+ * related statistics structure. After this step, the circular array
+ * containing the timings is empty because all the values are
+ * dispatched in their corresponding buffers.
+ *
+ * Now for each interrupt, we can predict the next event by using the
+ * suffix array, log interval and exponential moving average
+ *
+ * 1. Suffix array
+ *
+ * Suffix array is an array of all the suffixes of a string. It is
+ * widely used as a data structure for compression, text search, ...
+ * For instance for the word 'banana', the suffixes will be: 'banana'
+ * 'anana' 'nana' 'ana' 'na' 'a'
+ *
+ * Usually, the suffix array is sorted but for our purpose it is
+ * not necessary and won't provide any improvement in the context of
+ * the solved problem where we clearly define the boundaries of the
+ * search by a max period and min period.
+ *
+ * The suffix array will build a suite of intervals of different
+ * length and will look for the repetition of each suite. If the suite
+ * is repeating then we have the period because it is the length of
+ * the suite whatever its position in the buffer.
+ *
+ * 2. Log interval
+ *
+ * We saw the irq timings allow to compute the interval of the
+ * occurrences for a specific interrupt. We can reasonably assume the
+ * longer the interval is, the higher the error is for the next event
+ * and we can consider storing those interval values into an array
+ * where each slot in the array corresponds to an interval at the power
+ * of 2 of the index. For example, index 12 will contain values
+ * between 2^12 and 2^13 - 1.
+ *
+ * At the end we have an array of values where each index defines a
+ * [2^index, 2^(index + 1) - 1] interval of values, allowing to store a
+ * large number of values inside a small array.
+ *
+ * For example, if we have the value 1123, then we store it at
+ * ilog2(1123) = 10 index value.
+ *
+ * Storing those values at the specific index is done by computing an
+ * exponential moving average for this specific slot. For instance,
+ * the values 1800, 1123, 1453, ... fall under the same slot (10) and
+ * the exponential moving average is computed every time a new value
+ * is stored at this slot.
+ *
+ * 3. Exponential Moving Average
+ *
+ * The EMA is largely used to track a signal for stocks or as a low
+ * pass filter. The magic of the formula is that it is very simple and
+ * the reactivity of the average can be tuned with a factor called
+ * alpha.
+ *
+ * The higher the alpha is, the faster the average responds to the
+ * signal change. In our case, if a slot in the array is a big
+ * interval, we can have numbers with a big difference between
+ * them. The impact of those differences in the average computation
+ * can be tuned by changing the alpha value.
+ *
+ *
+ * -- The algorithm --
+ *
+ * We saw the different processing above, now let's see how they are
+ * used together.
+ *
+ * For each interrupt:
+ * For each interval:
+ * Compute the index = ilog2(interval)
+ * Compute a new_ema(buffer[index], interval)
+ * Store the index in a circular buffer
+ *
+ * Compute the suffix array of the indexes
+ *
+ * For each suffix:
+ * If the suffix is reverse-found 3 times
+ * Return suffix
+ *
+ * Return Not found
+ *
+ * However we can not build an endless suffix array, it won't
+ * make sense and it will add an extra overhead, so we can restrict
+ * this to a maximum suffix length of 5 and a minimum suffix length of
+ * 2. Experience showed that 5 is the maximum pattern period found
+ * for the majority of the different devices.
+ *
+ * The result is a pattern search taking less than 1us for an interrupt.
*
- * @irqs: an irqt_stat struct pointer
- * @ts: the new timestamp
+ * Example based on real values:
*
- * The statistics are computed online, in other words, the code is
- * designed to compute the statistics on a stream of values rather
- * than doing multiple passes on the values to compute the average,
- * then the variance. The integer division introduces a loss of
- * precision but with an acceptable error margin regarding the results
- * we would have with the double floating precision: we are dealing
- * with nanosec, so big numbers, consequently the mantisse is
- * negligeable, especially when converting the time in usec
- * afterwards.
+ * Example 1 : MMC write/read interrupt interval:
*
- * The computation happens at idle time. When the CPU is not idle, the
- * interrupts' timestamps are stored in the circular buffer, when the
- * CPU goes idle and this routine is called, all the buffer's values
- * are injected in the statistical model continuying to extend the
- * statistics from the previous busy-idle cycle.
+ * 223947, 1240, 1384, 1386, 1386,
+ * 217416, 1236, 1384, 1386, 1387,
+ * 214719, 1241, 1386, 1387, 1384,
+ * 213696, 1234, 1384, 1386, 1388,
+ * 219904, 1240, 1385, 1389, 1385,
+ * 212240, 1240, 1386, 1386, 1386,
+ * 214415, 1236, 1384, 1386, 1387,
+ * 214276, 1234, 1384, 1388, ?
*
- * The observations showed a device will trigger a burst of periodic
- * interrupts followed by one or two peaks of longer time, for
- * instance when a SD card device flushes its cache, then the periodic
- * intervals occur again. A one second inactivity period resets the
- * stats, that gives us the certitude the statistical values won't
- * exceed 1x10^9, thus the computation won't overflow.
+ * For each element, apply ilog2(value)
*
- * Basically, the purpose of the algorithm is to watch the periodic
- * interrupts and eliminate the peaks.
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, ?
*
- * An interrupt is considered periodically stable if the interval of
- * its occurences follow the normal distribution, thus the values
- * comply with:
+ * Max period of 5, we take the last (max_period * 3) 15 elements as
+ * we can be confident if the pattern repeats itself three times it is
+ * a repeating pattern.
*
- * avg - 3 x stddev < value < avg + 3 x stddev
+ * 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, ?
*
- * Which can be simplified to:
+ * Suffixes are:
*
- * -3 x stddev < value - avg < 3 x stddev
+ * 1) 8, 15, 8, 8, 8 <- max period
+ * 2) 8, 15, 8, 8
+ * 3) 8, 15, 8
+ * 4) 8, 15 <- min period
*
- * abs(value - avg) < 3 x stddev
+ * From there we search the repeating pattern for each suffix.
*
- * In order to save a costly square root computation, we use the
- * variance. For the record, stddev = sqrt(variance). The equation
- * above becomes:
+ * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
+ * | | | | | | | | | | | | | | |
+ * 8, 15, 8, 8, 8 | | | | | | | | | |
+ * 8, 15, 8, 8, 8 | | | | |
+ * 8, 15, 8, 8, 8
*
- * abs(value - avg) < 3 x sqrt(variance)
+ * When moving the suffix, we found exactly 3 matches.
*
- * And finally we square it:
+ * The first suffix with period 5 is repeating.
*
- * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
+ * The next event is (3 * max_period) % suffix_period
*
- * (value - avg) x (value - avg) < 9 x variance
+ * In this example, the result is 0, so the next event is suffix[0] => 8
*
- * Statistically speaking, any values out of this interval is
- * considered as an anomaly and is discarded. However, a normal
- * distribution appears when the number of samples is 30 (it is the
- * rule of thumb in statistics, cf. "30 samples" on Internet). When
- * there are three consecutive anomalies, the statistics are resetted.
+ * However, 8 is the index in the array of exponential moving average
+ * which was calculated on the fly when storing the values, so the
+ * interval is ema[8] = 1366
*
+ *
+ * Example 2:
+ *
+ * 4, 3, 5, 100,
+ * 3, 3, 5, 117,
+ * 4, 4, 5, 112,
+ * 4, 3, 4, 110,
+ * 3, 5, 3, 117,
+ * 4, 4, 5, 112,
+ * 4, 3, 4, 110,
+ * 3, 4, 5, 112,
+ * 4, 3, 4, 110
+ *
+ * ilog2
+ *
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4
+ *
+ * Max period 5:
+ * 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4
+ *
+ * Suffixes:
+ *
+ * 1) 0, 0, 4, 0, 0
+ * 2) 0, 0, 4, 0
+ * 3) 0, 0, 4
+ * 4) 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ * | | | | | | X
+ * 0, 0, 4, 0, 0, | X
+ * 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ * | | | | | | | | | | | | | | |
+ * 0, 0, 4, 0, | | | | | | | | | | |
+ * 0, 0, 4, 0, | | | | | | |
+ * 0, 0, 4, 0, | | |
+ * 0 0 4
+ *
+ * Pattern is found 3 times, the remaining is 1 which results from
+ * (max_period * 3) % suffix_period. This value is the index in the
+ * suffix arrays. The suffix array for a period 4 has the value 4
+ * at index 1.
+ */
+#define EMA_ALPHA_VAL 64
+#define EMA_ALPHA_SHIFT 7
+
+#define PREDICTION_PERIOD_MIN 2
+#define PREDICTION_PERIOD_MAX 5
+#define PREDICTION_FACTOR 4
+#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
+#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
+
+struct irqt_stat {
+ u64 last_ts;
+ u64 ema_time[PREDICTION_BUFFER_SIZE];
+ int timings[IRQ_TIMINGS_SIZE];
+ int circ_timings[IRQ_TIMINGS_SIZE];
+ int count;
+};
+
+/*
+ * Exponential moving average computation
*/
-static void irqs_update(struct irqt_stat *irqs, u64 ts)
+static u64 irq_timings_ema_new(u64 value, u64 ema_old)
+{
+ s64 diff;
+
+ if (unlikely(!ema_old))
+ return value;
+
+ diff = (value - ema_old) * EMA_ALPHA_VAL;
+ /*
+ * We can use a s64 type variable to be added with the u64
+ * ema_old variable as this one will never have its topmost
+ * bit set, it will be always smaller than 2^63 nanosec
+ * interrupt interval (292 years).
+ */
+ return ema_old + (diff >> EMA_ALPHA_SHIFT);
+}
+
+static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
+{
+ int i;
+
+ /*
+ * The buffer contains the suite of intervals, in an ilog2
+ * basis, and we are looking for a repetition. We point the
+ * beginning of the search at three times the length of the
+ * period, counting back from the end of the buffer. We do that
+ * for each suffix.
+ */
+ for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
+
+ int *begin = &buffer[len - (i * 3)];
+ int *ptr = begin;
+
+ /*
+ * We check whether the suite with period 'i' repeats
+ * itself. If it is truncated at the end, as it
+ * repeats we can use the period to find out the next
+ * element.
+ */
+ while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
+ ptr += i;
+ if (ptr >= &buffer[len])
+ return begin[((i * 3) % i)];
+ }
+ }
+
+ return -1;
+}
+
+static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
+{
+ int index, i, period_max, count, start, min = INT_MAX;
+
+ if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
+ irqs->count = irqs->last_ts = 0;
+ return U64_MAX;
+ }
+
+ /*
+ * As we want to find three times the repetition, we need a
+ * number of intervals greater or equal to three times the
+ * maximum period, otherwise we truncate the max period.
+ */
+ period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
+ PREDICTION_PERIOD_MAX : irqs->count / 3;
+
+ /*
+ * If we don't have enough irq timings for this prediction,
+ * just bail out.
+ */
+ if (period_max <= PREDICTION_PERIOD_MIN)
+ return U64_MAX;
+
+ /*
+ * 'count' depends on whether the circular buffer wrapped or not
+ */
+ count = irqs->count < IRQ_TIMINGS_SIZE ?
+ irqs->count : IRQ_TIMINGS_SIZE;
+
+ start = irqs->count < IRQ_TIMINGS_SIZE ?
+ 0 : (irqs->count & IRQ_TIMINGS_MASK);
+
+ /*
+ * Copy the content of the circular buffer into another buffer
+ * in order to linearize the buffer instead of dealing with
+ * wrapping indexes and shifted array which will be prone to
+ * error and extremely difficult to debug.
+ */
+ for (i = 0; i < count; i++) {
+ int index = (start + i) & IRQ_TIMINGS_MASK;
+
+ irqs->timings[i] = irqs->circ_timings[index];
+ min = min_t(int, irqs->timings[i], min);
+ }
+
+ index = irq_timings_next_event_index(irqs->timings, count, period_max);
+ if (index < 0)
+ return irqs->last_ts + irqs->ema_time[min];
+
+ return irqs->last_ts + irqs->ema_time[index];
+}
+
+static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
{
u64 old_ts = irqs->last_ts;
- u64 variance = 0;
u64 interval;
- s64 diff;
+ int index;
/*
* The timestamps are absolute time values, we need to compute
@@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)
* want as we need another timestamp to compute an interval.
*/
if (interval >= NSEC_PER_SEC) {
- memset(irqs, 0, sizeof(*irqs));
- irqs->last_ts = ts;
+ irqs->count = 0;
return;
}
/*
- * Pre-compute the delta with the average as the result is
- * used several times in this function.
- */
- diff = interval - irqs->avg;
-
- /*
- * Increment the number of samples.
- */
- irqs->nr_samples++;
-
- /*
- * Online variance divided by the number of elements if there
- * is more than one sample. Normally the formula is division
- * by nr_samples - 1 but we assume the number of element will be
- * more than 32 and dividing by 32 instead of 31 is enough
- * precise.
- */
- if (likely(irqs->nr_samples > 1))
- variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
-
- /*
- * The rule of thumb in statistics for the normal distribution
- * is having at least 30 samples in order to have the model to
- * apply. Values outside the interval are considered as an
- * anomaly.
- */
- if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
- /*
- * After three consecutive anomalies, we reset the
- * stats as it is no longer stable enough.
- */
- if (irqs->anomalies++ >= 3) {
- memset(irqs, 0, sizeof(*irqs));
- irqs->last_ts = ts;
- return;
- }
- } else {
- /*
- * The anomalies must be consecutives, so at this
- * point, we reset the anomalies counter.
- */
- irqs->anomalies = 0;
- }
-
- /*
- * The interrupt is considered stable enough to try to predict
- * the next event on it.
+ * Get the index in the ema table for this interrupt. The
+ * PREDICTION_FACTOR increases the interval size for the array
+ * of exponential average.
*/
- irqs->valid = 1;
+ index = likely(interval) ?
+ ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
/*
- * Online average algorithm:
- *
- * new_average = average + ((value - average) / count)
- *
- * The variance computation depends on the new average
- * to be computed here first.
- *
+ * Store the index as an element of the pattern in another
+ * circular array.
*/
- irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
+ irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
- /*
- * Online variance algorithm:
- *
- * new_variance = variance + (value - average) x (value - new_average)
- *
- * Warning: irqs->avg is updated with the line above, hence
- * 'interval - irqs->avg' is no longer equal to 'diff'
- */
- irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
+ irqs->ema_time[index] = irq_timings_ema_new(interval,
+ irqs->ema_time[index]);
- /*
- * Update the next event
- */
- irqs->next_evt = ts + irqs->avg;
+ irqs->count++;
}
/**
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
*/
lockdep_assert_irqs_disabled();
+ if (!irqts->count)
+ return next_evt;
+
/*
* Number of elements in the circular buffer: If it happens it
* was flushed before, then the number of elements could be
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
* type but with the cost of extra computation in the
* interrupt handler hot path. We choose efficiency.
*
- * Inject measured irq/timestamp to the statistical model
- * while decrementing the counter because we consume the data
- * from our circular buffer.
+ * Inject measured irq/timestamp to the pattern prediction
+ * model while decrementing the counter because we consume the
+ * data from our circular buffer.
*/
- for (i = irqts->count & IRQ_TIMINGS_MASK,
- irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
- irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
- irq = irq_timing_decode(irqts->values[i], &ts);
+ i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
+ irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
+ for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+ irq = irq_timing_decode(irqts->values[i], &ts);
s = idr_find(&irqt_stats, irq);
- if (s) {
- irqs = this_cpu_ptr(s);
- irqs_update(irqs, ts);
- }
+ if (s)
+ irq_timings_store(irq, this_cpu_ptr(s), ts);
}
/*
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)
irqs = this_cpu_ptr(s);
- if (!irqs->valid)
- continue;
+ ts = __irq_timings_next_event(irqs, i, now);
+ if (ts <= now)
+ return now;
- if (irqs->next_evt <= now) {
- irq = i;
- next_evt = now;
-
- /*
- * This interrupt mustn't use in the future
- * until new events occur and update the
- * statistics.
- */
- irqs->valid = 0;
- break;
- }
-
- if (irqs->next_evt < next_evt) {
- irq = i;
- next_evt = irqs->next_evt;
- }
+ if (ts < next_evt)
+ next_evt = ts;
}
return next_evt;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 6b7cdf17ccf8..73288914ed5e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void)
*/
}
-/*
- * Enqueue the irq_work @work on @cpu unless it's already pending
- * somewhere.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue_on(struct irq_work *work, int cpu)
+/* Enqueue on current CPU, work must already be claimed and preempt disabled */
+static void __irq_work_queue_local(struct irq_work *work)
{
- /* All work should have been flushed before going offline */
- WARN_ON_ONCE(cpu_is_offline(cpu));
-
-#ifdef CONFIG_SMP
-
- /* Arch remote IPI send/receive backend aren't NMI safe */
- WARN_ON_ONCE(in_nmi());
+ /* If the work is "lazy", handle it from next tick if any */
+ if (work->flags & IRQ_WORK_LAZY) {
+ if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+ tick_nohz_tick_stopped())
+ arch_irq_work_raise();
+ } else {
+ if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+ arch_irq_work_raise();
+ }
+}
+/* Enqueue the irq work @work on the current CPU */
+bool irq_work_queue(struct irq_work *work)
+{
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
- arch_send_call_function_single_ipi(cpu);
-
-#else /* #ifdef CONFIG_SMP */
- irq_work_queue(work);
-#endif /* #else #ifdef CONFIG_SMP */
+ /* Queue the entry and raise the IPI if needed. */
+ preempt_disable();
+ __irq_work_queue_local(work);
+ preempt_enable();
return true;
}
+EXPORT_SYMBOL_GPL(irq_work_queue);
-/* Enqueue the irq work @work on the current CPU */
-bool irq_work_queue(struct irq_work *work)
+/*
+ * Enqueue the irq_work @work on @cpu unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
{
+#ifndef CONFIG_SMP
+ return irq_work_queue(work);
+
+#else /* CONFIG_SMP: */
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(cpu));
+
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
- /* Queue the entry and raise the IPI if needed. */
preempt_disable();
-
- /* If the work is "lazy", handle it from next tick if any */
- if (work->flags & IRQ_WORK_LAZY) {
- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
- tick_nohz_tick_stopped())
- arch_irq_work_raise();
+ if (cpu != smp_processor_id()) {
+ /* Arch remote IPI send/receive backend aren't NMI safe */
+ WARN_ON_ONCE(in_nmi());
+ if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+ arch_send_call_function_single_ipi(cpu);
} else {
- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
- arch_irq_work_raise();
+ __irq_work_queue_local(work);
}
-
preempt_enable();
return true;
+#endif /* CONFIG_SMP */
}
-EXPORT_SYMBOL_GPL(irq_work_queue);
+
bool irq_work_needs_cpu(void)
{
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index bad96b476eb6..de6efdecc70d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -202,11 +202,13 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static void __static_key_slow_dec_cpuslocked(struct static_key *key,
- unsigned long rate_limit,
- struct delayed_work *work)
+static bool static_key_slow_try_dec(struct static_key *key)
{
- lockdep_assert_cpus_held();
+ int val;
+
+ val = atomic_fetch_add_unless(&key->enabled, -1, 1);
+ if (val == 1)
+ return false;
/*
* The negative count check is valid even when a negative
@@ -215,63 +217,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key,
* returns is unbalanced, because all other static_key_slow_inc()
* instances block while the update is in progress.
*/
- if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
- WARN(atomic_read(&key->enabled) < 0,
- "jump label: negative count!\n");
+ WARN(val < 0, "jump label: negative count!\n");
+ return true;
+}
+
+static void __static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+ lockdep_assert_cpus_held();
+
+ if (static_key_slow_try_dec(key))
return;
- }
- if (rate_limit) {
- atomic_inc(&key->enabled);
- schedule_delayed_work(work, rate_limit);
- } else {
+ jump_label_lock();
+ if (atomic_dec_and_test(&key->enabled))
jump_label_update(key);
- }
jump_label_unlock();
}
-static void __static_key_slow_dec(struct static_key *key,
- unsigned long rate_limit,
- struct delayed_work *work)
+static void __static_key_slow_dec(struct static_key *key)
{
cpus_read_lock();
- __static_key_slow_dec_cpuslocked(key, rate_limit, work);
+ __static_key_slow_dec_cpuslocked(key);
cpus_read_unlock();
}
-static void jump_label_update_timeout(struct work_struct *work)
+void jump_label_update_timeout(struct work_struct *work)
{
struct static_key_deferred *key =
container_of(work, struct static_key_deferred, work.work);
- __static_key_slow_dec(&key->key, 0, NULL);
+ __static_key_slow_dec(&key->key);
}
+EXPORT_SYMBOL_GPL(jump_label_update_timeout);
void static_key_slow_dec(struct static_key *key)
{
STATIC_KEY_CHECK_USE(key);
- __static_key_slow_dec(key, 0, NULL);
+ __static_key_slow_dec(key);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_cpuslocked(struct static_key *key)
{
STATIC_KEY_CHECK_USE(key);
- __static_key_slow_dec_cpuslocked(key, 0, NULL);
+ __static_key_slow_dec_cpuslocked(key);
}
-void static_key_slow_dec_deferred(struct static_key_deferred *key)
+void __static_key_slow_dec_deferred(struct static_key *key,
+ struct delayed_work *work,
+ unsigned long timeout)
{
STATIC_KEY_CHECK_USE(key);
- __static_key_slow_dec(&key->key, key->timeout, &key->work);
+
+ if (static_key_slow_try_dec(key))
+ return;
+
+ schedule_delayed_work(work, timeout);
}
-EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred);
-void static_key_deferred_flush(struct static_key_deferred *key)
+void __static_key_deferred_flush(void *key, struct delayed_work *work)
{
STATIC_KEY_CHECK_USE(key);
- flush_delayed_work(&key->work);
+ flush_delayed_work(work);
}
-EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+EXPORT_SYMBOL_GPL(__static_key_deferred_flush);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d7140447be75..fd5c95ff9251 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1150,7 +1150,7 @@ int kernel_kexec(void)
error = dpm_suspend_end(PMSG_FREEZE);
if (error)
goto Resume_devices;
- error = disable_nonboot_cpus();
+ error = suspend_disable_secondary_cpus();
if (error)
goto Enable_cpus;
local_irq_disable();
@@ -1183,7 +1183,7 @@ int kernel_kexec(void)
Enable_irqs:
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
dpm_resume_start(PMSG_RESTORE);
Resume_devices:
dpm_resume_end(PMSG_RESTORE);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 96b4179cee6a..99a5b5f46dc5 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk,
break;
}
- /* 0 and ULONG_MAX entries mean end of backtrace: */
- if (record == 0 || record == ULONG_MAX)
+ /* 0 entry marks end of backtrace: */
+ if (!record)
break;
}
if (same) {
@@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
-/*
- * Iterator to store a backtrace into a latency record entry
- */
-static inline void store_stacktrace(struct task_struct *tsk,
- struct latency_record *lat)
-{
- struct stack_trace trace;
-
- memset(&trace, 0, sizeof(trace));
- trace.max_entries = LT_BACKTRACEDEPTH;
- trace.entries = &lat->backtrace[0];
- save_stack_trace_tsk(tsk, &trace);
-}
-
/**
* __account_scheduler_latency - record an occurred latency
* @tsk - the task struct of the task hitting the latency
@@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
lat.count = 1;
lat.time = usecs;
lat.max = usecs;
- store_stacktrace(tsk, &lat);
+
+ stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);
raw_spin_lock_irqsave(&latency_lock, flags);
@@ -210,8 +197,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
break;
}
- /* 0 and ULONG_MAX entries mean end of backtrace: */
- if (record == 0 || record == ULONG_MAX)
+ /* 0 entry is end of backtrace */
+ if (!record)
break;
}
if (same) {
@@ -252,10 +239,10 @@ static int lstats_show(struct seq_file *m, void *v)
lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long bt = lr->backtrace[q];
+
if (!bt)
break;
- if (bt == ULONG_MAX)
- break;
+
seq_printf(m, " %ps", (void *)bt);
}
seq_puts(m, "\n");
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 9c89ae8b337a..c53370d596be 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task)
* Determine whether the given stack trace includes any references to a
* to-be-patched or to-be-unpatched function.
*/
-static int klp_check_stack_func(struct klp_func *func,
- struct stack_trace *trace)
+static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
+ unsigned int nr_entries)
{
unsigned long func_addr, func_size, address;
struct klp_ops *ops;
int i;
- for (i = 0; i < trace->nr_entries; i++) {
- address = trace->entries[i];
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (klp_target_state == KLP_UNPATCHED) {
/*
@@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func,
static int klp_check_stack(struct task_struct *task, char *err_buf)
{
static unsigned long entries[MAX_STACK_ENTRIES];
- struct stack_trace trace;
struct klp_object *obj;
struct klp_func *func;
- int ret;
+ int ret, nr_entries;
- trace.skip = 0;
- trace.nr_entries = 0;
- trace.max_entries = MAX_STACK_ENTRIES;
- trace.entries = entries;
- ret = save_stack_trace_tsk_reliable(task, &trace);
+ ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
WARN_ON_ONCE(ret == -ENOSYS);
- if (ret) {
+ if (ret < 0) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d has an unreliable stack\n",
__func__, task->comm, task->pid);
return ret;
}
+ nr_entries = ret;
klp_for_each_object(klp_transition_patch, obj) {
if (!obj->patched)
continue;
klp_for_each_func(obj, func) {
- ret = klp_check_stack_func(func, &trace);
+ ret = klp_check_stack_func(func, entries, nr_entries);
if (ret) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d is sleeping on function %s\n",
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 392c7f23af76..6fe2f333aecb 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
-obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
+obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
new file mode 100644
index 000000000000..fa2c2f951c6b
--- /dev/null
+++ b/kernel/locking/lock_events.c
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * Collect locking event counts
+ */
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/fs.h>
+
+#include "lock_events.h"
+
+#undef LOCK_EVENT
+#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
+
+#define LOCK_EVENTS_DIR "lock_event_counts"
+
+/*
+ * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different
+ * types of locks will be reported under the <debugfs>/lock_event_counts/
+ * directory. See lock_events_list.h for the list of available locking
+ * events.
+ *
+ * Writing to the special ".reset_counts" file will reset all the above
+ * locking event counts. This is a very slow operation and so should not
+ * be done frequently.
+ *
+ * These event counts are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counts usable even in a production
+ * environment.
+ *
+ * The name table below is indexed by event ID. With LOCK_EVENT() redefined
+ * above, each entry of lock_events_list.h expands to a designated
+ * initializer that stores the event name as a string. The one extra slot,
+ * at index LOCKEVENT_reset_cnts (== lockevent_num), names the reset file.
+ */
+static const char * const lockevent_names[lockevent_num + 1] = {
+
+#include "lock_events_list.h"
+
+ [LOCKEVENT_reset_cnts] = ".reset_counts",
+};
+
+/*
+ * Per-cpu counts
+ */
+DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]);
+
+/*
+ * Default read handler of a lock event count file. It is declared __weak,
+ * so another definition of lockevent_read() can take its place.
+ *
+ * The event ID comes from the inode's i_private. The per-cpu counters of
+ * that event are added up over all possible CPUs and the total is copied
+ * to user space as a decimal number followed by a newline. IDs that are
+ * not below lockevent_num are rejected with -EBADF.
+ */
+ssize_t __weak lockevent_read(struct file *file, char __user *user_buf,
+			      size_t count, loff_t *ppos)
+{
+	char buf[64];
+	u64 total = 0;
+	int id = (long)file_inode(file)->i_private;
+	int cpu, len;
+
+	/* Only real event IDs have a counter behind them. */
+	if (id >= lockevent_num)
+		return -EBADF;
+
+	for_each_possible_cpu(cpu)
+		total += per_cpu(lockevents[id], cpu);
+
+	len = snprintf(buf, sizeof(buf) - 1, "%llu\n", total);
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Write handler shared by all lock event files.
+ *
+ * Only the file whose i_private holds LOCKEVENT_reset_cnts acts on a
+ * write: every counter of every possible CPU is set to zero. A write to
+ * any other file changes nothing. In both cases the whole write is
+ * reported as consumed by returning @count.
+ */
+static ssize_t lockevent_write(struct file *file, const char __user *user_buf,
+			       size_t count, loff_t *ppos)
+{
+	long id = (long)file_inode(file)->i_private;
+	int cpu, ev;
+
+	if (id == LOCKEVENT_reset_cnts) {
+		for_each_possible_cpu(cpu) {
+			unsigned long *counts = per_cpu_ptr(lockevents, cpu);
+
+			for (ev = 0; ev < lockevent_num; ev++)
+				WRITE_ONCE(counts[ev], 0);
+		}
+	}
+
+	return count;
+}
+
+/*
+ * Debugfs data structures
+ *
+ * One set of file operations serves both the per-event count files and the
+ * reset file; the handlers tell them apart by the event ID stored in the
+ * inode's i_private.
+ */
+static const struct file_operations fops_lockevent = {
+ .read = lockevent_read,
+ .write = lockevent_write,
+ .llseek = default_llseek,
+};
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#include <asm/paravirt.h>
+
+/*
+ * Tell whether the debugfs file of the event called @name should be left
+ * out. Returns true to skip the event, false to create its file.
+ *
+ * pv_on starts at -1 and is computed on the first call as
+ * !pv_is_native_spin_unlock(); later calls reuse the cached value.
+ */
+static bool __init skip_lockevent(const char *name)
+{
+ static int pv_on __initdata = -1;
+
+ if (pv_on < 0)
+ pv_on = !pv_is_native_spin_unlock();
+ /*
+ * Skip PV qspinlock events on bare metal.
+ * Those events are recognized by the "pv_" prefix of their name.
+ */
+ if (!pv_on && !memcmp(name, "pv_", 3))
+ return true;
+ return false;
+}
+#else
+/* Without CONFIG_PARAVIRT_SPINLOCKS no event is ever skipped. */
+static inline bool skip_lockevent(const char *name)
+{
+ return false;
+}
+#endif
+
+/*
+ * Initialize debugfs for the locking event counts.
+ *
+ * Creates the LOCK_EVENTS_DIR directory at the debugfs root, one file with
+ * mode 0400 per event that skip_lockevent() does not filter out, and the
+ * ".reset_counts" file with mode 0200. The event ID is stored as the
+ * private data of each file.
+ *
+ * Returns 0 on success. If anything cannot be created, whatever was created
+ * so far is removed again, a warning is printed and -ENOMEM is returned.
+ *
+ * The debugfs creation helpers may report failure with an ERR_PTR() encoded
+ * pointer rather than NULL, so a plain NULL test would miss the failure.
+ * IS_ERR_OR_NULL() treats both forms as an error.
+ */
+static int __init init_lockevent_counts(void)
+{
+	struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
+	struct dentry *d_file;
+	int i;
+
+	if (IS_ERR_OR_NULL(d_counts))
+		goto out;
+
+	/*
+	 * Create the debugfs files
+	 *
+	 * As reading from and writing to the stat files can be slow, only
+	 * root is allowed to do the read/write to limit impact to system
+	 * performance.
+	 */
+	for (i = 0; i < lockevent_num; i++) {
+		if (skip_lockevent(lockevent_names[i]))
+			continue;
+		d_file = debugfs_create_file(lockevent_names[i], 0400, d_counts,
+					     (void *)(long)i, &fops_lockevent);
+		if (IS_ERR_OR_NULL(d_file))
+			goto fail_undo;
+	}
+
+	d_file = debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+				     d_counts, (void *)(long)LOCKEVENT_reset_cnts,
+				     &fops_lockevent);
+	if (IS_ERR_OR_NULL(d_file))
+		goto fail_undo;
+
+	return 0;
+fail_undo:
+	debugfs_remove_recursive(d_counts);
+out:
+	pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR);
+	return -ENOMEM;
+}
+fs_initcall(init_lockevent_counts);
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
new file mode 100644
index 000000000000..feb1acc54611
--- /dev/null
+++ b/kernel/locking/lock_events.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#ifndef __LOCKING_LOCK_EVENTS_H
+#define __LOCKING_LOCK_EVENTS_H
+
+/*
+ * Event IDs. Unless the includer has defined LOCK_EVENT() itself, every
+ * LOCK_EVENT(name) line of lock_events_list.h expands to an enumerator
+ * called LOCKEVENT_name, so the IDs follow the order of that list.
+ */
+enum lock_events {
+
+#include "lock_events_list.h"
+
+ lockevent_num, /* Total number of lock event counts */
+ LOCKEVENT_reset_cnts = lockevent_num, /* ID of the reset file, one past the last real event */
+};
+
+#ifdef CONFIG_LOCK_EVENT_COUNTS
+/*
+ * Per-cpu counters
+ */
+DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
+
+/*
+ * Increment the count of locking event @event in the current CPU's copy of
+ * lockevents[], but only when @cond is true.
+ *
+ * lockevent_inc(ev) counts unconditionally and lockevent_cond_inc(ev, c)
+ * counts when @c is true; both take the event name without its
+ * LOCKEVENT_ prefix.
+ */
+static inline void __lockevent_inc(enum lock_events event, bool cond)
+{
+ if (cond)
+ __this_cpu_inc(lockevents[event]);
+}
+
+#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
+#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
+
+/*
+ * Add @inc to the count of locking event @event in the current CPU's copy
+ * of lockevents[]. lockevent_add(ev, c) is the front end that takes the
+ * event name without its LOCKEVENT_ prefix.
+ */
+static inline void __lockevent_add(enum lock_events event, int inc)
+{
+ __this_cpu_add(lockevents[event], inc);
+}
+
+#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
+
+#else /* CONFIG_LOCK_EVENT_COUNTS */
+
+#define lockevent_inc(ev)
+#define lockevent_add(ev, c)
+#define lockevent_cond_inc(ev, c)
+
+#endif /* CONFIG_LOCK_EVENT_COUNTS */
+#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
new file mode 100644
index 000000000000..ad7668cfc9da
--- /dev/null
+++ b/kernel/locking/lock_events_list.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#ifndef LOCK_EVENT
+#define LOCK_EVENT(name) LOCKEVENT_ ## name,
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * Locking events for PV qspinlock.
+ */
+LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */
+LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */
+LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */
+LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */
+LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */
+LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */
+LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */
+LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */
+LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */
+LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */
+LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+/*
+ * Locking events for qspinlock
+ *
+ * Subtracting lock_use_node[234] from lock_slowpath will give you
+ * lock_use_node1.
+ */
+LOCK_EVENT(lock_pending) /* # of locking ops via pending code */
+LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */
+LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */
+LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */
+LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */
+LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+/*
+ * Locking events for rwsem
+ */
+LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
+LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
+LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
+LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
+LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */
+LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */
+LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
+LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
+LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
+LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */
+LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
+LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
+LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e221be724fe8..d06190fa5082 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -434,29 +434,14 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
-static int save_trace(struct stack_trace *trace)
+static int save_trace(struct lock_trace *trace)
{
- trace->nr_entries = 0;
- trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- trace->entries = stack_trace + nr_stack_trace_entries;
-
- trace->skip = 3;
-
- save_stack_trace(trace);
-
- /*
- * Some daft arches put -1 at the end to indicate its a full trace.
- *
- * <rant> this is buggy anyway, since it takes a whole extra entry so a
- * complete trace that maxes out the entries provided will be reported
- * as incomplete, friggin useless </rant>
- */
- if (trace->nr_entries != 0 &&
- trace->entries[trace->nr_entries-1] == ULONG_MAX)
- trace->nr_entries--;
-
- trace->max_entries = trace->nr_entries;
+ unsigned long *entries = stack_trace + nr_stack_trace_entries;
+ unsigned int max_entries;
+ trace->offset = nr_stack_trace_entries;
+ max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ trace->nr_entries = stack_trace_save(entries, max_entries, 3);
nr_stack_trace_entries += trace->nr_entries;
if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
@@ -516,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
{
char c = '.';
- if (class->usage_mask & lock_flag(bit + 2))
+ if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
c = '+';
if (class->usage_mask & lock_flag(bit)) {
c = '-';
- if (class->usage_mask & lock_flag(bit + 2))
+ if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
c = '?';
}
@@ -649,6 +634,9 @@ static int static_obj(const void *obj)
end = (unsigned long) &_end,
addr = (unsigned long) obj;
+ if (arch_is_kernel_initmem_freed(addr))
+ return 0;
+
/*
* static variable?
*/
@@ -1207,7 +1195,7 @@ static struct lock_list *alloc_list_entry(void)
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
unsigned long ip, int distance,
- struct stack_trace *trace)
+ struct lock_trace *trace)
{
struct lock_list *entry;
/*
@@ -1426,6 +1414,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
* checking.
*/
+/*
+ * Print the stack trace described by @trace. A lock_trace does not carry a
+ * pointer to its entries: they live in the global stack_trace[] array,
+ * starting at index trace->offset and spanning trace->nr_entries slots.
+ * @spaces is passed through to stack_trace_print().
+ */
+static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
+{
+ unsigned long *entries = stack_trace + trace->offset;
+
+ stack_trace_print(entries, trace->nr_entries, spaces);
+}
+
/*
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
@@ -1438,8 +1433,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
printk("\n-> #%u", depth);
print_lock_name(target->class);
printk(KERN_CONT ":\n");
- print_stack_trace(&target->trace, 6);
-
+ print_lock_trace(&target->trace, 6);
return 0;
}
@@ -1533,10 +1527,9 @@ static inline int class_equal(struct lock_list *entry, void *data)
}
static noinline int print_circular_bug(struct lock_list *this,
- struct lock_list *target,
- struct held_lock *check_src,
- struct held_lock *check_tgt,
- struct stack_trace *trace)
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1676,19 +1669,25 @@ check_redundant(struct lock_list *root, struct lock_class *target,
}
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+
+/*
+ * Search callback that collects usage bits instead of looking for a match:
+ * the usage_mask of @entry's class is OR-ed into the unsigned long that
+ * @mask points to. It always returns 0, presumably so that the search
+ * never stops early and visits the whole subgraph -- confirm against the
+ * BFS implementation.
+ */
+static inline int usage_accumulate(struct lock_list *entry, void *mask)
+{
+ *(unsigned long *)mask |= entry->class->usage_mask;
+
+ return 0;
+}
+
/*
* Forwards and backwards subgraph searching, for the purposes of
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
*/
-static inline int usage_match(struct lock_list *entry, void *bit)
+static inline int usage_match(struct lock_list *entry, void *mask)
{
- return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+ return entry->class->usage_mask & *(unsigned long *)mask;
}
-
-
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
@@ -1700,14 +1699,14 @@ static inline int usage_match(struct lock_list *entry, void *bit)
* Return <0 on error.
*/
static int
-find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
int result;
debug_atomic_inc(nr_find_usage_forwards_checks);
- result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
return result;
}
@@ -1723,14 +1722,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
* Return <0 on error.
*/
static int
-find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
int result;
debug_atomic_inc(nr_find_usage_backwards_checks);
- result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
return result;
}
@@ -1752,7 +1751,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
len += printk("%*s %s", depth, "", usage_str[bit]);
len += printk(KERN_CONT " at:\n");
- print_stack_trace(class->usage_traces + bit, len);
+ print_lock_trace(class->usage_traces + bit, len);
}
}
printk("%*s }\n", depth, "");
@@ -1777,7 +1776,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
do {
print_lock_class_header(entry->class, depth);
printk("%*s ... acquired at:\n", depth, "");
- print_stack_trace(&entry->trace, 2);
+ print_lock_trace(&entry->trace, 2);
printk("\n");
if (depth == 0 && (entry != root)) {
@@ -1890,14 +1889,14 @@ print_bad_irq_dependency(struct task_struct *curr,
print_lock_name(backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
- print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
+ print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
print_lock_name(forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
- print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
+ print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);
pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -1922,39 +1921,6 @@ print_bad_irq_dependency(struct task_struct *curr,
return 0;
}
-static int
-check_usage(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, enum lock_usage_bit bit_backwards,
- enum lock_usage_bit bit_forwards, const char *irqclass)
-{
- int ret;
- struct lock_list this, that;
- struct lock_list *uninitialized_var(target_entry);
- struct lock_list *uninitialized_var(target_entry1);
-
- this.parent = NULL;
-
- this.class = hlock_class(prev);
- ret = find_usage_backwards(&this, bit_backwards, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
-
- that.parent = NULL;
- that.class = hlock_class(next);
- ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
-
- return print_bad_irq_dependency(curr, &this, &that,
- target_entry, target_entry1,
- prev, next,
- bit_backwards, bit_forwards, irqclass);
-}
-
static const char *state_names[] = {
#define LOCKDEP_STATE(__STATE) \
__stringify(__STATE),
@@ -1971,9 +1937,19 @@ static const char *state_rnames[] = {
static inline const char *state_name(enum lock_usage_bit bit)
{
- return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+ if (bit & LOCK_USAGE_READ_MASK)
+ return state_rnames[bit >> LOCK_USAGE_DIR_MASK];
+ else
+ return state_names[bit >> LOCK_USAGE_DIR_MASK];
}
+/*
+ * The bit number is encoded like:
+ *
+ * bit0: 0 exclusive, 1 read lock
+ * bit1: 0 used in irq, 1 irq enabled
+ * bit2-n: state
+ */
static int exclusive_bit(int new_bit)
{
int state = new_bit & LOCK_USAGE_STATE_MASK;
@@ -1985,45 +1961,160 @@ static int exclusive_bit(int new_bit)
return state | (dir ^ LOCK_USAGE_DIR_MASK);
}
+/*
+ * Flip the direction (USED_IN <-> ENABLED) of every usage bit in @mask.
+ *
+ * With the bit number encoding described above, shifting a whole mask right
+ * by one lowers each contained bit number by one, and shifting left raises
+ * it by one. The direction lives in bitnr1, so moving a bit number by
+ * LOCK_USAGE_DIR_MASK positions toggles it:
+ *
+ *  - LOCK_ENABLED_* bits (bitnr1 == 1) turn into the matching
+ *    LOCK_USED_IN_* bits by shifting right;
+ *  - LOCK_USED_IN_* bits (bitnr1 == 0) turn into the matching
+ *    LOCK_ENABLED_* bits by shifting left.
+ *
+ * The mask is therefore split into its two halves (together,
+ * LOCKF_ENABLED_IRQ_ALL and LOCKF_USED_IN_IRQ_ALL select every IRQ usage
+ * bit), each half is shifted its own way and the results are merged.
+ */
+static unsigned long invert_dir_mask(unsigned long mask)
+{
+	unsigned long enabled = mask & LOCKF_ENABLED_IRQ_ALL;
+	unsigned long used_in = mask & LOCKF_USED_IN_IRQ_ALL;
+
+	return (enabled >> LOCK_USAGE_DIR_MASK) |
+	       (used_in << LOCK_USAGE_DIR_MASK);
+}
+
+/*
+ * Build the mask of usages that are exclusive with those in @mask.
+ *
+ * The direction of every bit is flipped first. The read flavour (bitnr0)
+ * is then folded away with mask arithmetic: each LOCK_*_READ bit also sets
+ * its non-read LOCK_* counterpart, after which all read bits are cleared.
+ */
+static unsigned long exclusive_mask(unsigned long mask)
+{
+	unsigned long flipped = invert_dir_mask(mask);
+	unsigned long read_bits = flipped & LOCKF_IRQ_READ;
+
+	/* Strip read: keep the non-read twin of every read bit. */
+	return (flipped | (read_bits >> LOCK_USAGE_READ_MASK)) &
+	       ~LOCKF_IRQ_READ;
+}
+
+/*
+ * Go the other way round from exclusive_mask(): given an exclusive usage
+ * mask, return every usage bit that it _may_ have been derived from.
+ *
+ * An exclusive bit can stem from two original bits, one with
+ * LOCK_USAGE_READ_MASK set and one without, so for each non-read bit
+ * obtained by flipping the direction the read twin is reported as well.
+ */
+static unsigned long original_mask(unsigned long mask)
+{
+	unsigned long flipped = invert_dir_mask(mask);
+
+	/* Include read in existing usages */
+	return flipped | ((flipped & LOCKF_IRQ) << LOCK_USAGE_READ_MASK);
+}
+
+/*
+ * Find the first pair of bit match between an original
+ * usage mask and an exclusive usage mask.
+ *
+ * The bits set in @mask are walked with LOCK_USED as the upper bound. For
+ * each of them the incompatible usage bit is computed with
+ * exclusive_bit() and looked up in @excl_mask.
+ *
+ * On the first hit the usage bit is stored in *@bitp, its exclusive
+ * counterpart in *@excl_bitp, and 0 is returned. If no pair is found,
+ * -1 is returned and neither output is written.
+ */
+static int find_exclusive_match(unsigned long mask,
+ unsigned long excl_mask,
+ enum lock_usage_bit *bitp,
+ enum lock_usage_bit *excl_bitp)
+{
+ int bit, excl;
+
+ for_each_set_bit(bit, &mask, LOCK_USED) {
+ excl = exclusive_bit(bit);
+ if (excl_mask & lock_flag(excl)) {
+ *bitp = bit;
+ *excl_bitp = excl;
+ return 0;
+ }
+ }
+ return -1;
+}
+
+/*
+ * Prove that the new dependency does not connect a hardirq-safe(-read)
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, enum lock_usage_bit bit)
+ struct held_lock *next)
{
+ unsigned long usage_mask = 0, forward_mask, backward_mask;
+ enum lock_usage_bit forward_bit = 0, backward_bit = 0;
+ struct lock_list *uninitialized_var(target_entry1);
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list this, that;
+ int ret;
+
/*
- * Prove that the new dependency does not connect a hardirq-safe
- * lock with a hardirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
+ * Step 1: gather all hard/soft IRQs usages backward in an
+ * accumulated usage mask.
*/
- if (!check_usage(curr, prev, next, bit,
- exclusive_bit(bit), state_name(bit)))
- return 0;
+ this.parent = NULL;
+ this.class = hlock_class(prev);
- bit++; /* _READ */
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+
+ usage_mask &= LOCKF_USED_IN_IRQ_ALL;
+ if (!usage_mask)
+ return 1;
/*
- * Prove that the new dependency does not connect a hardirq-safe-read
- * lock with a hardirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
+ * Step 2: find exclusive uses forward that match the previous
+ * backward accumulated mask.
*/
- if (!check_usage(curr, prev, next, bit,
- exclusive_bit(bit), state_name(bit)))
- return 0;
+ forward_mask = exclusive_mask(usage_mask);
- return 1;
-}
+ that.parent = NULL;
+ that.class = hlock_class(next);
-static int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
-{
-#define LOCKDEP_STATE(__STATE) \
- if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
- return 0;
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
+ ret = find_usage_forwards(&that, forward_mask, &target_entry1);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
- return 1;
+ /*
+ * Step 3: we found a bad match! Now retrieve a lock from the backward
+ * list whose usage mask matches the exclusive usage mask from the
+ * lock found on the forward list.
+ */
+ backward_mask = original_mask(target_entry1->class->usage_mask);
+
+ ret = find_usage_backwards(&this, backward_mask, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (DEBUG_LOCKS_WARN_ON(ret == 1))
+ return 1;
+
+ /*
+ * Step 4: narrow down to a pair of incompatible usage bits
+ * and report it.
+ */
+ ret = find_exclusive_match(target_entry->class->usage_mask,
+ target_entry1->class->usage_mask,
+ &backward_bit, &forward_bit);
+ if (DEBUG_LOCKS_WARN_ON(ret == -1))
+ return 1;
+
+ return print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
+ backward_bit, forward_bit,
+ state_name(backward_bit));
}
static void inc_chains(void)
@@ -2040,9 +2131,8 @@ static void inc_chains(void)
#else
-static inline int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
+static inline int check_irq_usage(struct task_struct *curr,
+ struct held_lock *prev, struct held_lock *next)
{
return 1;
}
@@ -2170,8 +2260,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, struct stack_trace *trace,
- int (*save)(struct stack_trace *trace))
+ struct held_lock *next, int distance, struct lock_trace *trace)
{
struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
@@ -2209,20 +2298,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
this.parent = NULL;
ret = check_noncircular(&this, hlock_class(prev), &target_entry);
if (unlikely(!ret)) {
- if (!trace->entries) {
+ if (!trace->nr_entries) {
/*
- * If @save fails here, the printing might trigger
- * a WARN but because of the !nr_entries it should
- * not do bad things.
+ * If save_trace fails here, the printing might
+ * trigger a WARN but because of the !nr_entries it
+ * should not do bad things.
*/
- save(trace);
+ save_trace(trace);
}
- return print_circular_bug(&this, target_entry, next, prev, trace);
+ return print_circular_bug(&this, target_entry, next, prev);
}
else if (unlikely(ret < 0))
return print_bfs_bug(ret);
- if (!check_prev_add_irq(curr, prev, next))
+ if (!check_irq_usage(curr, prev, next))
return 0;
/*
@@ -2265,7 +2354,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return print_bfs_bug(ret);
- if (!trace->entries && !save(trace))
+ if (!trace->nr_entries && !save_trace(trace))
return 0;
/*
@@ -2297,14 +2386,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_trace trace = { .nr_entries = 0 };
int depth = curr->lockdep_depth;
struct held_lock *hlock;
- struct stack_trace trace = {
- .nr_entries = 0,
- .max_entries = 0,
- .entries = NULL,
- .skip = 0,
- };
/*
* Debugging checks.
@@ -2330,7 +2414,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
* added:
*/
if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+ int ret = check_prev_add(curr, hlock, next, distance,
+ &trace);
if (!ret)
return 0;
@@ -2731,6 +2816,10 @@ static inline int validate_chain(struct task_struct *curr,
{
return 1;
}
+
+static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
+{
+}
#endif
/*
@@ -2784,6 +2873,12 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+
+
static void
print_usage_bug_scenario(struct held_lock *lock)
{
@@ -2827,7 +2922,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
print_lock(this);
pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+ print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);
print_irqtrace_events(curr);
pr_warn("\nother info that might help us debug this:\n");
@@ -2853,10 +2948,6 @@ valid_state(struct task_struct *curr, struct held_lock *this,
return 1;
}
-static int mark_lock(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit);
-
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* print irq inversion bug:
@@ -2936,7 +3027,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
root.parent = NULL;
root.class = hlock_class(this);
- ret = find_usage_forwards(&root, bit, &target_entry);
+ ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
if (ret < 0)
return print_bfs_bug(ret);
if (ret == 1)
@@ -2960,7 +3051,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
root.parent = NULL;
root.class = hlock_class(this);
- ret = find_usage_backwards(&root, bit, &target_entry);
+ ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
if (ret < 0)
return print_bfs_bug(ret);
if (ret == 1)
@@ -3015,7 +3106,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {
static inline int state_verbose(enum lock_usage_bit bit,
struct lock_class *class)
{
- return state_verbose_f[bit >> 2](class);
+ return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);
}
typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
@@ -3157,7 +3248,7 @@ void lockdep_hardirqs_on(unsigned long ip)
/*
* See the fine text that goes along with this variable definition.
*/
- if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
+ if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))
return;
/*
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index d4c197425f68..150ec3f0c5b5 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -42,13 +42,35 @@ enum {
__LOCKF(USED)
};
-#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
-#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
+static const unsigned long LOCKF_ENABLED_IRQ =
+#include "lockdep_states.h"
+ 0;
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
+static const unsigned long LOCKF_USED_IN_IRQ =
+#include "lockdep_states.h"
+ 0;
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
+static const unsigned long LOCKF_ENABLED_IRQ_READ =
+#include "lockdep_states.h"
+ 0;
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
+static const unsigned long LOCKF_USED_IN_IRQ_READ =
+#include "lockdep_states.h"
+ 0;
+#undef LOCKDEP_STATE
+
+#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
+#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
-#define LOCKF_ENABLED_IRQ_READ \
- (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
-#define LOCKF_USED_IN_IRQ_READ \
- (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ)
+#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)
/*
* CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ad40a2617063..80a463d31a8d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -829,7 +829,9 @@ static void lock_torture_cleanup(void)
"End of test: SUCCESS");
kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
kfree(cxt.lrsa);
+ cxt.lrsa = NULL;
end:
torture_cleanup_end();
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 883cf1b92d90..f17dad99eec8 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,6 +7,8 @@
#include <linux/sched.h>
#include <linux/errno.h>
+#include "rwsem.h"
+
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5e9247dc2515..e14b32c69639 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* 0,1,0 -> 0,0,1
*/
clear_pending_set_locked(lock);
- qstat_inc(qstat_lock_pending, true);
+ lockevent_inc(lock_pending);
return;
/*
@@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* queuing.
*/
queue:
- qstat_inc(qstat_lock_slowpath, true);
+ lockevent_inc(lock_slowpath);
pv_queue:
node = this_cpu_ptr(&qnodes[0].mcs);
idx = node->count++;
@@ -419,7 +419,7 @@ pv_queue:
* simple enough.
*/
if (unlikely(idx >= MAX_NODES)) {
- qstat_inc(qstat_lock_no_node, true);
+ lockevent_inc(lock_no_node);
while (!queued_spin_trylock(lock))
cpu_relax();
goto release;
@@ -430,7 +430,7 @@ pv_queue:
/*
* Keep counts of non-zero index values:
*/
- qstat_inc(qstat_lock_use_node2 + idx - 1, idx);
+ lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
/*
* Ensure that we increment the head node->count before initialising
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8f36c27c1794..89bab079e7a4 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
if (!(val & _Q_LOCKED_PENDING_MASK) &&
(cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
- qstat_inc(qstat_pv_lock_stealing, true);
+ lockevent_inc(pv_lock_stealing);
return true;
}
if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
@@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
- qstat_hop(hopcnt);
+ lockevent_pv_hop(hopcnt);
return &he->lock;
}
}
@@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
smp_store_mb(pn->state, vcpu_halted);
if (!READ_ONCE(node->locked)) {
- qstat_inc(qstat_pv_wait_node, true);
- qstat_inc(qstat_pv_wait_early, wait_early);
+ lockevent_inc(pv_wait_node);
+ lockevent_cond_inc(pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
}
@@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
- qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
+ lockevent_cond_inc(pv_spurious_wakeup,
+ !READ_ONCE(node->locked));
}
/*
@@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
/*
* Tracking # of slowpath locking operations
*/
- qstat_inc(qstat_lock_slowpath, true);
+ lockevent_inc(lock_slowpath);
for (;; waitcnt++) {
/*
@@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
}
}
WRITE_ONCE(pn->state, vcpu_hashed);
- qstat_inc(qstat_pv_wait_head, true);
- qstat_inc(qstat_pv_wait_again, waitcnt);
+ lockevent_inc(pv_wait_head);
+ lockevent_cond_inc(pv_wait_again, waitcnt);
pv_wait(&lock->locked, _Q_SLOW_VAL);
/*
@@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
- qstat_inc(qstat_pv_kick_unlock, true);
+ lockevent_inc(pv_kick_unlock);
pv_kick(node->cpu);
}
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index d73f85388d5c..54152670ff24 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -9,262 +9,105 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * Authors: Waiman Long <waiman.long@hpe.com>
+ * Authors: Waiman Long <longman@redhat.com>
*/
-/*
- * When queued spinlock statistical counters are enabled, the following
- * debugfs files will be created for reporting the counter values:
- *
- * <debugfs>/qlockstat/
- * pv_hash_hops - average # of hops per hashing operation
- * pv_kick_unlock - # of vCPU kicks issued at unlock time
- * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
- * pv_latency_kick - average latency (ns) of vCPU kick operation
- * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
- * pv_lock_stealing - # of lock stealing operations
- * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
- * pv_wait_again - # of wait's after a queue head vCPU kick
- * pv_wait_early - # of early vCPU wait's
- * pv_wait_head - # of vCPU wait's at the queue head
- * pv_wait_node - # of vCPU wait's at a non-head queue node
- * lock_pending - # of locking operations via pending code
- * lock_slowpath - # of locking operations via MCS lock queue
- * lock_use_node2 - # of locking operations that use 2nd per-CPU node
- * lock_use_node3 - # of locking operations that use 3rd per-CPU node
- * lock_use_node4 - # of locking operations that use 4th per-CPU node
- * lock_no_node - # of locking operations without using per-CPU node
- *
- * Subtracting lock_use_node[234] from lock_slowpath will give you
- * lock_use_node1.
- *
- * Writing to the "reset_counters" file will reset all the above counter
- * values.
- *
- * These statistical counters are implemented as per-cpu variables which are
- * summed and computed whenever the corresponding debugfs files are read. This
- * minimizes added overhead making the counters usable even in a production
- * environment.
- *
- * There may be slight difference between pv_kick_wake and pv_kick_unlock.
- */
-enum qlock_stats {
- qstat_pv_hash_hops,
- qstat_pv_kick_unlock,
- qstat_pv_kick_wake,
- qstat_pv_latency_kick,
- qstat_pv_latency_wake,
- qstat_pv_lock_stealing,
- qstat_pv_spurious_wakeup,
- qstat_pv_wait_again,
- qstat_pv_wait_early,
- qstat_pv_wait_head,
- qstat_pv_wait_node,
- qstat_lock_pending,
- qstat_lock_slowpath,
- qstat_lock_use_node2,
- qstat_lock_use_node3,
- qstat_lock_use_node4,
- qstat_lock_no_node,
- qstat_num, /* Total number of statistical counters */
- qstat_reset_cnts = qstat_num,
-};
+#include "lock_events.h"
-#ifdef CONFIG_QUEUED_LOCK_STAT
+#ifdef CONFIG_LOCK_EVENT_COUNTS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
/*
- * Collect pvqspinlock statistics
+ * Collect pvqspinlock locking event counts
*/
-#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/fs.h>
-static const char * const qstat_names[qstat_num + 1] = {
- [qstat_pv_hash_hops] = "pv_hash_hops",
- [qstat_pv_kick_unlock] = "pv_kick_unlock",
- [qstat_pv_kick_wake] = "pv_kick_wake",
- [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
- [qstat_pv_latency_kick] = "pv_latency_kick",
- [qstat_pv_latency_wake] = "pv_latency_wake",
- [qstat_pv_lock_stealing] = "pv_lock_stealing",
- [qstat_pv_wait_again] = "pv_wait_again",
- [qstat_pv_wait_early] = "pv_wait_early",
- [qstat_pv_wait_head] = "pv_wait_head",
- [qstat_pv_wait_node] = "pv_wait_node",
- [qstat_lock_pending] = "lock_pending",
- [qstat_lock_slowpath] = "lock_slowpath",
- [qstat_lock_use_node2] = "lock_use_node2",
- [qstat_lock_use_node3] = "lock_use_node3",
- [qstat_lock_use_node4] = "lock_use_node4",
- [qstat_lock_no_node] = "lock_no_node",
- [qstat_reset_cnts] = "reset_counters",
-};
+#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev]
/*
- * Per-cpu counters
+ * PV specific per-cpu counter
*/
-static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
static DEFINE_PER_CPU(u64, pv_kick_time);
/*
- * Function to read and return the qlock statistical counter values
+ * Function to read and return the PV qspinlock counts.
*
* The following counters are handled specially:
- * 1. qstat_pv_latency_kick
+ * 1. pv_latency_kick
* Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
- * 2. qstat_pv_latency_wake
+ * 2. pv_latency_wake
* Average wake latency (ns) = pv_latency_wake/pv_kick_wake
- * 3. qstat_pv_hash_hops
+ * 3. pv_hash_hops
* Average hops/hash = pv_hash_hops/pv_kick_unlock
*/
-static ssize_t qstat_read(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[64];
- int cpu, counter, len;
- u64 stat = 0, kicks = 0;
+ int cpu, id, len;
+ u64 sum = 0, kicks = 0;
/*
* Get the counter ID stored in file->f_inode->i_private
*/
- counter = (long)file_inode(file)->i_private;
+ id = (long)file_inode(file)->i_private;
- if (counter >= qstat_num)
+ if (id >= lockevent_num)
return -EBADF;
for_each_possible_cpu(cpu) {
- stat += per_cpu(qstats[counter], cpu);
+ sum += per_cpu(lockevents[id], cpu);
/*
- * Need to sum additional counter for some of them
+ * Need to sum additional counters for some of them
*/
- switch (counter) {
+ switch (id) {
- case qstat_pv_latency_kick:
- case qstat_pv_hash_hops:
- kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+ case LOCKEVENT_pv_latency_kick:
+ case LOCKEVENT_pv_hash_hops:
+ kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);
break;
- case qstat_pv_latency_wake:
- kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+ case LOCKEVENT_pv_latency_wake:
+ kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);
break;
}
}
- if (counter == qstat_pv_hash_hops) {
+ if (id == LOCKEVENT_pv_hash_hops) {
u64 frac = 0;
if (kicks) {
- frac = 100ULL * do_div(stat, kicks);
+ frac = 100ULL * do_div(sum, kicks);
frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
}
/*
* Return a X.XX decimal number
*/
- len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n",
+ sum, frac);
} else {
/*
* Round to the nearest ns
*/
- if ((counter == qstat_pv_latency_kick) ||
- (counter == qstat_pv_latency_wake)) {
+ if ((id == LOCKEVENT_pv_latency_kick) ||
+ (id == LOCKEVENT_pv_latency_wake)) {
if (kicks)
- stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+ sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);
}
- len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
}
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
/*
- * Function to handle write request
- *
- * When counter = reset_cnts, reset all the counter values.
- * Since the counter updates aren't atomic, the resetting is done twice
- * to make sure that the counters are very likely to be all cleared.
- */
-static ssize_t qstat_write(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- int cpu;
-
- /*
- * Get the counter ID stored in file->f_inode->i_private
- */
- if ((long)file_inode(file)->i_private != qstat_reset_cnts)
- return count;
-
- for_each_possible_cpu(cpu) {
- int i;
- unsigned long *ptr = per_cpu_ptr(qstats, cpu);
-
- for (i = 0 ; i < qstat_num; i++)
- WRITE_ONCE(ptr[i], 0);
- }
- return count;
-}
-
-/*
- * Debugfs data structures
- */
-static const struct file_operations fops_qstat = {
- .read = qstat_read,
- .write = qstat_write,
- .llseek = default_llseek,
-};
-
-/*
- * Initialize debugfs for the qspinlock statistical counters
- */
-static int __init init_qspinlock_stat(void)
-{
- struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
- int i;
-
- if (!d_qstat)
- goto out;
-
- /*
- * Create the debugfs files
- *
- * As reading from and writing to the stat files can be slow, only
- * root is allowed to do the read/write to limit impact to system
- * performance.
- */
- for (i = 0; i < qstat_num; i++)
- if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
- (void *)(long)i, &fops_qstat))
- goto fail_undo;
-
- if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
- (void *)(long)qstat_reset_cnts, &fops_qstat))
- goto fail_undo;
-
- return 0;
-fail_undo:
- debugfs_remove_recursive(d_qstat);
-out:
- pr_warn("Could not create 'qlockstat' debugfs entries\n");
- return -ENOMEM;
-}
-fs_initcall(init_qspinlock_stat);
-
-/*
- * Increment the PV qspinlock statistical counters
- */
-static inline void qstat_inc(enum qlock_stats stat, bool cond)
-{
- if (cond)
- this_cpu_inc(qstats[stat]);
-}
-
-/*
* PV hash hop count
*/
-static inline void qstat_hop(int hopcnt)
+static inline void lockevent_pv_hop(int hopcnt)
{
- this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+ this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);
}
/*
@@ -276,7 +119,7 @@ static inline void __pv_kick(int cpu)
per_cpu(pv_kick_time, cpu) = start;
pv_kick(cpu);
- this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+ this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);
}
/*
@@ -289,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
*pkick_time = 0;
pv_wait(ptr, val);
if (*pkick_time) {
- this_cpu_add(qstats[qstat_pv_latency_wake],
+ this_cpu_add(EVENT_COUNT(pv_latency_wake),
sched_clock() - *pkick_time);
- qstat_inc(qstat_pv_kick_wake, true);
+ lockevent_inc(pv_kick_wake);
}
}
#define pv_kick(c) __pv_kick(c)
#define pv_wait(p, v) __pv_wait(p, v)
-#else /* CONFIG_QUEUED_LOCK_STAT */
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_LOCK_EVENT_COUNTS */
-static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
-static inline void qstat_hop(int hopcnt) { }
+static inline void lockevent_pv_hop(int hopcnt) { }
-#endif /* CONFIG_QUEUED_LOCK_STAT */
+#endif /* CONFIG_LOCK_EVENT_COUNTS */
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index a7ffb2a96ede..000000000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,339 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
- * generic spinlock implementation
- *
- * Copyright (c) 2001 David Howells (dhowells@redhat.com).
- * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
- * - Derived also from comments by Linus
- */
-#include <linux/rwsem.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/export.h>
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-int rwsem_is_locked(struct rw_semaphore *sem)
-{
- int ret = 1;
- unsigned long flags;
-
- if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
- ret = (sem->count != 0);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- }
- return ret;
-}
-EXPORT_SYMBOL(rwsem_is_locked);
-
-/*
- * initialise the semaphore
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- sem->count = 0;
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-}
-EXPORT_SYMBOL(__init_rwsem);
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here, then:
- * - the 'active count' _reached_ zero
- * - the 'waiting count' is non-zero
- * - the spinlock must be held by the caller
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if wakewrite is non-zero
- */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
-{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- int woken;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wakewrite)
- /* Wake up a writer. Note that we do not grant it the
- * lock - it will have to acquire it when it runs. */
- wake_up_process(waiter->task);
- goto out;
- }
-
- /* grant an infinite number of read locks to the front of the queue */
- woken = 0;
- do {
- struct list_head *next = waiter->list.next;
-
- list_del(&waiter->list);
- tsk = waiter->task;
- /*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
- */
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
- woken++;
- if (next == &sem->wait_list)
- break;
- waiter = list_entry(next, struct rwsem_waiter, list);
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
- sem->count += woken;
-
- out:
- return sem;
-}
-
-/*
- * wake a single writer
- */
-static inline struct rw_semaphore *
-__rwsem_wake_one_writer(struct rw_semaphore *sem)
-{
- struct rwsem_waiter *waiter;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
- wake_up_process(waiter->task);
-
- return sem;
-}
-
-/*
- * get a read lock on the semaphore
- */
-int __sched __down_read_common(struct rw_semaphore *sem, int state)
-{
- struct rwsem_waiter waiter;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->count++;
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- goto out;
- }
-
- /* set up my own style of waitqueue */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(current);
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* wait to be given the lock */
- for (;;) {
- if (!waiter.task)
- break;
- if (signal_pending_state(state, current))
- goto out_nolock;
- set_current_state(state);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- schedule();
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- out:
- return 0;
-
-out_nolock:
- /*
- * We didn't take the lock, so that there is a writer, which
- * is owner or the first waiter of the sem. If it's a waiter,
- * it will be woken by current owner. Not need to wake anybody.
- */
- list_del(&waiter.list);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- return -EINTR;
-}
-
-void __sched __down_read(struct rw_semaphore *sem)
-{
- __down_read_common(sem, TASK_UNINTERRUPTIBLE);
-}
-
-int __sched __down_read_killable(struct rw_semaphore *sem)
-{
- return __down_read_common(sem, TASK_KILLABLE);
-}
-
-/*
- * trylock for reading -- returns 1 if successful, 0 if contention
- */
-int __down_read_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->count++;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * get a write lock on the semaphore
- */
-int __sched __down_write_common(struct rw_semaphore *sem, int state)
-{
- struct rwsem_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* set up my own style of waitqueue */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* wait for someone to release the lock */
- for (;;) {
- /*
- * That is the key to support write lock stealing: allows the
- * task already on CPU to get the lock soon rather than put
- * itself into sleep and waiting for system woke it or someone
- * else in the head of the wait list up.
- */
- if (sem->count == 0)
- break;
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- set_current_state(state);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- schedule();
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
- }
- /* got the lock */
- sem->count = -1;
- list_del(&waiter.list);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-
-out_nolock:
- list_del(&waiter.list);
- if (!list_empty(&sem->wait_list) && sem->count >= 0)
- __rwsem_do_wake(sem, 0);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return -EINTR;
-}
-
-void __sched __down_write(struct rw_semaphore *sem)
-{
- __down_write_common(sem, TASK_UNINTERRUPTIBLE);
-}
-
-int __sched __down_write_killable(struct rw_semaphore *sem)
-{
- return __down_write_common(sem, TASK_KILLABLE);
-}
-
-/*
- * trylock for writing -- returns 1 if successful, 0 if contention
- */
-int __down_write_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count == 0) {
- /* got the lock */
- sem->count = -1;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * release a read lock on the semaphore
- */
-void __up_read(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (--sem->count == 0 && !list_empty(&sem->wait_list))
- sem = __rwsem_wake_one_writer(sem);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * release a write lock on the semaphore
- */
-void __up_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->count = 0;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 1);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * downgrade a write lock into a read lock
- * - just wake up any readers at the front of the queue
- */
-void __downgrade_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->count = 1;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 0);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index fbe96341beee..6b3ee9948bf1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
* will notice the queued writer.
*/
wake_q_add(wake_q, waiter->task);
+ lockevent_inc(rwsem_wake_writer);
}
return;
@@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
goto try_reader_grant;
}
/*
- * It is not really necessary to set it to reader-owned here,
- * but it gives the spinners an early indication that the
- * readers now have the lock.
+ * Set it to reader-owned to give spinners an early
+ * indication that readers now have the lock.
*/
__rwsem_set_reader_owned(sem, waiter->task);
}
@@ -215,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
}
adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+ lockevent_cond_inc(rwsem_wake_reader, woken);
if (list_empty(&sem->wait_list)) {
/* hit end of list above */
adjustment -= RWSEM_WAITING_BIAS;
@@ -225,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
}
/*
- * Wait for the read lock to be granted
- */
-static inline struct rw_semaphore __sched *
-__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
-{
- long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
- struct rwsem_waiter waiter;
- DEFINE_WAKE_Q(wake_q);
-
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_READ;
-
- raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list)) {
- /*
- * In case the wait queue is empty and the lock isn't owned
- * by a writer, this reader can exit the slowpath and return
- * immediately as its RWSEM_ACTIVE_READ_BIAS has already
- * been set in the count.
- */
- if (atomic_long_read(&sem->count) >= 0) {
- raw_spin_unlock_irq(&sem->wait_lock);
- return sem;
- }
- adjustment += RWSEM_WAITING_BIAS;
- }
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- count = atomic_long_add_return(adjustment, &sem->count);
-
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (count == RWSEM_WAITING_BIAS ||
- (count > RWSEM_WAITING_BIAS &&
- adjustment != -RWSEM_ACTIVE_READ_BIAS))
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
-
- /* wait to be given the lock */
- while (true) {
- set_current_state(state);
- if (!waiter.task)
- break;
- if (signal_pending_state(state, current)) {
- raw_spin_lock_irq(&sem->wait_lock);
- if (waiter.task)
- goto out_nolock;
- raw_spin_unlock_irq(&sem->wait_lock);
- break;
- }
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- return sem;
-out_nolock:
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list))
- atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
- raw_spin_unlock_irq(&sem->wait_lock);
- __set_current_state(TASK_RUNNING);
- return ERR_PTR(-EINTR);
-}
-
-__visible struct rw_semaphore * __sched
-rwsem_down_read_failed(struct rw_semaphore *sem)
-{
- return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(rwsem_down_read_failed);
-
-__visible struct rw_semaphore * __sched
-rwsem_down_read_failed_killable(struct rw_semaphore *sem)
-{
- return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(rwsem_down_read_failed_killable);
-
-/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
@@ -346,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = atomic_long_read(&sem->count);
-
- while (true) {
- if (!(count == 0 || count == RWSEM_WAITING_BIAS))
- return false;
+ long count = atomic_long_read(&sem->count);
- old = atomic_long_cmpxchg_acquire(&sem->count, count,
- count + RWSEM_ACTIVE_WRITE_BIAS);
- if (old == count) {
+ while (!count || count == RWSEM_WAITING_BIAS) {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
+ count + RWSEM_ACTIVE_WRITE_BIAS)) {
rwsem_set_owner(sem);
+ lockevent_inc(rwsem_opt_wlock);
return true;
}
-
- count = old;
}
+ return false;
}
static inline bool owner_on_cpu(struct task_struct *owner)
@@ -481,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
osq_unlock(&sem->osq);
done:
preempt_enable();
+ lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
@@ -505,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
#endif
/*
+ * Wait for the read lock to be granted
+ */
+static inline struct rw_semaphore __sched *
+__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
+{
+ long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list)) {
+ /*
+ * In case the wait queue is empty and the lock isn't owned
+ * by a writer, this reader can exit the slowpath and return
+ * immediately as its RWSEM_ACTIVE_READ_BIAS has already
+ * been set in the count.
+ */
+ if (atomic_long_read(&sem->count) >= 0) {
+ raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_fast);
+ return sem;
+ }
+ adjustment += RWSEM_WAITING_BIAS;
+ }
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ count = atomic_long_add_return(adjustment, &sem->count);
+
+ /*
+ * If there are no active locks, wake the front queued process(es).
+ *
+ * If there are no writers and we are first in the queue,
+ * wake our own waiter to join the existing active readers !
+ */
+ if (count == RWSEM_WAITING_BIAS ||
+ (count > RWSEM_WAITING_BIAS &&
+ adjustment != -RWSEM_ACTIVE_READ_BIAS))
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+
+ /* wait to be given the lock */
+ while (true) {
+ set_current_state(state);
+ if (!waiter.task)
+ break;
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ break;
+ }
+ schedule();
+ lockevent_inc(rwsem_sleep_reader);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock);
+ return sem;
+out_nolock:
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock_fail);
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(rwsem_down_read_failed);
+
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_read_failed_killable);
+
+/*
* Wait until we successfully acquire the write lock
*/
static inline struct rw_semaphore *
@@ -580,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
goto out_nolock;
schedule();
+ lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
} while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
@@ -588,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
__set_current_state(TASK_RUNNING);
list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
+ lockevent_inc(rwsem_wlock);
return ret;
@@ -601,6 +606,7 @@ out_nolock:
__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
+ lockevent_inc(rwsem_wlock_fail);
return ERR_PTR(-EINTR);
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e586f0d03ad3..ccbf18f560ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
- rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read);
@@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)
return -EINTR;
}
- rwsem_set_reader_owned(sem);
return 0;
}
@@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem)
{
int ret = __down_read_trylock(sem);
- if (ret == 1) {
+ if (ret == 1)
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
- rwsem_set_reader_owned(sem);
- }
return ret;
}
@@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem)
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write);
@@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem)
return -EINTR;
}
- rwsem_set_owner(sem);
return 0;
}
@@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);
- if (ret == 1) {
+ if (ret == 1)
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
- rwsem_set_owner(sem);
- }
return ret;
}
@@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock);
void up_read(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
- DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
- rwsem_clear_reader_owned(sem);
__up_read(sem);
}
@@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read);
void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
- DEBUG_RWSEMS_WARN_ON(sem->owner != current);
- rwsem_clear_owner(sem);
__up_write(sem);
}
@@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write);
void downgrade_write(struct rw_semaphore *sem)
{
lock_downgrade(&sem->dep_map, _RET_IP_);
- DEBUG_RWSEMS_WARN_ON(sem->owner != current);
- rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
@@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
- rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read_nested);
@@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
EXPORT_SYMBOL(_down_write_nest_lock);
@@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write_nested);
@@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
return -EINTR;
}
- rwsem_set_owner(sem);
return 0;
}
@@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested);
void up_read_non_owner(struct rw_semaphore *sem)
{
- DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
+ DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
+ sem);
__up_read(sem);
}
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index bad2bca0268b..64877f5294e3 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -23,15 +23,44 @@
* is involved. Ideally we would like to track all the readers that own
* a rwsem, but the overhead is simply too big.
*/
+#include "lock_events.h"
+
#define RWSEM_READER_OWNED (1UL << 0)
#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
#ifdef CONFIG_DEBUG_RWSEMS
-# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
+# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
+ if (!debug_locks_silent && \
+ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+ #c, atomic_long_read(&(sem)->count), \
+ (long)((sem)->owner), (long)current, \
+ list_empty(&(sem)->wait_list) ? "" : "not ")) \
+ debug_locks_off(); \
+ } while (0)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c, sem)
+#endif
+
+/*
+ * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
+ * Adapted largely from include/asm-i386/rwsem.h
+ * by Paul Mackerras <paulus@samba.org>.
+ */
+
+/*
+ * the semaphore definition
+ */
+#ifdef CONFIG_64BIT
+# define RWSEM_ACTIVE_MASK 0xffffffffL
#else
-# define DEBUG_RWSEMS_WARN_ON(c)
+# define RWSEM_ACTIVE_MASK 0x0000ffffL
#endif
+#define RWSEM_ACTIVE_BIAS 0x00000001L
+#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
+#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
* All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
}
#endif
+
+extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+
+/*
+ * lock for reading
+ *
+ * __down_read() and __down_read_killable() share the same fast path: the
+ * count is incremented atomically with ACQUIRE ordering and, when the new
+ * value is positive, the rwsem is marked reader-owned through
+ * rwsem_set_reader_owned().
+ *
+ * A result <= 0 diverts to the out-of-line rwsem_down_read_failed*()
+ * helper instead. On that path this code does not set the owner itself;
+ * the CONFIG_DEBUG_RWSEMS check only verifies that the owner field
+ * already carries RWSEM_READER_OWNED once the helper has returned.
+ *
+ * __down_read_killable() returns -EINTR when its helper hands back an
+ * error pointer (and skips the debug check), and 0 otherwise.
+ */
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
+ rwsem_down_read_failed(sem);
+ DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
+ RWSEM_READER_OWNED), sem);
+ } else {
+ rwsem_set_reader_owned(sem);
+ }
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
+ if (IS_ERR(rwsem_down_read_failed_killable(sem)))
+ return -EINTR;
+ DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
+ RWSEM_READER_OWNED), sem);
+ } else {
+ rwsem_set_reader_owned(sem);
+ }
+ return 0;
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ /*
+ * Optimize for the case when the rwsem is not locked at all.
+ *
+ * tmp starts out as RWSEM_UNLOCKED_VALUE, so the first attempt can
+ * only succeed on an untouched rwsem. Each attempt tries to replace
+ * the expected count with expected + RWSEM_ACTIVE_READ_BIAS. A
+ * failed atomic_long_try_cmpxchg_acquire() refreshes tmp with the
+ * count it actually observed, and the loop retries for as long as
+ * that value is non-negative.
+ *
+ * Returns 1 with the rwsem marked reader-owned on success, or 0
+ * once a negative count has been observed. lockevent_inc() runs
+ * once per call, whatever the outcome.
+ */
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ lockevent_inc(rwsem_rtrylock);
+ do {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ tmp + RWSEM_ACTIVE_READ_BIAS)) {
+ rwsem_set_reader_owned(sem);
+ return 1;
+ }
+ } while (tmp >= 0);
+ return 0;
+}
+
+/*
+ * lock for writing
+ *
+ * __down_write() and __down_write_killable() add RWSEM_ACTIVE_WRITE_BIAS
+ * to the count with ACQUIRE ordering. If the result is anything other
+ * than RWSEM_ACTIVE_WRITE_BIAS (i.e. the count was not zero beforehand)
+ * the out-of-line rwsem_down_write_failed*() helper is called. Once the
+ * lock is held, on either path, the caller is recorded as owner through
+ * rwsem_set_owner().
+ *
+ * __down_write_killable() returns -EINTR, without setting the owner,
+ * when its helper hands back an error pointer; it returns 0 otherwise.
+ *
+ * __down_write_trylock() makes a single cmpxchg attempt from
+ * RWSEM_UNLOCKED_VALUE to RWSEM_ACTIVE_WRITE_BIAS and never retries. It
+ * returns true (1) with the owner set on success and false (0)
+ * otherwise; lockevent_inc() runs on every call.
+ */
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
+ &sem->count);
+ if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
+ rwsem_down_write_failed(sem);
+ rwsem_set_owner(sem);
+}
+
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
+ &sem->count);
+ if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
+ if (IS_ERR(rwsem_down_write_failed_killable(sem)))
+ return -EINTR;
+ rwsem_set_owner(sem);
+ return 0;
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ lockevent_inc(rwsem_wtrylock);
+ tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
+ RWSEM_ACTIVE_WRITE_BIAS);
+ if (tmp == RWSEM_UNLOCKED_VALUE) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * unlock after reading
+ *
+ * With CONFIG_DEBUG_RWSEMS, first warns if the owner field does not carry
+ * RWSEM_READER_OWNED. The reader-owned marking is then cleared via
+ * rwsem_clear_reader_owned() before the count is decremented with
+ * RELEASE ordering.
+ *
+ * rwsem_wake() is called only when the new count is below -1 and its
+ * RWSEM_ACTIVE_MASK bits are all zero, i.e. the active part of the count
+ * has dropped to zero while the count as a whole is still negative.
+ */
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
+ sem);
+ rwsem_clear_reader_owned(sem);
+ tmp = atomic_long_dec_return_release(&sem->count);
+ if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
+ rwsem_wake(sem);
+}
+
+/*
+ * unlock after writing
+ *
+ * With CONFIG_DEBUG_RWSEMS, first warns if the recorded owner is not the
+ * current task. The owner is cleared via rwsem_clear_owner() before
+ * RWSEM_ACTIVE_WRITE_BIAS is subtracted from the count with RELEASE
+ * ordering; rwsem_wake() is called when the resulting count is negative.
+ */
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
+ rwsem_clear_owner(sem);
+ if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
+ &sem->count) < 0))
+ rwsem_wake(sem);
+}
+
+/*
+ * downgrade write lock to read lock
+ *
+ * With CONFIG_DEBUG_RWSEMS, first warns if the recorded owner is not the
+ * current task. The count is then adjusted by -RWSEM_WAITING_BIAS; since
+ * RWSEM_ACTIVE_WRITE_BIAS is RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS, this
+ * turns the caller's write bias into a single RWSEM_ACTIVE_BIAS, which is
+ * the read bias. Only after that update is the rwsem marked reader-owned,
+ * and rwsem_downgrade_wake() is called when the resulting count is
+ * negative.
+ */
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ /*
+ * When downgrading from exclusive to shared ownership,
+ * anything inside the write-locked region cannot leak
+ * into the read side. In contrast, anything in the
+ * read-locked region is ok to be re-ordered into the
+ * write side. As such, rely on RELEASE semantics.
+ */
+ DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
+ tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
+ rwsem_set_reader_owned(sem);
+ if (tmp < 0)
+ rwsem_downgrade_wake(sem);
+}
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 936f3d14dd6b..0ff08380f531 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -22,6 +22,13 @@
#include <linux/debug_locks.h>
#include <linux/export.h>
+#ifdef CONFIG_MMIOWB
+#ifndef arch_mmiowb_state
+DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state);
+EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
+#endif
+#endif
+
/*
* If lockdep is enabled then we use the non-preemption spin-ops
* even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 9aa0fccd5d43..399669f7eba8 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock)
{
debug_spin_lock_before(lock);
arch_spin_lock(&lock->raw_lock);
+ mmiowb_spin_lock();
debug_spin_lock_after(lock);
}
@@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
{
int ret = arch_spin_trylock(&lock->raw_lock);
- if (ret)
+ if (ret) {
+ mmiowb_spin_lock();
debug_spin_lock_after(lock);
+ }
#ifndef CONFIG_SMP
/*
* Must not happen on UP:
@@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
void do_raw_spin_unlock(raw_spinlock_t *lock)
{
+ mmiowb_spin_unlock();
debug_spin_unlock(lock);
arch_spin_unlock(&lock->raw_lock);
}
diff --git a/kernel/module.c b/kernel/module.c
index 0b9aa8ab89f0..a9020bdd4cf6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
+/* Work queue for freeing init sections in success case */
+static struct work_struct init_free_wq;
+static struct llist_head init_free_list;
+
#ifdef CONFIG_MODULES_TREE_LOOKUP
/*
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
if (!rodata_enabled)
return;
+ set_vm_flush_reset_perms(mod->core_layout.base);
+ set_vm_flush_reset_perms(mod->init_layout.base);
frob_text(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->core_layout, set_memory_x);
+
frob_rodata(&mod->core_layout, set_memory_ro);
+
frob_text(&mod->init_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_x);
+
frob_rodata(&mod->init_layout, set_memory_ro);
if (after_init)
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}
-static void module_disable_nx(const struct module *mod)
-{
- frob_rodata(&mod->core_layout, set_memory_x);
- frob_ro_after_init(&mod->core_layout, set_memory_x);
- frob_writable_data(&mod->core_layout, set_memory_x);
- frob_rodata(&mod->init_layout, set_memory_x);
- frob_writable_data(&mod->init_layout, set_memory_x);
-}
-
/* Iterate through all modules and set each module's text as RW */
void set_all_modules_text_rw(void)
{
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
}
mutex_unlock(&module_mutex);
}
-
-static void disable_ro_nx(const struct module_layout *layout)
-{
- if (rodata_enabled) {
- frob_text(layout, set_memory_rw);
- frob_rodata(layout, set_memory_rw);
- frob_ro_after_init(layout, set_memory_rw);
- }
- frob_rodata(layout, set_memory_x);
- frob_ro_after_init(layout, set_memory_x);
- frob_writable_data(layout, set_memory_x);
-}
-
#else
-static void disable_ro_nx(const struct module_layout *layout) { }
static void module_enable_nx(const struct module *mod) { }
-static void module_disable_nx(const struct module *mod) { }
#endif
#ifdef CONFIG_LIVEPATCH
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
void __weak module_memfree(void *module_region)
{
+ /*
+ * This memory may be RO, and freeing RO memory in an interrupt is not
+ * supported by vmalloc.
+ */
+ WARN_ON(in_interrupt());
vfree(module_region);
}
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
mutex_unlock(&module_mutex);
/* This may be empty, but that's OK */
- disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
kfree(mod->args);
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */
- disable_ro_nx(&mod->core_layout);
module_memfree(mod->core_layout.base);
}
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
/* For freeing module_init on success, in case kallsyms traversing */
struct mod_initfree {
- struct rcu_head rcu;
+ struct llist_node node;
void *module_init;
};
-static void do_free_init(struct rcu_head *head)
+static void do_free_init(struct work_struct *w)
{
- struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
- module_memfree(m->module_init);
- kfree(m);
+ struct llist_node *pos, *n, *list;
+ struct mod_initfree *initfree;
+
+ list = llist_del_all(&init_free_list);
+
+ synchronize_rcu();
+
+ llist_for_each_safe(pos, n, list) {
+ initfree = container_of(pos, struct mod_initfree, node);
+ module_memfree(initfree->module_init);
+ kfree(initfree);
+ }
}
+static int __init modules_wq_init(void)
+{
+ INIT_WORK(&init_free_wq, do_free_init);
+ init_llist_head(&init_free_list);
+ return 0;
+}
+module_init(modules_wq_init);
+
/*
* This is where the real work happens.
*
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
#endif
module_enable_ro(mod, true);
mod_tree_remove_init(mod);
- disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
mod->init_layout.base = NULL;
mod->init_layout.size = 0;
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
* call synchronize_rcu(), but we don't want to slow down the success
- * path, so use actual RCU here.
+ * path. module_memfree() cannot be called in an interrupt, so do the
+ * work and call synchronize_rcu() in a work queue.
+ *
* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
* be cleaned up needs to sync with the queued work - ie flush
* init_free_wq; rcu_barrier() alone no longer waits for do_free_init()
*/
- call_rcu(&freeinit->rcu, do_free_init);
+ if (llist_add(&freeinit->node, &init_free_list))
+ schedule_work(&init_free_wq);
+
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
- /* we can't deallocate the module until we clear memory protection */
- module_disable_ro(mod);
- module_disable_nx(mod);
-
ddebug_cleanup:
ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug);
diff --git a/kernel/panic.c b/kernel/panic.c
index 0ae0d7332f12..c1fcaad337b7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,12 +318,7 @@ void panic(const char *fmt, ...)
}
#endif
#if defined(CONFIG_S390)
- {
- unsigned long caller;
-
- caller = (unsigned long)__builtin_return_address(0);
- disabled_wait(caller);
- }
+ disabled_wait();
#endif
pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
local_irq_enable();
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index f8fe57d1022e..9bbaaab14b36 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -114,6 +114,15 @@ config PM_SLEEP_SMP
depends on PM_SLEEP
select HOTPLUG_CPU
+config PM_SLEEP_SMP_NONZERO_CPU
+ def_bool y
+ depends on PM_SLEEP_SMP
+ depends on ARCH_SUSPEND_NONZERO_CPU
+ ---help---
+ If an arch can suspend (for suspend, hibernate, kexec, etc) on a
+ non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
+ will allow nohz_full mask to include CPU0.
+
config PM_AUTOSLEEP
bool "Opportunistic sleep"
depends on PM_SLEEP
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index cc105ecd9c07..c8c272df7154 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -280,7 +280,7 @@ static int create_image(int platform_mode)
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
- error = disable_nonboot_cpus();
+ error = suspend_disable_secondary_cpus();
if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;
@@ -322,7 +322,7 @@ static int create_image(int platform_mode)
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
Platform_finish:
platform_finish(platform_mode);
@@ -416,7 +416,7 @@ int hibernation_snapshot(int platform_mode)
int __weak hibernate_resume_nonboot_cpu_disable(void)
{
- return disable_nonboot_cpus();
+ return suspend_disable_secondary_cpus();
}
/**
@@ -485,7 +485,7 @@ static int resume_target_kernel(bool platform_mode)
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
@@ -563,7 +563,7 @@ int hibernation_platform_enter(void)
if (error)
goto Platform_finish;
- error = disable_nonboot_cpus();
+ error = suspend_disable_secondary_cpus();
if (error)
goto Enable_cpus;
@@ -585,7 +585,7 @@ int hibernation_platform_enter(void)
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
Platform_finish:
hibernation_ops->finish();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f08a1e4ee1d4..bc9558ab1e5b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
* safe_copy_page - Copy a page in a safe way.
*
* Check if the page we are going to copy is marked as present in the kernel
- * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
- * and in that case kernel_page_present() always returns 'true').
+ * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
+ * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
+ * always returns 'true'.
*/
static void safe_copy_page(void *dst, struct page *s_page)
{
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index e39059dea38b..ef908c134b34 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -427,7 +427,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
- error = disable_nonboot_cpus();
+ error = suspend_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -457,7 +457,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
BUG_ON(irqs_disabled());
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
Platform_wake:
platform_resume_noirq(state);
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index acee72c0b24b..4b58c907b4b7 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -233,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_suppress;
+extern int rcu_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
#define rcu_ftrace_dump_stall_suppress() \
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index c29761152874..7a6890b23c5f 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -494,6 +494,10 @@ rcu_perf_cleanup(void)
if (torture_cleanup_begin())
return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
@@ -614,6 +618,7 @@ rcu_perf_init(void)
pr_cont("\n");
WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST));
firsterr = -EINVAL;
+ cur_ops = NULL;
goto unwind;
}
if (cur_ops->init)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f14d1b18a74f..efaa5b3f4d3f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -299,7 +299,6 @@ struct rcu_torture_ops {
int irq_capable;
int can_boost;
int extendables;
- int ext_irq_conflict;
const char *name;
};
@@ -592,12 +591,7 @@ static void srcu_torture_init(void)
static void srcu_torture_cleanup(void)
{
- static DEFINE_TORTURE_RANDOM(rand);
-
- if (torture_random(&rand) & 0x800)
- cleanup_srcu_struct(&srcu_ctld);
- else
- cleanup_srcu_struct_quiesced(&srcu_ctld);
+ cleanup_srcu_struct(&srcu_ctld);
srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
}
@@ -1160,7 +1154,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
unsigned long randmask2 = randmask1 >> 3;
WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
- /* Most of the time lots of bits, half the time only one bit. */
+ /* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
@@ -1170,10 +1164,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
(!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- !(mask & cur_ops->ext_irq_conflict) &&
- (oldmask & cur_ops->ext_irq_conflict))
- mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */
return mask ?: RCUTORTURE_RDR_RCU;
}
@@ -1848,7 +1838,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
rcu_torture_fwd_cb_hist();
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2));
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
pr_info("%s: Freed %lu RCU callbacks.\n",
@@ -2094,6 +2084,10 @@ rcu_torture_cleanup(void)
cur_ops->cb_barrier();
return;
}
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
@@ -2267,6 +2261,7 @@ rcu_torture_init(void)
pr_cont("\n");
WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
firsterr = -EINVAL;
+ cur_ops = NULL;
goto unwind;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 5d4a39a6505a..44d6606b8325 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* Must invoke this after you are finished using a given srcu_struct that
* was initialized via init_srcu_struct(), else you leak memory.
*/
-void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
+void cleanup_srcu_struct(struct srcu_struct *ssp)
{
WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
- if (quiesced)
- WARN_ON(work_pending(&ssp->srcu_work));
- else
- flush_work(&ssp->srcu_work);
+ flush_work(&ssp->srcu_work);
WARN_ON(ssp->srcu_gp_running);
WARN_ON(ssp->srcu_gp_waiting);
WARN_ON(ssp->srcu_cb_head);
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
}
-EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
* Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a60b8ba9e1ac..9b761e546de8 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
return SRCU_INTERVAL;
}
-/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
-void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @ssp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
@@ -369,24 +375,14 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- if (quiesced) {
- if (WARN_ON(delayed_work_pending(&ssp->work)))
- return; /* Just leak it! */
- } else {
- flush_delayed_work(&ssp->work);
- }
+ flush_delayed_work(&ssp->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- if (quiesced) {
- if (WARN_ON(timer_pending(&sdp->delay_work)))
- return; /* Just leak it! */
- if (WARN_ON(work_pending(&sdp->work)))
- return; /* Just leak it! */
- } else {
- del_timer_sync(&sdp->delay_work);
- flush_work(&sdp->work);
- }
+ del_timer_sync(&sdp->delay_work);
+ flush_work(&sdp->work);
+ if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
+ return; /* Forgot srcu_barrier(), so just leak it! */
}
if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
WARN_ON(srcu_readers_active(ssp))) {
@@ -397,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
free_percpu(ssp->sda);
ssp->sda = NULL;
}
-EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
* Counts the new reader in the appropriate per-CPU element of the
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 911bd9076d43..477b4eb44af5 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -52,7 +52,7 @@ void rcu_qs(void)
local_irq_save(flags);
if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
- raise_softirq(RCU_SOFTIRQ);
+ raise_softirq_irqoff(RCU_SOFTIRQ);
}
local_irq_restore(flags);
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index acd6ccf56faf..ec77ec336f58 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -102,11 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
-/* panic() on RCU Stall sysctl. */
-int sysctl_panic_on_rcu_stall __read_mostly;
-/* Commandeer a sysrq key to dump RCU's tree. */
-static bool sysrq_rcu;
-module_param(sysrq_rcu, bool, 0444);
/*
* The rcu_scheduler_active variable is initialized to the value
@@ -149,7 +144,7 @@ static void sync_sched_exp_online_cleanup(int cpu);
/* rcuc/rcub kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
-module_param(kthread_prio, int, 0644);
+module_param(kthread_prio, int, 0444);
/* Delay in jiffies for grace-period initialization delays, debug only. */
@@ -406,7 +401,7 @@ static bool rcu_kick_kthreads;
*/
static ulong jiffies_till_sched_qs = ULONG_MAX;
module_param(jiffies_till_sched_qs, ulong, 0444);
-static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */
+static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
/*
@@ -424,6 +419,7 @@ static void adjust_jiffies_till_sched_qs(void)
WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
return;
}
+ /* Otherwise, set to third fqs scan, but bound below on large system. */
j = READ_ONCE(jiffies_till_first_fqs) +
2 * READ_ONCE(jiffies_till_next_fqs);
if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
@@ -513,74 +509,6 @@ static const char *gp_state_getname(short gs)
}
/*
- * Show the state of the grace-period kthreads.
- */
-void show_rcu_gp_kthreads(void)
-{
- int cpu;
- unsigned long j;
- unsigned long ja;
- unsigned long jr;
- unsigned long jw;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- j = jiffies;
- ja = j - READ_ONCE(rcu_state.gp_activity);
- jr = j - READ_ONCE(rcu_state.gp_req_activity);
- jw = j - READ_ONCE(rcu_state.gp_wake_time);
- pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
- rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state,
- rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
- ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
- (long)READ_ONCE(rcu_state.gp_seq),
- (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
- READ_ONCE(rcu_state.gp_flags));
- rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
- continue;
- pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
- rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
- (long)rnp->gp_seq_needed);
- if (!rcu_is_leaf_node(rnp))
- continue;
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->gpwrap ||
- ULONG_CMP_GE(rcu_state.gp_seq,
- rdp->gp_seq_needed))
- continue;
- pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)rdp->gp_seq_needed);
- }
- }
- /* sched_show_task(rcu_state.gp_kthread); */
-}
-EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
-
-/* Dump grace-period-request information due to commandeered sysrq. */
-static void sysrq_show_rcu(int key)
-{
- show_rcu_gp_kthreads();
-}
-
-static struct sysrq_key_op sysrq_rcudump_op = {
- .handler = sysrq_show_rcu,
- .help_msg = "show-rcu(y)",
- .action_msg = "Show RCU tree",
- .enable_mask = SYSRQ_ENABLE_DUMP,
-};
-
-static int __init rcu_sysrq_init(void)
-{
- if (sysrq_rcu)
- return register_sysrq_key('y', &sysrq_rcudump_op);
- return 0;
-}
-early_initcall(rcu_sysrq_init);
-
-/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
@@ -1034,27 +962,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
}
/*
- * Handler for the irq_work request posted when a grace period has
- * gone on for too long, but not yet long enough for an RCU CPU
- * stall warning. Set state appropriately, but just complain if
- * there is unexpected state on entry.
- */
-static void rcu_iw_handler(struct irq_work *iwp)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- rdp = container_of(iwp, struct rcu_data, rcu_iw);
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp);
- if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
- rdp->rcu_iw_gp_seq = rnp->gp_seq;
- rdp->rcu_iw_pending = false;
- }
- raw_spin_unlock_rcu_node(rnp);
-}
-
-/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1167,295 +1074,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
return 0;
}
-static void record_gp_stall_check_time(void)
-{
- unsigned long j = jiffies;
- unsigned long j1;
-
- rcu_state.gp_start = j;
- j1 = rcu_jiffies_till_stall_check();
- /* Record ->gp_start before ->jiffies_stall. */
- smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
- rcu_state.jiffies_resched = j + j1 / 2;
- rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
-}
-
-/*
- * Complain about starvation of grace-period kthread.
- */
-static void rcu_check_gp_kthread_starvation(void)
-{
- struct task_struct *gpk = rcu_state.gp_kthread;
- unsigned long j;
-
- j = jiffies - READ_ONCE(rcu_state.gp_activity);
- if (j > 2 * HZ) {
- pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
- rcu_state.name, j,
- (long)rcu_seq_current(&rcu_state.gp_seq),
- READ_ONCE(rcu_state.gp_flags),
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
- if (gpk) {
- pr_err("RCU grace-period kthread stack dump:\n");
- sched_show_task(gpk);
- wake_up_process(gpk);
- }
- }
-}
-
-/*
- * Dump stacks of all tasks running on stalled CPUs. First try using
- * NMIs, but fall back to manual remote stack tracing on architectures
- * that don't support NMI-based stack dumps. The NMI-triggered stack
- * traces are more accurate because they are printed by the target CPU.
- */
-static void rcu_dump_cpu_stacks(void)
-{
- int cpu;
- unsigned long flags;
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
-}
-
-/*
- * If too much time has passed in the current grace period, and if
- * so configured, go kick the relevant kthreads.
- */
-static void rcu_stall_kick_kthreads(void)
-{
- unsigned long j;
-
- if (!rcu_kick_kthreads)
- return;
- j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
- if (time_after(jiffies, j) && rcu_state.gp_kthread &&
- (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
- WARN_ONCE(1, "Kicking %s grace-period kthread\n",
- rcu_state.name);
- rcu_ftrace_dump(DUMP_ALL);
- wake_up_process(rcu_state.gp_kthread);
- WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
- }
-}
-
-static void panic_on_rcu_stall(void)
-{
- if (sysctl_panic_on_rcu_stall)
- panic("RCU Stall\n");
-}
-
-static void print_other_cpu_stall(unsigned long gp_seq)
-{
- int cpu;
- unsigned long flags;
- unsigned long gpa;
- unsigned long j;
- int ndetected = 0;
- struct rcu_node *rnp = rcu_get_root();
- long totqlen = 0;
-
- /* Kick and suppress, if so configured. */
- rcu_stall_kick_kthreads();
- if (rcu_cpu_stall_suppress)
- return;
-
- /*
- * OK, time to rat on our buddy...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name);
- print_cpu_stall_info_begin();
- rcu_for_each_leaf_node(rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ndetected += rcu_print_task_stall(rnp);
- if (rnp->qsmask != 0) {
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
- print_cpu_stall_info(cpu);
- ndetected++;
- }
- }
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
-
- print_cpu_stall_info_end();
- for_each_possible_cpu(cpu)
- totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
- smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
- if (ndetected) {
- rcu_dump_cpu_stacks();
-
- /* Complain about tasks blocking the grace period. */
- rcu_print_detail_task_stall();
- } else {
- if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
- pr_err("INFO: Stall ended before state dump start\n");
- } else {
- j = jiffies;
- gpa = READ_ONCE(rcu_state.gp_activity);
- pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
- rcu_state.name, j - gpa, j, gpa,
- READ_ONCE(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
- /* In this case, the current CPU might be at fault. */
- sched_show_task(current);
- }
- }
- /* Rewrite if needed in case of slow consoles. */
- if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
- WRITE_ONCE(rcu_state.jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
-
- rcu_check_gp_kthread_starvation();
-
- panic_on_rcu_stall();
-
- rcu_force_quiescent_state(); /* Kick them all. */
-}
-
-static void print_cpu_stall(void)
-{
- int cpu;
- unsigned long flags;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp = rcu_get_root();
- long totqlen = 0;
-
- /* Kick and suppress, if so configured. */
- rcu_stall_kick_kthreads();
- if (rcu_cpu_stall_suppress)
- return;
-
- /*
- * OK, time to rat on ourselves...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- pr_err("INFO: %s self-detected stall on CPU", rcu_state.name);
- print_cpu_stall_info_begin();
- raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
- print_cpu_stall_info(smp_processor_id());
- raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
- print_cpu_stall_info_end();
- for_each_possible_cpu(cpu)
- totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n",
- jiffies - rcu_state.gp_start,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
-
- rcu_check_gp_kthread_starvation();
-
- rcu_dump_cpu_stacks();
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- /* Rewrite if needed in case of slow consoles. */
- if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
- WRITE_ONCE(rcu_state.jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- panic_on_rcu_stall();
-
- /*
- * Attempt to revive the RCU machinery by forcing a context switch.
- *
- * A context switch would normally allow the RCU state machine to make
- * progress and it could be we're stuck in kernel space without context
- * switches for an entirely unreasonable amount of time.
- */
- set_tsk_need_resched(current);
- set_preempt_need_resched();
-}
-
-static void check_cpu_stall(struct rcu_data *rdp)
-{
- unsigned long gs1;
- unsigned long gs2;
- unsigned long gps;
- unsigned long j;
- unsigned long jn;
- unsigned long js;
- struct rcu_node *rnp;
-
- if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
- !rcu_gp_in_progress())
- return;
- rcu_stall_kick_kthreads();
- j = jiffies;
-
- /*
- * Lots of memory barriers to reject false positives.
- *
- * The idea is to pick up rcu_state.gp_seq, then
- * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
- * another copy of rcu_state.gp_seq. These values are updated in
- * the opposite order with memory barriers (or equivalent) during
- * grace-period initialization and cleanup. Now, a false positive
- * can occur if we get an new value of rcu_state.gp_start and a old
- * value of rcu_state.jiffies_stall. But given the memory barriers,
- * the only way that this can happen is if one grace period ends
- * and another starts between these two fetches. This is detected
- * by comparing the second fetch of rcu_state.gp_seq with the
- * previous fetch from rcu_state.gp_seq.
- *
- * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
- * and rcu_state.gp_start suffice to forestall false positives.
- */
- gs1 = READ_ONCE(rcu_state.gp_seq);
- smp_rmb(); /* Pick up ->gp_seq first... */
- js = READ_ONCE(rcu_state.jiffies_stall);
- smp_rmb(); /* ...then ->jiffies_stall before the rest... */
- gps = READ_ONCE(rcu_state.gp_start);
- smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
- gs2 = READ_ONCE(rcu_state.gp_seq);
- if (gs1 != gs2 ||
- ULONG_CMP_LT(j, js) ||
- ULONG_CMP_GE(gps, js))
- return; /* No stall or GP completed since entering function. */
- rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
- if (rcu_gp_in_progress() &&
- (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall();
-
- } else if (rcu_gp_in_progress() &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
- /* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(gs2);
- }
-}
-
-/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
- *
- * The caller must disable hard irqs.
- */
-void rcu_cpu_stall_reset(void)
-{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
-}
-
/* Trace-event wrapper function for trace_rcu_future_grace_period. */
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long gp_seq_req, const char *s)
@@ -1585,7 +1203,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
static void rcu_gp_kthread_wake(void)
{
if ((current == rcu_state.gp_kthread &&
- !in_interrupt() && !in_serving_softirq()) ||
+ !in_irq() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) ||
!rcu_state.gp_kthread)
return;
@@ -2295,11 +1913,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
+ rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
- rdp->core_needs_qs = false;
-
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
@@ -2548,11 +2165,11 @@ void rcu_sched_clock_irq(int user)
}
/*
- * Scan the leaf rcu_node structures, processing dyntick state for any that
- * have not yet encountered a quiescent state, using the function specified.
- * Also initiate boosting for any threads blocked on the root rcu_node.
- *
- * The caller must have suppressed start of new grace periods.
+ * Scan the leaf rcu_node structures. For each structure on which all
+ * CPUs have reported a quiescent state and on which there are tasks
+ * blocking the current grace period, initiate RCU priority boosting.
+ * Otherwise, invoke the specified function to check dyntick state for
+ * each CPU that has not yet reported a quiescent state.
*/
static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
{
@@ -2635,101 +2252,6 @@ void rcu_force_quiescent_state(void)
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-/*
- * This function checks for grace-period requests that fail to motivate
- * RCU to come out of its idle mode.
- */
-void
-rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
- const unsigned long gpssdelay)
-{
- unsigned long flags;
- unsigned long j;
- struct rcu_node *rnp_root = rcu_get_root();
- static atomic_t warned = ATOMIC_INIT(0);
-
- if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
- return;
- j = jiffies; /* Expensive access, and in common case don't get here. */
- if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
- time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
- atomic_read(&warned))
- return;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- j = jiffies;
- if (rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
- time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
- time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
- atomic_read(&warned)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- /* Hold onto the leaf lock to make others see warned==1. */
-
- if (rnp_root != rnp)
- raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
- j = jiffies;
- if (rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
- time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
- time_before(j, rcu_state.gp_activity + gpssdelay) ||
- atomic_xchg(&warned, 1)) {
- raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- WARN_ON(1);
- if (rnp_root != rnp)
- raw_spin_unlock_rcu_node(rnp_root);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- show_rcu_gp_kthreads();
-}
-
-/*
- * Do a forward-progress check for rcutorture. This is normally invoked
- * due to an OOM event. The argument "j" gives the time period during
- * which rcutorture would like progress to have been made.
- */
-void rcu_fwd_progress_check(unsigned long j)
-{
- unsigned long cbs;
- int cpu;
- unsigned long max_cbs = 0;
- int max_cpu = -1;
- struct rcu_data *rdp;
-
- if (rcu_gp_in_progress()) {
- pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
- show_rcu_gp_kthreads();
- } else {
- pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
- preempt_disable();
- rdp = this_cpu_ptr(&rcu_data);
- rcu_check_gp_start_stall(rdp->mynode, rdp, j);
- preempt_enable();
- }
- for_each_possible_cpu(cpu) {
- cbs = rcu_get_n_cbs_cpu(cpu);
- if (!cbs)
- continue;
- if (max_cpu < 0)
- pr_info("%s: callbacks", __func__);
- pr_cont(" %d: %lu", cpu, cbs);
- if (cbs <= max_cbs)
- continue;
- max_cbs = cbs;
- max_cpu = cpu;
- }
- if (max_cpu >= 0)
- pr_cont("\n");
-}
-EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
-
/* Perform RCU core processing work for the current CPU. */
static __latent_entropy void rcu_core(struct softirq_action *unused)
{
@@ -3559,13 +3081,11 @@ static int rcu_pm_notify(struct notifier_block *self,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
- if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_expedite_gp();
+ rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
- if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_unexpedite_gp();
+ rcu_unexpedite_gp();
break;
default:
break;
@@ -3742,8 +3262,7 @@ static void __init rcu_init_geometry(void)
jiffies_till_first_fqs = d;
if (jiffies_till_next_fqs == ULONG_MAX)
jiffies_till_next_fqs = d;
- if (jiffies_till_sched_qs == ULONG_MAX)
- adjust_jiffies_till_sched_qs();
+ adjust_jiffies_till_sched_qs();
/* If the compile-time values are accurate, just leave. */
if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
@@ -3858,5 +3377,6 @@ void __init rcu_init(void)
srcu_init();
}
+#include "tree_stall.h"
#include "tree_exp.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bb4f995f2d3f..e253d11af3c4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
int rcu_dynticks_snap(struct rcu_data *rdp);
-/* Forward declarations for rcutree_plugin.h */
+/* Forward declarations for tree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_qs(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_print_detail_task_stall(void);
-static int rcu_print_task_stall(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
@@ -418,9 +416,6 @@ static void rcu_prepare_for_idle(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
-static void print_cpu_stall_info_begin(void);
-static void print_cpu_stall_info(int cpu);
-static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static bool rcu_nocb_cpu_needs_barrier(int cpu);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
@@ -445,3 +440,10 @@ static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
+
+/* Forward declarations for tree_stall.h */
+static void record_gp_stall_check_time(void);
+static void rcu_iw_handler(struct irq_work *iwp);
+static void check_cpu_stall(struct rcu_data *rdp);
+static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4c2a0189e748..9c990df880d1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -10,6 +10,7 @@
#include <linux/lockdep.h>
static void rcu_exp_handler(void *unused);
+static int rcu_print_task_exp_stall(struct rcu_node *rnp);
/*
* Record the start of an expedited grace period.
@@ -633,7 +634,7 @@ static void rcu_exp_handler(void *unused)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
rdp->deferred_qs = true;
- WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true);
+ t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
@@ -648,7 +649,7 @@ static void rcu_exp_handler(void *unused)
*
* If the CPU is fully enabled (or if some buggy RCU-preempt
* read-side critical section is being used from idle), just
- * invoke rcu_preempt_defer_qs() to immediately report the
+ * invoke rcu_preempt_deferred_qs() to immediately report the
* quiescent state. We cannot use rcu_read_unlock_special()
* because we are in an interrupt handler, which will cause that
* function to take an early exit without doing anything.
@@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu)
{
}
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each that is blocking the current
+ * expedited grace period.
+ */
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+ int ndetected = 0;
+
+ if (!rnp->exp_tasks)
+ return 0;
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ return ndetected;
+}
+
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Invoked on each online non-idle CPU for expedited quiescent state. */
@@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu)
WARN_ON_ONCE(ret);
}
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections that are
+ * blocking the current expedited grace period.
+ */
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/**
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 97dba50f6fb2..1102765f91fd 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -285,7 +285,7 @@ static void rcu_qs(void)
TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
- current->rcu_read_unlock_special.b.need_qs = false;
+ WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
}
}
@@ -643,100 +643,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
}
/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period on the specified rcu_node structure.
- */
-static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
-{
- unsigned long flags;
- struct task_struct *t;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (!rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- t = list_entry(rnp->gp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- /*
- * We could be printing a lot while holding a spinlock.
- * Avoid triggering hard lockup.
- */
- touch_nmi_watchdog();
- sched_show_task(t);
- }
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-}
-
-/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period.
- */
-static void rcu_print_detail_task_stall(void)
-{
- struct rcu_node *rnp = rcu_get_root();
-
- rcu_print_detail_task_stall_rnp(rnp);
- rcu_for_each_leaf_node(rnp)
- rcu_print_detail_task_stall_rnp(rnp);
-}
-
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
- pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
- rnp->level, rnp->grplo, rnp->grphi);
-}
-
-static void rcu_print_task_stall_end(void)
-{
- pr_cont("\n");
-}
-
-/*
- * Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- struct task_struct *t;
- int ndetected = 0;
-
- if (!rcu_preempt_blocked_readers_cgp(rnp))
- return 0;
- rcu_print_task_stall_begin(rnp);
- t = list_entry(rnp->gp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- pr_cont(" P%d", t->pid);
- ndetected++;
- }
- rcu_print_task_stall_end();
- return ndetected;
-}
-
-/*
- * Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each that is blocking the current
- * expedited grace period.
- */
-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
-{
- struct task_struct *t;
- int ndetected = 0;
-
- if (!rnp->exp_tasks)
- return 0;
- t = list_entry(rnp->exp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- pr_cont(" P%d", t->pid);
- ndetected++;
- }
- return ndetected;
-}
-
-/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
@@ -804,19 +710,25 @@ static void rcu_flavor_sched_clock_irq(int user)
/*
* Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
+ * critical section, clean up if so. No need to issue warnings, as
+ * debug_check_no_locks_held() already does this if lockdep is enabled.
+ * Besides, if this function does anything other than just immediately
+ * return, there was a bug of some sort. Spewing warnings from this
+ * function is likely to simply obscure important prior warnings.
*/
void exit_rcu(void)
{
struct task_struct *t = current;
- if (likely(list_empty(&current->rcu_node_entry)))
+ if (unlikely(!list_empty(&current->rcu_node_entry))) {
+ t->rcu_read_lock_nesting = 1;
+ barrier();
+ WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
+ } else if (unlikely(t->rcu_read_lock_nesting)) {
+ t->rcu_read_lock_nesting = 1;
+ } else {
return;
- t->rcu_read_lock_nesting = 1;
- barrier();
- t->rcu_read_unlock_special.b.blocked = true;
+ }
__rcu_read_unlock();
rcu_preempt_deferred_qs(current);
}
@@ -980,33 +892,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
static void rcu_preempt_deferred_qs(struct task_struct *t) { }
/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static void rcu_print_detail_task_stall(void)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- return 0;
-}
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections that are
- * blocking the current expedited grace period.
- */
-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
-{
- return 0;
-}
-
-/*
* Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
@@ -1185,8 +1070,6 @@ static int rcu_boost_kthread(void *arg)
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
- struct task_struct *t;
-
raw_lockdep_assert_held_rcu_node(rnp);
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1200,9 +1083,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
if (rnp->exp_tasks == NULL)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- t = rnp->boost_kthread_task;
- if (t)
- rcu_wake_cond(t, rnp->boost_kthread_status);
+ rcu_wake_cond(rnp->boost_kthread_task,
+ rnp->boost_kthread_status);
} else {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -1649,98 +1531,6 @@ static void rcu_cleanup_after_idle(void)
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
- rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- ".l"[rdp->all_lazy],
- ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
- ".D"[!rdp->tick_nohz_enabled_snap]);
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-/* Initiate the stall-info list. */
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont("\n");
-}
-
-/*
- * Print out diagnostic information for the specified stalled CPU.
- *
- * If the specified CPU is aware of the current RCU grace period, then
- * print the number of scheduling clock interrupts the CPU has taken
- * during the time that it has been aware. Otherwise, print the number
- * of RCU grace periods that this CPU is ignorant of, for example, "1"
- * if the CPU was aware of the previous grace period.
- *
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
- */
-static void print_cpu_stall_info(int cpu)
-{
- unsigned long delta;
- char fast_no_hz[72];
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- char *ticks_title;
- unsigned long ticks_value;
-
- /*
- * We could be printing a lot while holding a spinlock. Avoid
- * triggering hard lockup.
- */
- touch_nmi_watchdog();
-
- ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
- if (ticks_value) {
- ticks_title = "GPs behind";
- } else {
- ticks_title = "ticks this GP";
- ticks_value = rdp->ticks_this_gp;
- }
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
- cpu,
- "O."[!!cpu_online(cpu)],
- "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
- "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
- !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
- rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
- "!."[!delta],
- ticks_value, ticks_title,
- rcu_dynticks_snap(rdp) & 0xfff,
- rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
- rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
- READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
- fast_no_hz);
-}
-
-/* Terminate the stall-info list. */
-static void print_cpu_stall_info_end(void)
-{
- pr_err("\t");
-}
-
-/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
- rdp->ticks_this_gp = 0;
- rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
- WRITE_ONCE(rdp->last_fqs_resched, jiffies);
-}
-
#ifdef CONFIG_RCU_NOCB_CPU
/*
@@ -1766,11 +1556,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
*/
-/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * The string after "rcu_nocbs=" is either "all" (case-insensitive) for all
+ * CPUs, or a comma-separated list of CPUs and/or CPU ranges. If an invalid
+ * list is given, a warning is emitted and all CPUs are offloaded.
+ */
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- cpulist_parse(str, rcu_nocb_mask);
+ if (!strcasecmp(str, "all"))
+ cpumask_setall(rcu_nocb_mask);
+ else
+ if (cpulist_parse(str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
return 1;
}
__setup("rcu_nocbs=", rcu_nocb_setup);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
new file mode 100644
index 000000000000..f65a73a97323
--- /dev/null
+++ b/kernel/rcu/tree_stall.h
@@ -0,0 +1,709 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RCU CPU stall warnings for normal RCU grace periods
+ *
+ * Copyright IBM Corporation, 2019
+ *
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Controlling CPU stall warnings, including delay calculation.
+
+/* panic() on RCU Stall sysctl. */
+int sysctl_panic_on_rcu_stall __read_mostly;
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+
+/* Limit-check stall timeouts specified at boottime and runtime. */
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ WRITE_ONCE(rcu_cpu_stall_timeout, 3);
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ WRITE_ONCE(rcu_cpu_stall_timeout, 300);
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
+
+/* Don't do RCU CPU stall warnings during long sysrq printouts. */
+void rcu_sysrq_start(void)
+{
+ if (!rcu_cpu_stall_suppress)
+ rcu_cpu_stall_suppress = 2;
+}
+
+void rcu_sysrq_end(void)
+{
+ if (rcu_cpu_stall_suppress == 2)
+ rcu_cpu_stall_suppress = 0;
+}
+
+/* Don't print RCU CPU stall warnings during a kernel panic. */
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
+static void panic_on_rcu_stall(void)
+{
+ if (sysctl_panic_on_rcu_stall)
+ panic("RCU Stall\n");
+}
+
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Interaction with RCU grace periods
+
+/* Start of new grace period, so record stall time (and forcing times). */
+static void record_gp_stall_check_time(void)
+{
+ unsigned long j = jiffies;
+ unsigned long j1;
+
+ rcu_state.gp_start = j;
+ j1 = rcu_jiffies_till_stall_check();
+ /* Record ->gp_start before ->jiffies_stall. */
+ smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
+ rcu_state.jiffies_resched = j + j1 / 2;
+ rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
+}
+
+/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
+static void zero_cpu_stall_ticks(struct rcu_data *rdp)
+{
+ rdp->ticks_this_gp = 0;
+ rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
+ WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+}
+
+/*
+ * If too much time has passed in the current grace period, and if
+ * so configured, go kick the relevant kthreads.
+ */
+static void rcu_stall_kick_kthreads(void)
+{
+ unsigned long j;
+
+ if (!rcu_kick_kthreads)
+ return;
+ j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
+ if (time_after(jiffies, j) && rcu_state.gp_kthread &&
+ (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
+ WARN_ONCE(1, "Kicking %s grace-period kthread\n",
+ rcu_state.name);
+ rcu_ftrace_dump(DUMP_ALL);
+ wake_up_process(rcu_state.gp_kthread);
+ WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
+ }
+}
+
+/*
+ * Handler for the irq_work request posted about halfway into the RCU CPU
+ * stall timeout, and used to detect excessive irq disabling. Set state
+ * appropriately, but just complain if there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = container_of(iwp, struct rcu_data, rcu_iw);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp);
+ if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+ rdp->rcu_iw_gp_seq = rnp->gp_seq;
+ rdp->rcu_iw_pending = false;
+ }
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Printing RCU CPU stall warnings
+
+#ifdef CONFIG_PREEMPT
+
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period on the specified rcu_node structure.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->gp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+ int ndetected = 0;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return 0;
+ pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
+ rnp->level, rnp->grplo, rnp->grphi);
+ t = list_entry(rnp->gp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ pr_cont("\n");
+ return ndetected;
+}
+
+#else /* #ifdef CONFIG_PREEMPT */
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+#endif /* #else #ifdef CONFIG_PREEMPT */
+
+/*
+ * Dump stacks of all tasks running on stalled CPUs. First try using
+ * NMIs, but fall back to manual remote stack tracing on architectures
+ * that don't support NMI-based stack dumps. The NMI-triggered stack
+ * traces are more accurate because they are printed by the target CPU.
+ */
+static void rcu_dump_cpu_stacks(void)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+}
+
+#ifdef CONFIG_RCU_FAST_NO_HZ
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+
+ sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
+ rdp->last_accelerate & 0xffff, jiffies & 0xffff,
+ ".l"[rdp->all_lazy],
+ ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
+ ".D"[!!rdp->tick_nohz_enabled_snap]);
+}
+
+#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ *cp = '\0';
+}
+
+#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+/*
+ * Print out diagnostic information for the specified stalled CPU.
+ *
+ * If the specified CPU is aware of the current RCU grace period, then
+ * print the number of scheduling clock interrupts the CPU has taken
+ * during the time that it has been aware. Otherwise, print the number
+ * of RCU grace periods that this CPU is ignorant of, for example, "1"
+ * if the CPU was aware of the previous grace period.
+ *
+ * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ */
+static void print_cpu_stall_info(int cpu)
+{
+ unsigned long delta;
+ char fast_no_hz[72];
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ char *ticks_title;
+ unsigned long ticks_value;
+
+ /*
+ * We could be printing a lot while holding a spinlock. Avoid
+ * triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+
+ ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
+ if (ticks_value) {
+ ticks_title = "GPs behind";
+ } else {
+ ticks_title = "ticks this GP";
+ ticks_value = rdp->ticks_this_gp;
+ }
+ print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
+ delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
+ cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
+ "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+ rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+ "!."[!delta],
+ ticks_value, ticks_title,
+ rcu_dynticks_snap(rdp) & 0xfff,
+ rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
+ rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
+ fast_no_hz);
+}
+
+/* Complain about starvation of grace-period kthread. */
+static void rcu_check_gp_kthread_starvation(void)
+{
+ struct task_struct *gpk = rcu_state.gp_kthread;
+ unsigned long j;
+
+ j = jiffies - READ_ONCE(rcu_state.gp_activity);
+ if (j > 2 * HZ) {
+ pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
+ rcu_state.name, j,
+ (long)rcu_seq_current(&rcu_state.gp_seq),
+ READ_ONCE(rcu_state.gp_flags),
+ gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
+ gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
+ if (gpk) {
+ pr_err("RCU grace-period kthread stack dump:\n");
+ sched_show_task(gpk);
+ wake_up_process(gpk);
+ }
+ }
+}
+
+static void print_other_cpu_stall(unsigned long gp_seq)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long gpa;
+ unsigned long j;
+ int ndetected = 0;
+ struct rcu_node *rnp;
+ long totqlen = 0;
+
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads();
+ if (rcu_cpu_stall_suppress)
+ return;
+
+ /*
+ * OK, time to rat on our buddy...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ ndetected += rcu_print_task_stall(rnp);
+ if (rnp->qsmask != 0) {
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ print_cpu_stall_info(cpu);
+ ndetected++;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+
+ for_each_possible_cpu(cpu)
+ totqlen += rcu_get_n_cbs_cpu(cpu);
+ pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+ smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ if (ndetected) {
+ rcu_dump_cpu_stacks();
+
+ /* Complain about tasks blocking the grace period. */
+ rcu_for_each_leaf_node(rnp)
+ rcu_print_detail_task_stall_rnp(rnp);
+ } else {
+ if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
+ pr_err("INFO: Stall ended before state dump start\n");
+ } else {
+ j = jiffies;
+ gpa = READ_ONCE(rcu_state.gp_activity);
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
+ rcu_state.name, j - gpa, j, gpa,
+ READ_ONCE(jiffies_till_next_fqs),
+ rcu_get_root()->qsmask);
+ /* In this case, the current CPU might be at fault. */
+ sched_show_task(current);
+ }
+ }
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+
+ rcu_check_gp_kthread_starvation();
+
+ panic_on_rcu_stall();
+
+ rcu_force_quiescent_state(); /* Kick them all. */
+}
+
+static void print_cpu_stall(void)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rcu_get_root();
+ long totqlen = 0;
+
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads();
+ if (rcu_cpu_stall_suppress)
+ return;
+
+ /*
+ * OK, time to rat on ourselves...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
+ raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
+ print_cpu_stall_info(smp_processor_id());
+ raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
+ for_each_possible_cpu(cpu)
+ totqlen += rcu_get_n_cbs_cpu(cpu);
+ pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+ jiffies - rcu_state.gp_start,
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+
+ rcu_check_gp_kthread_starvation();
+
+ rcu_dump_cpu_stacks();
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ panic_on_rcu_stall();
+
+ /*
+ * Attempt to revive the RCU machinery by forcing a context switch.
+ *
+ * A context switch would normally allow the RCU state machine to make
+ * progress and it could be we're stuck in kernel space without context
+ * switches for an entirely unreasonable amount of time.
+ */
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
+static void check_cpu_stall(struct rcu_data *rdp)
+{
+ unsigned long gs1;
+ unsigned long gs2;
+ unsigned long gps;
+ unsigned long j;
+ unsigned long jn;
+ unsigned long js;
+ struct rcu_node *rnp;
+
+ if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
+ !rcu_gp_in_progress())
+ return;
+ rcu_stall_kick_kthreads();
+ j = jiffies;
+
+ /*
+ * Lots of memory barriers to reject false positives.
+ *
+ * The idea is to pick up rcu_state.gp_seq, then
+ * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
+ * another copy of rcu_state.gp_seq. These values are updated in
+ * the opposite order with memory barriers (or equivalent) during
+ * grace-period initialization and cleanup. Now, a false positive
+ * can occur if we get a new value of rcu_state.gp_start and an old
+ * value of rcu_state.jiffies_stall. But given the memory barriers,
+ * the only way that this can happen is if one grace period ends
+ * and another starts between these two fetches. This is detected
+ * by comparing the second fetch of rcu_state.gp_seq with the
+ * previous fetch from rcu_state.gp_seq.
+ *
+ * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
+ * and rcu_state.gp_start suffice to forestall false positives.
+ */
+ gs1 = READ_ONCE(rcu_state.gp_seq);
+ smp_rmb(); /* Pick up ->gp_seq first... */
+ js = READ_ONCE(rcu_state.jiffies_stall);
+ smp_rmb(); /* ...then ->jiffies_stall before the rest... */
+ gps = READ_ONCE(rcu_state.gp_start);
+ smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
+ gs2 = READ_ONCE(rcu_state.gp_seq);
+ if (gs1 != gs2 ||
+ ULONG_CMP_LT(j, js) ||
+ ULONG_CMP_GE(gps, js))
+ return; /* No stall or GP completed since entering function. */
+ rnp = rdp->mynode;
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ if (rcu_gp_in_progress() &&
+ (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+ cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall();
+
+ } else if (rcu_gp_in_progress() &&
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
+ cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(gs2);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU forward-progress mechanisms, including those for callback invocation.
+
+
+/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ int cpu;
+ unsigned long j;
+ unsigned long ja;
+ unsigned long jr;
+ unsigned long jw;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ j = jiffies;
+ ja = j - READ_ONCE(rcu_state.gp_activity);
+ jr = j - READ_ONCE(rcu_state.gp_req_activity);
+ jw = j - READ_ONCE(rcu_state.gp_wake_time);
+ pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ rcu_state.name, gp_state_getname(rcu_state.gp_state),
+ rcu_state.gp_state,
+ rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
+ ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
+ (long)READ_ONCE(rcu_state.gp_seq),
+ (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
+ READ_ONCE(rcu_state.gp_flags));
+ rcu_for_each_node_breadth_first(rnp) {
+ if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
+ continue;
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
+ rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
+ (long)rnp->gp_seq_needed);
+ if (!rcu_is_leaf_node(rnp))
+ continue;
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->gpwrap ||
+ ULONG_CMP_GE(rcu_state.gp_seq,
+ rdp->gp_seq_needed))
+ continue;
+ pr_info("\tcpu %d ->gp_seq_needed %ld\n",
+ cpu, (long)rdp->gp_seq_needed);
+ }
+ }
+ /* sched_show_task(rcu_state.gp_kthread); */
+}
+EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+
+/*
+ * This function checks for grace-period requests that fail to motivate
+ * RCU to come out of its idle mode, warning at most once (PROVE_RCU only).
+ */
+static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay)
+{
+ unsigned long flags;
+ unsigned long j;
+ struct rcu_node *rnp_root = rcu_get_root();
+ static atomic_t warned = ATOMIC_INIT(0);
+
+ if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
+ return;
+ j = jiffies; /* Expensive access, and in common case don't get here. */
+ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_read(&warned))
+ return;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ j = jiffies;
+ if (rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
+ time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_read(&warned)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ /* Hold onto the leaf lock to make others see warned==1. */
+ if (rnp_root != rnp)
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ j = jiffies;
+ if (rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
+ time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
+ time_before(j, rcu_state.gp_activity + gpssdelay) ||
+ atomic_xchg(&warned, 1)) {
+ if (rnp_root != rnp) /* Root lock was taken only if distinct from rnp. */
+ raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ WARN_ON(1);
+ if (rnp_root != rnp)
+ raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ show_rcu_gp_kthreads();
+}
+
+/*
+ * Do a forward-progress check for rcutorture. This is normally invoked
+ * due to an OOM event. The argument "j" gives the time period during
+ * which rcutorture would like progress to have been made.
+ */
+void rcu_fwd_progress_check(unsigned long j)
+{
+ unsigned long cbs;
+ int cpu;
+ unsigned long max_cbs = 0;
+ int max_cpu = -1;
+ struct rcu_data *rdp;
+
+ if (rcu_gp_in_progress()) {
+ pr_info("%s: GP age %lu jiffies\n",
+ __func__, jiffies - rcu_state.gp_start);
+ show_rcu_gp_kthreads();
+ } else {
+ pr_info("%s: Last GP end %lu jiffies ago\n",
+ __func__, jiffies - rcu_state.gp_end);
+ preempt_disable();
+ rdp = this_cpu_ptr(&rcu_data);
+ rcu_check_gp_start_stall(rdp->mynode, rdp, j);
+ preempt_enable();
+ }
+ for_each_possible_cpu(cpu) {
+ cbs = rcu_get_n_cbs_cpu(cpu);
+ if (!cbs)
+ continue;
+ if (max_cpu < 0)
+ pr_info("%s: callbacks", __func__);
+ pr_cont(" %d: %lu", cpu, cbs);
+ if (cbs <= max_cbs)
+ continue;
+ max_cbs = cbs;
+ max_cpu = cpu;
+ }
+ if (max_cpu >= 0)
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
+
+/* Commandeer a sysrq key to dump RCU's tree. */
+static bool sysrq_rcu;
+module_param(sysrq_rcu, bool, 0444);
+
+/* Dump grace-period-request information due to commandeered sysrq. */
+static void sysrq_show_rcu(int key)
+{
+ show_rcu_gp_kthreads();
+}
+
+static struct sysrq_key_op sysrq_rcudump_op = {
+ .handler = sysrq_show_rcu,
+ .help_msg = "show-rcu(y)",
+ .action_msg = "Show RCU tree",
+ .enable_mask = SYSRQ_ENABLE_DUMP,
+};
+
+static int __init rcu_sysrq_init(void)
+{
+ if (sysrq_rcu)
+ return register_sysrq_key('y', &sysrq_rcudump_op);
+ return 0;
+}
+early_initcall(rcu_sysrq_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index cbaa976c5945..c3bf44ba42e5 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#endif
#ifdef CONFIG_RCU_STALL_COMMON
-
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA 0
-#endif
-
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
-static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
-
module_param(rcu_cpu_stall_suppress, int, 0644);
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
-
-int rcu_jiffies_till_stall_check(void)
-{
- int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
-
- /*
- * Limit check must be consistent with the Kconfig limits
- * for CONFIG_RCU_CPU_STALL_TIMEOUT.
- */
- if (till_stall_check < 3) {
- WRITE_ONCE(rcu_cpu_stall_timeout, 3);
- till_stall_check = 3;
- } else if (till_stall_check > 300) {
- WRITE_ONCE(rcu_cpu_stall_timeout, 300);
- till_stall_check = 300;
- }
- return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
-}
-EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
-
-void rcu_sysrq_start(void)
-{
- if (!rcu_cpu_stall_suppress)
- rcu_cpu_stall_suppress = 2;
-}
-
-void rcu_sysrq_end(void)
-{
- if (rcu_cpu_stall_suppress == 2)
- rcu_cpu_stall_suppress = 0;
-}
-
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
- rcu_cpu_stall_suppress = 1;
- return NOTIFY_DONE;
-}
-
-static struct notifier_block rcu_panic_block = {
- .notifier_call = rcu_panic,
-};
-
-static int __init check_cpu_stall_init(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
- return 0;
-}
-early_initcall(check_cpu_stall_init);
-
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/resource.c b/kernel/resource.c
index 92190f62ebc5..8c15f846e8ef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -520,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram);
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
unsigned long desc)
{
- resource_size_t end = start + size - 1;
+ struct resource res;
int type = 0; int other = 0;
struct resource *p;
+ res.start = start;
+ res.end = start + size - 1;
+
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
bool is_type = (((p->flags & flags) == flags) &&
((desc == IORES_DESC_NONE) ||
(desc == p->desc)));
- if (start >= p->start && start <= p->end)
- is_type ? type++ : other++;
- if (end >= p->start && end <= p->end)
- is_type ? type++ : other++;
- if (p->start >= start && p->end <= end)
+ if (resource_overlaps(p, &res))
is_type ? type++ : other++;
}
read_unlock(&resource_lock);
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 25e9a7b60eba..9424ee90589e 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs)
* - signal delivery,
* and return to user-space.
*
- * This is how we can ensure that the entire rseq critical section,
- * consisting of both the C part and the assembly instruction sequence,
+ * This is how we can ensure that the entire rseq critical section
* will issue the commit instruction only if executed atomically with
* respect to other threads scheduled on the same CPU, and with respect
* to signal handlers.
@@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (current->rseq_len != rseq_len)
+ if (rseq_len != sizeof(*rseq))
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (ret)
return ret;
current->rseq = NULL;
- current->rseq_len = 0;
current->rseq_sig = 0;
return 0;
}
@@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || current->rseq_len != rseq_len)
+ if (current->rseq != rseq || rseq_len != sizeof(*rseq))
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
- current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4778c48a7fda..102dfcf0a29a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags);
+
+ p->on_rq = TASK_ON_RQ_QUEUED;
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
@@ -920,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
}
/*
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
@@ -1151,7 +1155,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
return 0;
} else if (task_on_rq_queued(p)) {
/*
@@ -1237,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
- p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
@@ -1681,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
__schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
-{
- activate_task(rq, p, en_flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
-
- /* If a worker is waking up, notify the workqueue: */
- if (p->flags & PF_WQ_WORKER)
- wq_worker_waking_up(p, cpu_of(rq));
-}
-
/*
* Mark the task runnable and perform wakeup-preemption.
*/
@@ -1742,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
en_flags |= ENQUEUE_MIGRATED;
#endif
- ttwu_activate(rq, p, en_flags);
+ activate_task(rq, p, en_flags);
ttwu_do_wakeup(rq, p, wake_flags, rf);
}
@@ -2107,56 +2098,6 @@ out:
}
/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- * @rf: request-queue flags for pinning
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
-{
- struct rq *rq = task_rq(p);
-
- if (WARN_ON_ONCE(rq != this_rq()) ||
- WARN_ON_ONCE(p == current))
- return;
-
- lockdep_assert_held(&rq->lock);
-
- if (!raw_spin_trylock(&p->pi_lock)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we've
- * not yet picked a replacement task.
- */
- rq_unlock(rq, rf);
- raw_spin_lock(&p->pi_lock);
- rq_relock(rq, rf);
- }
-
- if (!(p->state & TASK_NORMAL))
- goto out;
-
- trace_sched_waking(p);
-
- if (!task_on_rq_queued(p)) {
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&rq->nr_iowait);
- }
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
- }
-
- ttwu_do_wakeup(rq, p, 0, rf);
- ttwu_stat(p, smp_processor_id(), 0);
-out:
- raw_spin_unlock(&p->pi_lock);
-}
-
-/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -2467,7 +2408,6 @@ void wake_up_new_task(struct task_struct *p)
post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -3466,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt)
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
- prev->on_rq = 0;
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
-
- /*
- * If a worker went to sleep, notify and ask workqueue
- * whether it wants to wake up a task to maintain
- * concurrency.
- */
- if (prev->flags & PF_WQ_WORKER) {
- struct task_struct *to_wakeup;
-
- to_wakeup = wq_worker_sleeping(prev);
- if (to_wakeup)
- try_to_wake_up_local(to_wakeup, &rf);
- }
}
switch_count = &prev->nvcsw;
}
@@ -3544,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk)
{
if (!tsk->state || tsk_is_pi_blocked(tsk))
return;
+
+ /*
+ * If a worker went to sleep, notify and ask workqueue whether
+ * it wants to wake up a task to maintain concurrency.
+ * As this function is called inside the schedule() context,
+ * we disable preemption to avoid it calling schedule() again
+ * in the possible wakeup of a kworker.
+ */
+ if (tsk->flags & PF_WQ_WORKER) {
+ preempt_disable();
+ wq_worker_sleeping(tsk);
+ preempt_enable_no_resched();
+ }
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -3552,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
+static void sched_update_worker(struct task_struct *tsk)
+{
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_running(tsk);
+}
+
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
@@ -3562,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void)
__schedule(false);
sched_preempt_enable_no_resched();
} while (need_resched());
+ sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -5918,7 +5865,7 @@ void __init sched_init_smp(void)
static int __init migration_init(void)
{
- sched_rq_cpu_starting(smp_processor_id());
+ sched_cpu_starting(smp_processor_id());
return 0;
}
early_initcall(migration_init);
@@ -6559,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
{
+ if (shareval > scale_load_down(ULONG_MAX))
+ shareval = MAX_SHARES;
return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
@@ -6574,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -6654,20 +6603,22 @@ out_unlock:
return ret;
}
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
u64 quota, period;
period = ktime_to_ns(tg->cfs_bandwidth.period);
if (cfs_quota_us < 0)
quota = RUNTIME_INF;
- else
+ else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+ else
+ return -EINVAL;
return tg_set_cfs_bandwidth(tg, period, quota);
}
-long tg_get_cfs_quota(struct task_group *tg)
+static long tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
@@ -6680,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg)
return quota_us;
}
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
u64 quota, period;
+ if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
period = (u64)cfs_period_us * NSEC_PER_USEC;
quota = tg->cfs_bandwidth.quota;
return tg_set_cfs_bandwidth(tg, period, quota);
}
-long tg_get_cfs_period(struct task_group *tg)
+static long tg_get_cfs_period(struct task_group *tg)
{
u64 cfs_period_us;
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 835671f0f917..b5dcd1d83c7f 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
*/
#include "sched.h"
-DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
/**
* cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index b3a878aa593d..5403479073b0 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -773,6 +773,7 @@ out:
return 0;
fail:
+ kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
sugov_tunables_free(tunables);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8039d62ae36e..678bfb9bd87f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -702,7 +702,7 @@ do { \
static const char *sched_tunable_scaling_names[] = {
"none",
- "logaritmic",
+ "logarithmic",
"linear"
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 35f3ea375084..f35930f5e528 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2597,7 +2597,7 @@ out:
/*
* Drive the periodic memory faults..
*/
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->numa_work;
u64 period, now;
@@ -3571,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
*/
-void sync_entity_load_avg(struct sched_entity *se)
+static void sync_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 last_update_time;
@@ -3584,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se)
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
-void remove_entity_load_avg(struct sched_entity *se)
+static void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
unsigned long flags;
@@ -5145,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);
-static unsigned long capacity_of(int cpu);
static inline bool cpu_overutilized(int cpu)
{
@@ -7521,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -7657,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p)
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -9551,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
+ * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED is not
+ * set anywhere yet.
*/
static inline int find_new_ilb(void)
{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
+ int ilb;
- if (ilb < nr_cpu_ids && idle_cpu(ilb))
- return ilb;
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+ housekeeping_cpumask(HK_FLAG_MISC)) {
+ if (idle_cpu(ilb))
+ return ilb;
+ }
return nr_cpu_ids;
}
/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
- * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
- * CPU (if there is one).
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
+ * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b02d148e7672..687302051a27 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -65,6 +65,7 @@ void __init housekeeping_init(void)
static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
cpumask_var_t non_housekeeping_mask;
+ cpumask_var_t tmp;
int err;
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
@@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
return 0;
}
+ alloc_bootmem_cpumask_var(&tmp);
if (!housekeeping_flags) {
alloc_bootmem_cpumask_var(&housekeeping_mask);
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, non_housekeeping_mask);
- if (cpumask_empty(housekeeping_mask))
+
+ cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+ if (cpumask_empty(tmp)) {
+ pr_warn("Housekeeping: must include one present CPU, "
+ "using boot CPU:%d\n", smp_processor_id());
__cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+ }
} else {
- cpumask_var_t tmp;
-
- alloc_bootmem_cpumask_var(&tmp);
+ cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+ if (cpumask_empty(tmp))
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
if (!cpumask_equal(tmp, housekeeping_mask)) {
pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
@@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
- free_bootmem_cpumask_var(tmp);
}
+ free_bootmem_cpumask_var(tmp);
if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 90fa23d36565..1e6b909dca36 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
if (rt_runtime_us < 0)
rt_runtime = RUNTIME_INF;
+ else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
@@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
u64 rt_runtime, rt_period;
+ if (rt_period_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
rt_period = rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index efa686eeff26..b52ed1ada0be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -780,7 +780,7 @@ struct root_domain {
* NULL-terminated list of performance domains intersecting with the
* CPUs of the rd. Protected by RCU.
*/
- struct perf_domain *pd;
+ struct perf_domain __rcu *pd;
};
extern struct root_domain def_root_domain;
@@ -869,8 +869,8 @@ struct rq {
atomic_t nr_iowait;
#ifdef CONFIG_SMP
- struct root_domain *rd;
- struct sched_domain *sd;
+ struct root_domain *rd;
+ struct sched_domain __rcu *sd;
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
@@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
return sd;
}
-DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;
struct sched_group_capacity {
@@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu)
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
-DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
/**
* cpufreq_update_util - Take a note about CPU utilization changes.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ab7f371a3a17..f53f89df837d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
* the cpumask of the domain), this allows us to quickly tell if
* two CPUs are in the same cache domain, see cpus_share_cache().
*/
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu)
@@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
struct sched_group *sg;
+ bool already_visited;
if (child)
cpu = cpumask_first(sched_domain_span(child));
@@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
sg = *per_cpu_ptr(sdd->sg, cpu);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
- /* For claim_allocations: */
- atomic_inc(&sg->ref);
- atomic_inc(&sg->sgc->ref);
+ /* Increase refcounts for claim_allocations: */
+ already_visited = atomic_inc_return(&sg->ref) > 1;
+ /* sgc visits should follow a similar trend as sg */
+ WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+ /* If we have already visited that group, it's already initialized. */
+ if (already_visited)
+ return sg;
if (child) {
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
/*
* build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
}
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
*/
int sched_init_domains(const struct cpumask *cpu_map)
{
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index df27e499956a..3582eeb59893 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -502,7 +502,10 @@ out:
*
* Caller must be holding current->sighand->siglock lock.
*
- * Returns 0 on success, -ve on error.
+ * Returns 0 on success, -ve on error, or
+ * - in TSYNC mode: the pid of a thread which was either not in the correct
+ * seccomp mode or did not have an ancestral seccomp filter
+ * - in NEW_LISTENER mode: the fd of the new listener
*/
static long seccomp_attach_filter(unsigned int flags,
struct seccomp_filter *filter)
@@ -1258,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags,
if (flags & ~SECCOMP_FILTER_FLAG_MASK)
return -EINVAL;
+ /*
+ * On success, NEW_LISTENER returns the new listener fd. On failure,
+ * TSYNC returns the pid of the thread that could not be synchronized.
+ * If you combine these two flags, there's no way to tell whether
+ * something succeeded or failed. So, let's disallow this combination.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
+ (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
@@ -1304,7 +1317,7 @@ out:
mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
- if (ret < 0) {
+ if (ret) {
listener_f->private_data = NULL;
fput(listener_f);
put_unused_fd(listener);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 10277429ed84..2c3382378d94 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t)
}
EXPORT_SYMBOL(tasklet_kill);
-/*
- * tasklet_hrtimer
- */
-
-/*
- * The trampoline is called when the hrtimer expires. It schedules a tasklet
- * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
- * hrtimer callback, but from softirq context.
- */
-static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
-{
- struct tasklet_hrtimer *ttimer =
- container_of(timer, struct tasklet_hrtimer, timer);
-
- tasklet_hi_schedule(&ttimer->tasklet);
- return HRTIMER_NORESTART;
-}
-
-/*
- * Helper function which calls the hrtimer callback from
- * tasklet/softirq context
- */
-static void __tasklet_hrtimer_trampoline(unsigned long data)
-{
- struct tasklet_hrtimer *ttimer = (void *)data;
- enum hrtimer_restart restart;
-
- restart = ttimer->function(&ttimer->timer);
- if (restart != HRTIMER_NORESTART)
- hrtimer_restart(&ttimer->timer);
-}
-
-/**
- * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
- * @ttimer: tasklet_hrtimer which is initialized
- * @function: hrtimer callback function which gets called from softirq context
- * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
- * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
- */
-void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
- enum hrtimer_restart (*function)(struct hrtimer *),
- clockid_t which_clock, enum hrtimer_mode mode)
-{
- hrtimer_init(&ttimer->timer, which_clock, mode);
- ttimer->timer.function = __hrtimer_tasklet_trampoline;
- tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
- (unsigned long)ttimer);
- ttimer->function = function;
-}
-EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
-
void __init softirq_init(void)
{
int cpu;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index f8edee9c792d..27bafc1e271e 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -5,41 +5,56 @@
*
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
+#include <linux/sched/task_stack.h>
+#include <linux/sched/debug.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
-void print_stack_trace(struct stack_trace *trace, int spaces)
+/**
+ * stack_trace_print - Print the entries in the stack trace
+ * @entries: Pointer to storage array
+ * @nr_entries: Number of entries in the storage array
+ * @spaces: Number of leading spaces to print
+ */
+void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
+ int spaces)
{
- int i;
+ unsigned int i;
- if (WARN_ON(!trace->entries))
+ if (WARN_ON(!entries))
return;
- for (i = 0; i < trace->nr_entries; i++)
- printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]);
+ for (i = 0; i < nr_entries; i++)
+ printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);
}
-EXPORT_SYMBOL_GPL(print_stack_trace);
+EXPORT_SYMBOL_GPL(stack_trace_print);
-int snprint_stack_trace(char *buf, size_t size,
- struct stack_trace *trace, int spaces)
+/**
+ * stack_trace_snprint - Print the entries in the stack trace into a buffer
+ * @buf: Pointer to the print buffer
+ * @size: Size of the print buffer
+ * @entries: Pointer to storage array
+ * @nr_entries: Number of entries in the storage array
+ * @spaces: Number of leading spaces to print
+ *
+ * Return: Number of bytes printed.
+ */
+int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
+ unsigned int nr_entries, int spaces)
{
- int i;
- int generated;
- int total = 0;
+ unsigned int generated, i, total = 0;
- if (WARN_ON(!trace->entries))
+ if (WARN_ON(!entries))
return 0;
- for (i = 0; i < trace->nr_entries; i++) {
+ for (i = 0; i < nr_entries && size; i++) {
generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ',
- (void *)trace->entries[i]);
+ (void *)entries[i]);
total += generated;
-
- /* Assume that generated isn't a negative number */
if (generated >= size) {
buf += size;
size = 0;
@@ -51,7 +66,176 @@ int snprint_stack_trace(char *buf, size_t size,
return total;
}
-EXPORT_SYMBOL_GPL(snprint_stack_trace);
+EXPORT_SYMBOL_GPL(stack_trace_snprint);
+
+#ifdef CONFIG_ARCH_STACKWALK
+
+struct stacktrace_cookie {
+ unsigned long *store;
+ unsigned int size;
+ unsigned int skip;
+ unsigned int len;
+};
+
+static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
+ bool reliable)
+{
+ struct stacktrace_cookie *c = cookie;
+
+ if (c->len >= c->size)
+ return false;
+
+ if (c->skip > 0) {
+ c->skip--;
+ return true;
+ }
+ c->store[c->len++] = addr;
+ return c->len < c->size;
+}
+
+static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr,
+ bool reliable)
+{
+ if (in_sched_functions(addr))
+ return true;
+ return stack_trace_consume_entry(cookie, addr, reliable);
+}
+
+/**
+ * stack_trace_save - Save a stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr + 1,
+ };
+
+ arch_stack_walk(consume_entry, &c, current, NULL);
+ return c.len;
+}
+EXPORT_SYMBOL_GPL(stack_trace_save);
+
+/**
+ * stack_trace_save_tsk - Save a task stack trace into a storage array
+ * @tsk: The task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr + 1,
+ };
+
+ if (!try_get_task_stack(tsk))
+ return 0;
+
+ arch_stack_walk(consume_entry, &c, tsk, NULL);
+ put_task_stack(tsk);
+ return c.len;
+}
+
+/**
+ * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
+ * @regs: Pointer to pt_regs to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr,
+ };
+
+ arch_stack_walk(consume_entry, &c, current, regs);
+ return c.len;
+}
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+/**
+ * stack_trace_save_tsk_reliable - Save task stack with verification
+ * @tsk: Pointer to the task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: An error if it detects any unreliable features of the
+ * stack. Otherwise it guarantees that the stack trace is
+ * reliable and returns the number of entries stored.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
+ unsigned int size)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ };
+ int ret;
+
+ /*
+ * If the task doesn't have a stack (e.g., a zombie), the stack is
+ * "reliably" empty.
+ */
+ if (!try_get_task_stack(tsk))
+ return 0;
+
+ ret = arch_stack_walk_reliable(consume_entry, &c, tsk);
+ put_task_stack(tsk);
+ return ret ? ret : c.len;
+}
+#endif
+
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
+/**
+ * stack_trace_save_user - Save a user space stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ };
+
+ /* Trace user stack if not a kernel thread */
+ if (!current->mm)
+ return 0;
+
+ arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
+ return c.len;
+}
+#endif
+
+#else /* CONFIG_ARCH_STACKWALK */
/*
* Architectures that do not implement save_stack_trace_*()
@@ -77,3 +261,118 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,
WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
return -ENOSYS;
}
+
+/**
+ * stack_trace_save - Save a stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr + 1,
+ };
+
+ save_stack_trace(&trace);
+ return trace.nr_entries;
+}
+EXPORT_SYMBOL_GPL(stack_trace_save);
+
+/**
+ * stack_trace_save_tsk - Save a task stack trace into a storage array
+ * @task: The task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_tsk(struct task_struct *task,
+ unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr + 1,
+ };
+
+ save_stack_trace_tsk(task, &trace);
+ return trace.nr_entries;
+}
+
+/**
+ * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
+ * @regs: Pointer to pt_regs to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr,
+ };
+
+ save_stack_trace_regs(regs, &trace);
+ return trace.nr_entries;
+}
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+/**
+ * stack_trace_save_tsk_reliable - Save task stack with verification
+ * @tsk: Pointer to the task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: An error if it detects any unreliable features of the
+ * stack. Otherwise it guarantees that the stack trace is
+ * reliable and returns the number of entries stored.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
+ unsigned int size)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ };
+ int ret = save_stack_trace_tsk_reliable(tsk, &trace);
+
+ return ret ? ret : trace.nr_entries;
+}
+#endif
+
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
+/**
+ * stack_trace_save_user - Save a user space stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ };
+
+ save_stack_trace_user(&trace);
+ return trace.nr_entries;
+}
+#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
+
+#endif /* !CONFIG_ARCH_STACKWALK */
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5e77662dd2d9..f5490222e134 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -611,6 +611,22 @@ void clockevents_resume(void)
}
#ifdef CONFIG_HOTPLUG_CPU
+
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+/**
+ * tick_offline_cpu - Take CPU out of the broadcast mechanism
+ * @cpu: The outgoing CPU
+ *
+ * Called on the outgoing CPU after it took itself offline.
+ */
+void tick_offline_cpu(unsigned int cpu)
+{
+ raw_spin_lock(&clockevents_lock);
+ tick_broadcast_offline(cpu);
+ raw_spin_unlock(&clockevents_lock);
+}
+# endif
+
/**
* tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
*/
@@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu)
raw_spin_lock_irqsave(&clockevents_lock, flags);
- tick_shutdown_broadcast_oneshot(cpu);
- tick_shutdown_broadcast(cpu);
tick_shutdown(cpu);
/*
* Unregister the clock event devices which were
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index ac9c03dd6c7d..d23b434c2ca7 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
{
- unsigned long seq;
+ unsigned int seq;
u64 ret;
do {
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 930113b9799a..968e4b07918e 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void)
{
u64 cyc, res;
- unsigned long seq;
+ unsigned int seq;
struct clock_read_data *rd;
do {
@@ -267,7 +267,7 @@ void __init generic_sched_clock_init(void)
*/
static u64 notrace suspended_sched_clock_read(void)
{
- unsigned long seq = raw_read_seqcount(&cd.seq);
+ unsigned int seq = raw_read_seqcount(&cd.seq);
return cd.read_data[seq & 1].epoch_cyc;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index ee834d4fb814..e51778c312f1 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+# ifdef CONFIG_HOTPLUG_CPU
+static void tick_broadcast_oneshot_offline(unsigned int cpu);
+# endif
#else
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
static inline void tick_broadcast_clear_oneshot(int cpu) { }
static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
+# ifdef CONFIG_HOTPLUG_CPU
+static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { }
+# endif
#endif
/*
@@ -433,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
}
#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Remove a CPU from broadcasting
- */
-void tick_shutdown_broadcast(unsigned int cpu)
+static void tick_shutdown_broadcast(void)
{
- struct clock_event_device *bc;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
- bc = tick_broadcast_device.evtdev;
- cpumask_clear_cpu(cpu, tick_broadcast_mask);
- cpumask_clear_cpu(cpu, tick_broadcast_on);
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
if (bc && cpumask_empty(tick_broadcast_mask))
clockevents_shutdown(bc);
}
+}
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+/*
+ * Remove a CPU from broadcasting
+ */
+void tick_broadcast_offline(unsigned int cpu)
+{
+ raw_spin_lock(&tick_broadcast_lock);
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_on);
+ tick_broadcast_oneshot_offline(cpu);
+ tick_shutdown_broadcast();
+ raw_spin_unlock(&tick_broadcast_lock);
}
+
#endif
void tick_suspend_broadcast(void)
@@ -801,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
* either the CPU handling the broadcast
* interrupt or we got woken by something else.
*
- * We are not longer in the broadcast mask, so
+ * We are no longer in the broadcast mask, so
* if the cpu local expiry time is already
* reached, we would reprogram the cpu local
* timer with an already expired event.
*
* This can lead to a ping-pong when we return
- * to idle and therefor rearm the broadcast
+ * to idle and therefore rearm the broadcast
* timer before the cpu local timer was able
* to fire. This happens because the forced
* reprogramming makes sure that the event
@@ -950,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
}
/*
- * Remove a dead CPU from broadcasting
+ * Remove a dying CPU from broadcasting
*/
-void tick_shutdown_broadcast_oneshot(unsigned int cpu)
+static void tick_broadcast_oneshot_offline(unsigned int cpu)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
/*
* Clear the broadcast masks for the dead cpu, but do not stop
* the broadcast device!
@@ -965,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu)
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
-
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index df401463a191..59225b484e4e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -46,6 +46,14 @@ ktime_t tick_period;
* procedure also covers cpu hotplug.
*/
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
+ * tick_do_timer_cpu and it should be taken over by an eligible secondary
+ * when one comes online.
+ */
+static int tick_do_timer_boot_cpu __read_mostly = -1;
+#endif
/*
* Debugging: see timer_list.c
@@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
!tick_broadcast_oneshot_active()) {
clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
- unsigned long seq;
+ unsigned int seq;
ktime_t next;
do {
@@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
}
}
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * Hand the do_timer duty over to another CPU.
+ *
+ * @info: points to an int holding the number of the CPU that takes over
+ *
+ * Meant to run on the CPU that currently owns tick_do_timer_cpu; a
+ * warning is emitted if it runs anywhere else, but the ownership is
+ * reassigned regardless.
+ */
+static void giveup_do_timer(void *info)
+{
+ /* The caller passes the address of an int, so read it back as one */
+ int cpu = *(int *)info;
+
+ WARN_ON(tick_do_timer_cpu != smp_processor_id());
+
+ tick_do_timer_cpu = cpu;
+}
+
+/*
+ * Pull the do_timer duty from the recorded boot CPU to the calling CPU.
+ *
+ * Nothing is done when no boot CPU is recorded (negative value) or when
+ * the recorded boot CPU is the calling CPU itself. Otherwise
+ * giveup_do_timer() is executed on the boot CPU with the calling CPU's
+ * number as argument.
+ */
+static void tick_take_do_timer_from_boot(void)
+{
+ int boot_cpu = tick_do_timer_boot_cpu;
+ int cpu = smp_processor_id();
+
+ if (boot_cpu < 0 || boot_cpu == cpu)
+ return;
+
+ /*
+ * Final argument is the wait flag: the call is synchronous, so the
+ * on-stack 'cpu' stays valid while the remote side reads it.
+ */
+ smp_call_function_single(boot_cpu, giveup_do_timer, &cpu, 1);
+}
+#endif
+
/*
* Setup the tick device
*/
@@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- if (!tick_nohz_full_cpu(cpu))
- tick_do_timer_cpu = cpu;
- else
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ tick_do_timer_cpu = cpu;
+
tick_next_period = ktime_get();
tick_period = NSEC_PER_SEC / HZ;
+#ifdef CONFIG_NO_HZ_FULL
+ /*
+ * The boot CPU may be nohz_full, in which case set
+ * tick_do_timer_boot_cpu so the first housekeeping
+ * secondary that comes up will take do_timer from
+ * us.
+ */
+ if (tick_nohz_full_cpu(cpu))
+ tick_do_timer_boot_cpu = cpu;
+
+ } else if (tick_do_timer_boot_cpu != -1 &&
+ !tick_nohz_full_cpu(cpu)) {
+ tick_take_do_timer_from_boot();
+ tick_do_timer_boot_cpu = -1;
+ WARN_ON(tick_do_timer_cpu != cpu);
+#endif
}
/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index e277284c2831..7b2496136729 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
extern void tick_install_broadcast_device(struct clock_event_device *dev);
extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_shutdown_broadcast(unsigned int cpu);
extern void tick_suspend_broadcast(void);
extern void tick_resume_broadcast(void);
extern bool tick_resume_check_broadcast(void);
@@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev)
static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_shutdown_broadcast(unsigned int cpu) { }
static inline void tick_suspend_broadcast(void) { }
static inline void tick_resume_broadcast(void) { }
static inline bool tick_resume_check_broadcast(void) { return false; }
@@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
/* Functions related to oneshot broadcasting */
#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern void tick_broadcast_switch_to_oneshot(void);
-extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast_this_cpu(void);
bool tick_broadcast_oneshot_available(void);
extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
#else /* !(BROADCAST && ONESHOT): */
static inline void tick_broadcast_switch_to_oneshot(void) { }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
#endif /* !(BROADCAST && ONESHOT) */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU)
+extern void tick_broadcast_offline(unsigned int cpu);
+#else
+static inline void tick_broadcast_offline(unsigned int cpu) { }
+#endif
+
/* NO_HZ_FULL internal */
#ifdef CONFIG_NO_HZ_FULL
extern void tick_nohz_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8d18e03124ff..f4ee1a3428ae 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
* into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
* jiffies_lock.
+ *
+ * If nohz_full is enabled, this should not happen because the
+ * tick_do_timer_cpu CPU never relinquishes the timekeeping duty.
*/
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
- && !tick_nohz_full_cpu(cpu))
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
+#ifdef CONFIG_NO_HZ_FULL
+ WARN_ON(tick_nohz_full_running);
+#endif
tick_do_timer_cpu = cpu;
+ }
#endif
/* Check, if the jiffies need an update */
@@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
static int tick_nohz_cpu_down(unsigned int cpu)
{
/*
- * The boot CPU handles housekeeping duty (unbound timers,
- * workqueues, timekeeping, ...) on behalf of full dynticks
+ * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+ * timers, workqueues, timekeeping, ...) on behalf of full dynticks
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
@@ -423,12 +429,15 @@ void __init tick_nohz_init(void)
return;
}
- cpu = smp_processor_id();
+ if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
+ !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
+ cpu = smp_processor_id();
- if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
- pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
- cpu);
- cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+ pr_warn("NO_HZ: Clearing %d from nohz_full range "
+ "for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ }
}
for_each_cpu(cpu, tick_nohz_full_mask)
@@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void)
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
- unsigned long seq, basejiff;
+ unsigned long basejiff;
+ unsigned int seq;
/* Read jiffies and the time when jiffies were updated last */
do {
@@ -904,8 +914,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
/*
* Boot safety: make sure the timekeeping duty has been
* assigned before entering dyntick-idle mode,
+ * i.e. tick_do_timer_cpu is no longer TICK_DO_TIMER_BOOT.
*/
- if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
+ return false;
+
+ /* Should not happen for nohz-full */
+ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
return false;
}
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 6de959a854b2..4fb06527cf64 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -24,12 +24,19 @@ enum tick_nohz_mode {
* struct tick_sched - sched tick emulation and no idle tick control/stats
* @sched_timer: hrtimer to schedule the periodic tick in high
* resolution mode
+ * @check_clocks: Notification mechanism about clocksource changes
+ * @nohz_mode: Mode - one state of tick_nohz_mode
+ * @inidle: Indicator that the CPU is in the tick idle mode
+ * @tick_stopped: Indicator that the idle tick has been stopped
+ * @idle_active: Indicator that the CPU is actively in the tick idle mode;
+ * it is reset during irq handling phases.
+ * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @got_idle_tick: Tick timer function has run with @inidle set
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
- * @tick_stopped: Indicator that the idle tick has been stopped
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
@@ -40,8 +47,8 @@ enum tick_nohz_mode {
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
* @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
* @timer_expires_base: Base time clock monotonic for @timer_expires
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
- * @got_idle_tick: Tick timer function has run with @inidle set
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
*/
struct tick_sched {
struct hrtimer sched_timer;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index c3f756f8534b..86656bbac232 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
static int firsttime = 1;
int error = 0;
- if (tv && !timespec64_valid(tv))
+ if (tv && !timespec64_valid_settod(tv))
return -EINVAL;
error = security_settime64(tv, tz);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f986e1918d12..5716e28bfa3c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
void ktime_get_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
ktime_t *offset = offsets[offs];
- unsigned long seq;
+ unsigned int seq;
ktime_t tconv;
do {
@@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void)
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
ktime_t base_raw;
ktime_t base_real;
u64 nsec_raw;
@@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
ktime_t base_real, base_raw;
u64 nsec_real, nsec_raw;
u8 cs_was_changed_seq;
- unsigned long seq;
+ unsigned int seq;
bool do_interp;
int ret;
@@ -1221,7 +1221,7 @@ int do_settimeofday64(const struct timespec64 *ts)
unsigned long flags;
int ret = 0;
- if (!timespec64_valid_strict(ts))
+ if (!timespec64_valid_settod(ts))
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1278,7 +1278,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts)
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tk), *ts);
if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
- !timespec64_valid_strict(&tmp)) {
+ !timespec64_valid_settod(&tmp)) {
ret = -EINVAL;
goto error;
}
@@ -1409,7 +1409,7 @@ int timekeeping_notify(struct clocksource *clock)
void ktime_get_raw_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
u64 nsecs;
do {
@@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64);
int timekeeping_valid_for_hres(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
int ret;
do {
@@ -1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void)
u64 timekeeping_max_deferment(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
u64 ret;
do {
@@ -1527,7 +1527,7 @@ void __init timekeeping_init(void)
unsigned long flags;
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
- if (timespec64_valid_strict(&wall_time) &&
+ if (timespec64_valid_settod(&wall_time) &&
timespec64_to_ns(&wall_time) > 0) {
persistent_clock_exists = true;
} else if (timespec64_to_ns(&wall_time) != 0) {
@@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64);
void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
do {
seq = read_seqcount_begin(&tk_core.seq);
@@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now, mono;
- unsigned long seq;
+ unsigned int seq;
do {
seq = read_seqcount_begin(&tk_core.seq);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2fce056f8a49..a9b1bbc2d88d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
hlist_add_head(&timer->entry, base->vectors + idx);
__set_bit(idx, base->pending_map);
timer_set_idx(timer, idx);
+
+ trace_timer_start(timer, timer->expires, timer->flags);
}
static void
@@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer)
trace_timer_init(timer);
}
-static inline void
-debug_activate(struct timer_list *timer, unsigned long expires)
-{
- debug_timer_activate(timer);
- trace_timer_start(timer, expires, timer->flags);
-}
-
static inline void debug_deactivate(struct timer_list *timer)
{
debug_timer_deactivate(timer);
@@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
}
- debug_activate(timer, expires);
+ debug_timer_activate(timer);
timer->expires = expires;
/*
@@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
}
forward_timer_base(base);
- debug_activate(timer, timer->expires);
+ debug_timer_activate(timer);
internal_add_timer(base, timer);
raw_spin_unlock_irqrestore(&base->lock, flags);
}
@@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(del_timer_sync);
#endif
-static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
+static void call_timer_fn(struct timer_list *timer,
+ void (*fn)(struct timer_list *),
+ unsigned long baseclk)
{
int count = preempt_count();
@@ -1321,7 +1318,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list
*/
lock_map_acquire(&lockdep_map);
- trace_timer_expire_entry(timer);
+ trace_timer_expire_entry(timer, baseclk);
fn(timer);
trace_timer_expire_exit(timer);
@@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
+ /*
+ * This value is required only for tracing. base->clk was
+ * incremented directly before expire_timers was called. But expiry
+ * is related to the old base->clk value.
+ */
+ unsigned long baseclk = base->clk - 1;
+
while (!hlist_empty(head)) {
struct timer_list *timer;
void (*fn)(struct timer_list *);
@@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
- call_timer_fn(timer, fn);
+ call_timer_fn(timer, fn, baseclk);
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn);
+ call_timer_fn(timer, fn, baseclk);
raw_spin_lock_irq(&base->lock);
}
}
diff --git a/kernel/torture.c b/kernel/torture.c
index 8faa1a9aaeb9..17b2be9bde12 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
return false;
+ if (num_online_cpus() <= 1)
+ return false; /* Can't offline the last CPU. */
if (verbose > 1)
pr_alert("%s" TORTURE_FLAG
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d64c00afceb5..94b0e37d90ef 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,8 @@
#include <linux/syscalls.h>
#include <linux/error-injection.h>
+#include <asm/tlb.h>
+
#include "trace_probe.h"
#include "trace.h"
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
* access_ok() should prevent writing to non-user memory, but in
* some situations (nommu, temporary switch, etc) access_ok() does
* not provide enough validation, hence the check on KERNEL_DS.
+ *
+ * nmi_uaccess_okay() ensures the probe is not run in an interim
+ * state, when the task or mm are switched. This is specifically
+ * required to prevent the use of temporary mm.
*/
if (unlikely(in_interrupt() ||
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM;
if (unlikely(uaccess_kernel()))
return -EPERM;
+ if (unlikely(!nmi_uaccess_okay()))
+ return -EPERM;
if (!access_ok(unsafe_ptr, size))
return -EPERM;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ca1ee656d6d8..ec439999f387 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
+static void ftrace_trace_userstack(struct ring_buffer *buffer,
+ unsigned long flags, int pc);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -2752,12 +2754,21 @@ trace_function(struct trace_array *tr,
#ifdef CONFIG_STACKTRACE
-#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
+/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
+#define FTRACE_KSTACK_NESTING 4
+
+#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING)
+
struct ftrace_stack {
- unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
+ unsigned long calls[FTRACE_KSTACK_ENTRIES];
+};
+
+
+struct ftrace_stacks {
+ struct ftrace_stack stacks[FTRACE_KSTACK_NESTING];
};
-static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
+static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
static void __ftrace_trace_stack(struct ring_buffer *buffer,
@@ -2766,13 +2777,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
{
struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
+ unsigned int size, nr_entries;
+ struct ftrace_stack *fstack;
struct stack_entry *entry;
- struct stack_trace trace;
- int use_stack;
- int size = FTRACE_STACK_ENTRIES;
-
- trace.nr_entries = 0;
- trace.skip = skip;
+ int stackidx;
/*
* Add one, for this function and the call to save_stack_trace()
@@ -2780,7 +2788,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
#ifndef CONFIG_UNWINDER_ORC
if (!regs)
- trace.skip++;
+ skip++;
#endif
/*
@@ -2791,53 +2799,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
preempt_disable_notrace();
- use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
+ stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
+
+ /* Should never happen. stackidx is 0-based, so NESTING itself is out of range: yell once and skip */
+ if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
+ goto out;
+
/*
- * We don't need any atomic variables, just a barrier.
- * If an interrupt comes in, we don't care, because it would
- * have exited and put the counter back to what we want.
- * We just need a barrier to keep gcc from moving things
- * around.
+ * The above __this_cpu_inc_return() is 'atomic' cpu local. An
+ * interrupt will either see the value pre increment or post
+ * increment. If the interrupt happens pre increment it will have
+ * restored the counter when it returns. We just need a barrier to
+ * keep gcc from moving things around.
*/
barrier();
- if (use_stack == 1) {
- trace.entries = this_cpu_ptr(ftrace_stack.calls);
- trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
- if (regs)
- save_stack_trace_regs(regs, &trace);
- else
- save_stack_trace(&trace);
-
- if (trace.nr_entries > size)
- size = trace.nr_entries;
- } else
- /* From now on, use_stack is a boolean */
- use_stack = 0;
+ fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx;
+ size = ARRAY_SIZE(fstack->calls);
- size *= sizeof(unsigned long);
+ if (regs) {
+ nr_entries = stack_trace_save_regs(regs, fstack->calls,
+ size, skip);
+ } else {
+ nr_entries = stack_trace_save(fstack->calls, size, skip);
+ }
+ size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
sizeof(*entry) + size, flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
- memset(&entry->caller, 0, size);
-
- if (use_stack)
- memcpy(&entry->caller, trace.entries,
- trace.nr_entries * sizeof(unsigned long));
- else {
- trace.max_entries = FTRACE_STACK_ENTRIES;
- trace.entries = entry->caller;
- if (regs)
- save_stack_trace_regs(regs, &trace);
- else
- save_stack_trace(&trace);
- }
-
- entry->size = trace.nr_entries;
+ memcpy(&entry->caller, fstack->calls, size);
+ entry->size = nr_entries;
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -2907,15 +2902,15 @@ void trace_dump_stack(int skip)
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
static DEFINE_PER_CPU(int, user_stack_count);
-void
+static void
ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
- struct stack_trace trace;
if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
@@ -2946,12 +2941,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
entry->tgid = current->tgid;
memset(&entry->caller, 0, sizeof(entry->caller));
- trace.nr_entries = 0;
- trace.max_entries = FTRACE_STACK_ENTRIES;
- trace.skip = 0;
- trace.entries = entry->caller;
-
- save_stack_trace_user(&trace);
+ stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -2960,13 +2950,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
out:
preempt_enable();
}
-
-#ifdef UNUSED
-static void __trace_userstack(struct trace_array *tr, unsigned long flags)
+#else /* CONFIG_USER_STACKTRACE_SUPPORT */
+static void ftrace_trace_userstack(struct ring_buffer *buffer,
+ unsigned long flags, int pc)
{
- ftrace_trace_userstack(tr, flags, preempt_count());
}
-#endif /* UNUSED */
+#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* CONFIG_STACKTRACE */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d80cee49e0eb..639047b259d7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -782,17 +782,9 @@ void update_max_tr_single(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef CONFIG_STACKTRACE
-void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
- int pc);
-
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
- unsigned long flags, int pc)
-{
-}
-
static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
int skip, int pc)
{
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4ad967453b6f..3ea65cdff30d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
void ftrace_likely_update(struct ftrace_likely_data *f, int val,
int expect, int is_constant)
{
+ unsigned long flags = user_access_save();
+
/* A constant is always correct */
if (is_constant) {
f->constant++;
@@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
f->data.correct++;
else
f->data.incorrect++;
+
+ user_access_restore(flags);
}
EXPORT_SYMBOL(ftrace_likely_update);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 795aa2038377..a1d20421f4b0 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5186,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
u64 var_ref_vals[TRACING_MAP_VARS_MAX];
char compound_key[HIST_KEY_SIZE_MAX];
struct tracing_map_elt *elt = NULL;
- struct stack_trace stacktrace;
struct hist_field *key_field;
u64 field_contents;
void *key = NULL;
@@ -5198,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
- stacktrace.max_entries = HIST_STACKTRACE_DEPTH;
- stacktrace.entries = entries;
- stacktrace.nr_entries = 0;
- stacktrace.skip = HIST_STACKTRACE_SKIP;
-
- memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE);
- save_stack_trace(&stacktrace);
-
+ memset(entries, 0, HIST_STACKTRACE_SIZE);
+ stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
key = entries;
} else {
field_contents = key_field->fn(key_field, elt, rbe, rec);
@@ -5246,7 +5240,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
unsigned int i;
for (i = 0; i < max_entries; i++) {
- if (stacktrace_entries[i] == ULONG_MAX)
+ if (!stacktrace_entries[i])
return;
seq_printf(m, "%*c", 1 + spaces, ' ');
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index eec648a0d673..5d16f73898db 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,44 +18,32 @@
#include "trace.h"
-static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
- { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
-unsigned stack_trace_index[STACK_TRACE_ENTRIES];
+#define STACK_TRACE_ENTRIES 500
-/*
- * Reserve one entry for the passed in ip. This will allow
- * us to remove most or all of the stack size overhead
- * added by the stack tracer itself.
- */
-struct stack_trace stack_trace_max = {
- .max_entries = STACK_TRACE_ENTRIES - 1,
- .entries = &stack_dump_trace[0],
-};
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
+static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
-unsigned long stack_trace_max_size;
-arch_spinlock_t stack_trace_max_lock =
+static unsigned int stack_trace_nr_entries;
+static unsigned long stack_trace_max_size;
+static arch_spinlock_t stack_trace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
-static int last_stack_tracer_enabled;
-void stack_trace_print(void)
+static void print_max_stack(void)
{
long i;
int size;
pr_emerg(" Depth Size Location (%d entries)\n"
" ----- ---- --------\n",
- stack_trace_max.nr_entries);
+ stack_trace_nr_entries);
- for (i = 0; i < stack_trace_max.nr_entries; i++) {
- if (stack_dump_trace[i] == ULONG_MAX)
- break;
- if (i+1 == stack_trace_max.nr_entries ||
- stack_dump_trace[i+1] == ULONG_MAX)
+ for (i = 0; i < stack_trace_nr_entries; i++) {
+ if (i + 1 == stack_trace_nr_entries)
size = stack_trace_index[i];
else
size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -65,16 +53,7 @@ void stack_trace_print(void)
}
}
-/*
- * When arch-specific code overrides this function, the following
- * data should be filled up, assuming stack_trace_max_lock is held to
- * prevent concurrent updates.
- * stack_trace_index[]
- * stack_trace_max
- * stack_trace_max_size
- */
-void __weak
-check_stack(unsigned long ip, unsigned long *stack)
+static void check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
@@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack)
stack_trace_max_size = this_size;
- stack_trace_max.nr_entries = 0;
- stack_trace_max.skip = 0;
-
- save_stack_trace(&stack_trace_max);
+ stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
+ ARRAY_SIZE(stack_dump_trace) - 1,
+ 0);
/* Skip over the overhead of the stack tracer itself */
- for (i = 0; i < stack_trace_max.nr_entries; i++) {
+ for (i = 0; i < stack_trace_nr_entries; i++) {
if (stack_dump_trace[i] == ip)
break;
}
@@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack)
* Some archs may not have the passed in ip in the dump.
* If that happens, we need to show everything.
*/
- if (i == stack_trace_max.nr_entries)
+ if (i == stack_trace_nr_entries)
i = 0;
/*
@@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack)
* loop will only happen once. This code only takes place
* on a new max, so it is far from a fast path.
*/
- while (i < stack_trace_max.nr_entries) {
+ while (i < stack_trace_nr_entries) {
int found = 0;
stack_trace_index[x] = this_size;
p = start;
- for (; p < top && i < stack_trace_max.nr_entries; p++) {
- if (stack_dump_trace[i] == ULONG_MAX)
- break;
+ for (; p < top && i < stack_trace_nr_entries; p++) {
/*
* The READ_ONCE_NOCHECK is used to let KASAN know that
* this is not a stack-out-of-bounds error.
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
- stack_trace_max.nr_entries = x;
- for (; x < i; x++)
- stack_dump_trace[x] = ULONG_MAX;
+ stack_trace_nr_entries = x;
if (task_stack_end_corrupted(current)) {
- stack_trace_print();
+ print_max_stack();
BUG();
}
@@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos)
{
long n = *pos - 1;
- if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ if (n >= stack_trace_nr_entries)
return NULL;
m->private = (void *)n;
@@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
- stack_trace_max.nr_entries);
+ stack_trace_nr_entries);
if (!stack_tracer_enabled && !stack_trace_max_size)
print_disabled(m);
@@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v)
i = *(long *)v;
- if (i >= stack_trace_max.nr_entries ||
- stack_dump_trace[i] == ULONG_MAX)
+ if (i >= stack_trace_nr_entries)
return 0;
- if (i+1 == stack_trace_max.nr_entries ||
- stack_dump_trace[i+1] == ULONG_MAX)
+ if (i + 1 == stack_trace_nr_entries)
size = stack_trace_index[i];
else
size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ int was_enabled;
int ret;
mutex_lock(&stack_sysctl_mutex);
+ was_enabled = !!stack_tracer_enabled;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret || !write ||
- (last_stack_tracer_enabled == !!stack_tracer_enabled))
+ if (ret || !write || (was_enabled == !!stack_tracer_enabled))
goto out;
- last_stack_tracer_enabled = !!stack_tracer_enabled;
-
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
else
unregister_ftrace_function(&trace_ops);
-
out:
mutex_unlock(&stack_sysctl_mutex);
return ret;
@@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str)
strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
stack_tracer_enabled = 1;
- last_stack_tracer_enabled = 1;
return 1;
}
__setup("stacktrace", enable_stacktrace);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6a5787233113..7f9e7b9306fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -590,7 +590,7 @@ static void lockup_detector_reconfigure(void)
* Create the watchdog thread infrastructure and configure the detector(s).
*
* The threads are not unparked as watchdog_allowed_mask is empty. When
- * the threads are sucessfully initialized, take the proper locks and
+ * the threads are successfully initialized, take the proper locks and
* unpark the threads in the watchdog_cpumask if the watchdog is enabled.
*/
static __init void lockup_detector_setup(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ddee541ea97a..56180c9286f5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool)
}
/**
- * wq_worker_waking_up - a worker is waking up
+ * wq_worker_running - a worker is running again
* @task: task waking up
- * @cpu: CPU @task is waking up to
*
- * This function is called during try_to_wake_up() when a worker is
- * being awoken.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
+ * This function is called when a worker returns from schedule().
*/
-void wq_worker_waking_up(struct task_struct *task, int cpu)
+void wq_worker_running(struct task_struct *task)
{
struct worker *worker = kthread_data(task);
- if (!(worker->flags & WORKER_NOT_RUNNING)) {
- WARN_ON_ONCE(worker->pool->cpu != cpu);
+ if (!worker->sleeping)
+ return;
+ if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&worker->pool->nr_running);
- }
+ worker->sleeping = 0;
}
/**
* wq_worker_sleeping - a worker is going to sleep
* @task: task going to sleep
*
- * This function is called during schedule() when a busy worker is
- * going to sleep. Worker on the same cpu can be woken up by
- * returning pointer to its task.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
- *
- * Return:
- * Worker task on @cpu to wake up, %NULL if none.
+ * This function is called from schedule() when a busy worker is
+ * going to sleep.
*/
-struct task_struct *wq_worker_sleeping(struct task_struct *task)
+void wq_worker_sleeping(struct task_struct *task)
{
- struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+ struct worker *next, *worker = kthread_data(task);
struct worker_pool *pool;
/*
@@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
* checking NOT_RUNNING.
*/
if (worker->flags & WORKER_NOT_RUNNING)
- return NULL;
+ return;
pool = worker->pool;
- /* this can only happen on the local cpu */
- if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
- return NULL;
+ if (WARN_ON_ONCE(worker->sleeping))
+ return;
+
+ worker->sleeping = 1;
+ spin_lock_irq(&pool->lock);
/*
* The counterpart of the following dec_and_test, implied mb,
@@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
* lock is safe.
*/
if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist))
- to_wakeup = first_idle_worker(pool);
- return to_wakeup ? to_wakeup->task : NULL;
+ !list_empty(&pool->worklist)) {
+ next = first_idle_worker(pool);
+ if (next)
+ wake_up_process(next->task);
+ }
+ spin_unlock_irq(&pool->lock);
}
/**
@@ -4929,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool)
*
* WRITE_ONCE() is necessary because @worker->flags may be
* tested without holding any lock in
- * wq_worker_waking_up(). Without it, NOT_RUNNING test may
+ * wq_worker_running(). Without it, NOT_RUNNING test may
* fail incorrectly leading to premature concurrency
* management operations.
*/
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index cb68b03ca89a..498de0e909a4 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -44,6 +44,7 @@ struct worker {
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
+ int sleeping; /* None */
/*
* Opaque string set with work_set_desc(). Printed out with task
@@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void)
* Scheduler hooks for concurrency managed workqueue. Only to be used from
* sched/ and workqueue.c.
*/
-void wq_worker_waking_up(struct task_struct *task, int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task);
+void wq_worker_running(struct task_struct *task);
+void wq_worker_sleeping(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */