summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/inode.c1
-rw-r--r--kernel/bpf/verifier.c5
-rw-r--r--kernel/cgroup/cgroup.c8
-rw-r--r--kernel/cpu.c512
-rw-r--r--kernel/events/core.c10
-rw-r--r--kernel/exit.c29
-rw-r--r--kernel/fork.c22
-rw-r--r--kernel/futex.c33
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/irq/cpuhotplug.c28
-rw-r--r--kernel/irq/generic-chip.c1
-rw-r--r--kernel/irq/irqdomain.c4
-rw-r--r--kernel/irq/manage.c21
-rw-r--r--kernel/kcmp.c2
-rw-r--r--kernel/livepatch/core.c60
-rw-r--r--kernel/locking/lockdep.c48
-rw-r--r--kernel/locking/rwsem-xadd.c27
-rw-r--r--kernel/memremap.c4
-rw-r--r--kernel/params.c35
-rw-r--r--kernel/power/suspend.c18
-rw-r--r--kernel/rcu/srcutree.c2
-rw-r--r--kernel/rcu/sync.c9
-rw-r--r--kernel/rcu/tree.c22
-rw-r--r--kernel/sched/core.c24
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c140
-rw-r--r--kernel/sched/features.h3
-rw-r--r--kernel/sched/membarrier.c34
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/smpboot.c25
-rw-r--r--kernel/sysctl.c27
-rw-r--r--kernel/trace/ftrace.c14
-rw-r--r--kernel/trace/trace_output.c21
-rw-r--r--kernel/trace/trace_sched_wakeup.c8
-rw-r--r--kernel/watchdog.c643
-rw-r--r--kernel/watchdog_hld.c196
37 files changed, 1120 insertions, 924 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
- u64 index = BPF_R3;
+ u32 index = BPF_R3;
if (unlikely(index >= array->map.max_entries))
goto out;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ out:
putname(pname);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
static void bpf_evict_inode(struct inode *inode)
{
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..8b8d6ba39e23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
{
struct bpf_verifier_state *parent = state->parent;
+ if (regno == BPF_REG_FP)
+ /* We don't need to worry about FP liveness because it's read-only */
+ return;
+
while (parent) {
/* if read wasn't screened by an earlier write ... */
if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -2345,6 +2349,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* copy register state to dest reg
*/
regs[insn->dst_reg] = regs[insn->src_reg];
+ regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..44857278eb8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2311,6 +2311,14 @@ out_release_tset:
list_del_init(&cset->mg_node);
}
spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Re-initialize the cgroup_taskset structure in case it is reused
+ * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+ * iteration.
+ */
+ tset->nr_tasks = 0;
+ tset->csets = &tset->src_csets;
return ret;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..d851df22f5c5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
@@ -46,11 +47,13 @@
* @bringup: Single callback bringup or teardown selector
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
- * @done: Signal completion to the issuer of the task
+ * @done_up: Signal completion to the issuer of the task for cpu-up
+ * @done_down: Signal completion to the issuer of the task for cpu-down
*/
struct cpuhp_cpu_state {
enum cpuhp_state state;
enum cpuhp_state target;
+ enum cpuhp_state fail;
#ifdef CONFIG_SMP
struct task_struct *thread;
bool should_run;
@@ -58,18 +61,39 @@ struct cpuhp_cpu_state {
bool single;
bool bringup;
struct hlist_node *node;
+ struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
- struct completion done;
+ struct completion done_up;
+ struct completion done_down;
#endif
};
-static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
+ .fail = CPUHP_INVALID,
+};
#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
-static struct lock_class_key cpuhp_state_key;
-static struct lockdep_map cpuhp_state_lock_map =
- STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+static struct lockdep_map cpuhp_state_up_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
+static struct lockdep_map cpuhp_state_down_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
+
+
+static void inline cpuhp_lock_acquire(bool bringup)
+{
+ lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+
+static void inline cpuhp_lock_release(bool bringup)
+{
+ lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+#else
+
+static void inline cpuhp_lock_acquire(bool bringup) { }
+static void inline cpuhp_lock_release(bool bringup) { }
+
#endif
/**
@@ -123,13 +147,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
/**
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
- * @step: The step in the state machine
+ * @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
+ * @node: For multi-instance, do a single entry callback for install/remove
+ * @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
- bool bringup, struct hlist_node *node)
+ bool bringup, struct hlist_node *node,
+ struct hlist_node **lastp)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct cpuhp_step *step = cpuhp_get_step(state);
@@ -137,7 +164,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;
+ if (st->fail == state) {
+ st->fail = CPUHP_INVALID;
+
+ if (!(bringup ? step->startup.single : step->teardown.single))
+ return 0;
+
+ return -EAGAIN;
+ }
+
if (!step->multi_instance) {
+ WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
if (!cb)
return 0;
@@ -152,6 +189,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* Single invocation for instance add/remove */
if (node) {
+ WARN_ON_ONCE(lastp && *lastp);
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +199,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* State transition. Invoke on all instances */
cnt = 0;
hlist_for_each(node, &step->list) {
+ if (lastp && node == *lastp)
+ break;
+
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
- if (ret)
- goto err;
+ if (ret) {
+ if (!lastp)
+ goto err;
+
+ *lastp = node;
+ return ret;
+ }
cnt++;
}
+ if (lastp)
+ *lastp = NULL;
return 0;
err:
/* Rollback the instances if one failed */
@@ -178,12 +226,39 @@ err:
hlist_for_each(node, &step->list) {
if (!cnt--)
break;
- cbm(cpu, node);
+
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ /*
+ * Rollback must not fail,
+ */
+ WARN_ON_ONCE(ret);
}
return ret;
}
#ifdef CONFIG_SMP
+static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ wait_for_completion(done);
+}
+
+static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ complete(done);
+}
+
+/*
+ * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+ return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -271,14 +346,79 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+static inline enum cpuhp_state
+cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+
+ st->rollback = false;
+ st->last = NULL;
+
+ st->target = target;
+ st->single = false;
+ st->bringup = st->state < target;
+
+ return prev_state;
+}
+
+static inline void
+cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+{
+ st->rollback = true;
+
+ /*
+ * If we have st->last we need to undo partial multi_instance of this
+ * state first. Otherwise start undo at the previous state.
+ */
+ if (!st->last) {
+ if (st->bringup)
+ st->state--;
+ else
+ st->state++;
+ }
+
+ st->target = prev_state;
+ st->bringup = !st->bringup;
+}
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
+{
+ if (!st->single && st->state == st->target)
+ return;
+
+ st->result = 0;
+ /*
+ * Make sure the above stores are visible before should_run becomes
+ * true. Paired with the mb() above in cpuhp_thread_fun()
+ */
+ smp_mb();
+ st->should_run = true;
+ wake_up_process(st->thread);
+ wait_for_ap_thread(st, st->bringup);
+}
+
+static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state;
+ int ret;
+
+ prev_state = cpuhp_set_state(st, target);
+ __cpuhp_kick_ap(st);
+ if ((ret = st->result)) {
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
+ }
+
+ return ret;
+}
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, true);
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
@@ -286,12 +426,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE) {
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- }
- return st->result;
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(st, st->target);
}
static int bringup_cpu(unsigned int cpu)
@@ -317,32 +455,6 @@ static int bringup_cpu(unsigned int cpu)
/*
* Hotplug state machine related functions
*/
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- for (st->state++; st->state < st->target; st->state++) {
- struct cpuhp_step *step = cpuhp_get_step(st->state);
-
- if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
- }
-}
-
-static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
-{
- enum cpuhp_state prev_state = st->state;
- int ret = 0;
-
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
- if (ret) {
- st->target = prev_state;
- undo_cpu_down(cpu, st);
- break;
- }
- }
- return ret;
-}
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
@@ -350,7 +462,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
}
@@ -362,7 +474,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
while (st->state < target) {
st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
st->target = prev_state;
undo_cpu_up(cpu, st);
@@ -379,7 +491,8 @@ static void cpuhp_create(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- init_completion(&st->done);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
}
static int cpuhp_should_run(unsigned int cpu)
@@ -389,69 +502,90 @@ static int cpuhp_should_run(unsigned int cpu)
return st->should_run;
}
-/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
-static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
-
- return cpuhp_down_callbacks(cpu, st, target);
-}
-
-/* Execute the online startup callbacks. Used to be CPU_ONLINE */
-static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- return cpuhp_up_callbacks(cpu, st, st->target);
-}
-
/*
* Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
* callbacks when a state gets [un]installed at runtime.
+ *
+ * Each invocation of this function by the smpboot thread does a single AP
+ * state callback.
+ *
+ * It has 3 modes of operation:
+ * - single: runs st->cb_state
+ * - up: runs ++st->state, while st->state < st->target
+ * - down: runs st->state--, while st->state > st->target
+ *
+ * When complete or on error, should_run is cleared and the completion is fired.
*/
static void cpuhp_thread_fun(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- int ret = 0;
+ bool bringup = st->bringup;
+ enum cpuhp_state state;
/*
- * Paired with the mb() in cpuhp_kick_ap_work and
- * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+ * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
+ * that if we see ->should_run we also see the rest of the state.
*/
smp_mb();
- if (!st->should_run)
+
+ if (WARN_ON_ONCE(!st->should_run))
return;
- st->should_run = false;
+ cpuhp_lock_acquire(bringup);
- lock_map_acquire(&cpuhp_state_lock_map);
- /* Single callback invocation for [un]install ? */
if (st->single) {
- if (st->cb_state < CPUHP_AP_ONLINE) {
- local_irq_disable();
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
- local_irq_enable();
+ state = st->cb_state;
+ st->should_run = false;
+ } else {
+ if (bringup) {
+ st->state++;
+ state = st->state;
+ st->should_run = (st->state < st->target);
+ WARN_ON_ONCE(st->state > st->target);
} else {
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
+ state = st->state;
+ st->state--;
+ st->should_run = (st->state > st->target);
+ WARN_ON_ONCE(st->state < st->target);
}
- } else if (st->rollback) {
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+ }
+
+ WARN_ON_ONCE(!cpuhp_is_ap_state(state));
+
+ if (st->rollback) {
+ struct cpuhp_step *step = cpuhp_get_step(state);
+ if (step->skip_onerr)
+ goto next;
+ }
+
+ if (cpuhp_is_atomic_state(state)) {
+ local_irq_disable();
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ local_irq_enable();
- undo_cpu_down(cpu, st);
- st->rollback = false;
+ /*
+ * STARTING/DYING must not fail!
+ */
+ WARN_ON_ONCE(st->result);
} else {
- /* Cannot happen .... */
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
-
- /* Regular hotplug work */
- if (st->state < st->target)
- ret = cpuhp_ap_online(cpu, st);
- else if (st->state > st->target)
- ret = cpuhp_ap_offline(cpu, st);
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ }
+
+ if (st->result) {
+ /*
+ * If we fail on a rollback, we're up a creek without no
+ * paddle, no way forward, no way back. We loose, thanks for
+ * playing.
+ */
+ WARN_ON_ONCE(st->rollback);
+ st->should_run = false;
}
- lock_map_release(&cpuhp_state_lock_map);
- st->result = ret;
- complete(&st->done);
+
+next:
+ cpuhp_lock_release(bringup);
+
+ if (!st->should_run)
+ complete_ap_thread(st, bringup);
}
/* Invoke a single callback on a remote cpu */
@@ -460,62 +594,64 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
if (!cpu_online(cpu))
return 0;
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
/*
* If we are up and running, use the hotplug thread. For early calls
* we invoke the thread function directly.
*/
if (!st->thread)
- return cpuhp_invoke_callback(cpu, state, bringup, node);
+ return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+
+ st->rollback = false;
+ st->last = NULL;
+ st->node = node;
+ st->bringup = bringup;
st->cb_state = state;
st->single = true;
- st->bringup = bringup;
- st->node = node;
- /*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
- */
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
- wait_for_completion(&st->done);
- return st->result;
-}
+ __cpuhp_kick_ap(st);
-/* Regular hotplug invocation of the AP hotplug thread */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
-{
- st->result = 0;
- st->single = false;
/*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
+ * If we failed and did a partial, do a rollback.
*/
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
+ if ((ret = st->result) && st->last) {
+ st->rollback = true;
+ st->bringup = !bringup;
+
+ __cpuhp_kick_ap(st);
+ }
+
+ return ret;
}
static int cpuhp_kick_ap_work(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- enum cpuhp_state state = st->state;
+ enum cpuhp_state prev_state = st->state;
+ int ret;
+
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
- trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- trace_cpuhp_exit(cpu, st->state, state, st->result);
- return st->result;
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
+
+ trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
+ ret = cpuhp_kick_ap(st, st->target);
+ trace_cpuhp_exit(cpu, st->state, prev_state, ret);
+
+ return ret;
}
static struct smp_hotplug_thread cpuhp_threads = {
@@ -581,6 +717,7 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
+ int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -594,8 +731,13 @@ static int take_cpu_down(void *_param)
WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
st->state--;
/* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ /*
+ * DYING must not fail!
+ */
+ WARN_ON_ONCE(ret);
+ }
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -639,7 +781,7 @@ static int takedown_cpu(unsigned int cpu)
*
* Wait for the stop thread to go away.
*/
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, false);
BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
/* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -658,7 +800,7 @@ static void cpuhp_complete_idle_dead(void *arg)
{
struct cpuhp_cpu_state *st = arg;
- complete(&st->done);
+ complete_ap_thread(st, false);
}
void cpuhp_report_idle_dead(void)
@@ -676,11 +818,32 @@ void cpuhp_report_idle_dead(void)
cpuhp_complete_idle_dead, st, 0);
}
-#else
-#define takedown_cpu NULL
-#endif
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+ for (st->state++; st->state < st->target; st->state++) {
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
-#ifdef CONFIG_HOTPLUG_CPU
+ if (!step->skip_onerr)
+ cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ }
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ if (ret) {
+ st->target = prev_state;
+ undo_cpu_down(cpu, st);
+ break;
+ }
+ }
+ return ret;
+}
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -699,13 +862,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = st->state;
- st->target = target;
+ prev_state = cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
*/
if (st->state > CPUHP_TEARDOWN_CPU) {
+ st->target = max((int)target, CPUHP_TEARDOWN_CPU);
ret = cpuhp_kick_ap_work(cpu);
/*
* The AP side has done the error rollback already. Just
@@ -720,6 +883,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
if (st->state > CPUHP_TEARDOWN_CPU)
goto out;
+
+ st->target = target;
}
/*
* The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -727,13 +892,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
- st->target = prev_state;
- st->rollback = true;
- cpuhp_kick_ap_work(cpu);
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
}
out:
cpus_write_unlock();
+ /*
+ * Do post unplug cleanup. This is still protected against
+ * concurrent CPU hotplug via cpu_add_remove_lock.
+ */
+ lockup_detector_cleanup();
return ret;
}
@@ -754,11 +923,15 @@ out:
cpu_maps_update_done();
return err;
}
+
int cpu_down(unsigned int cpu)
{
return do_cpu_down(cpu, CPUHP_OFFLINE);
}
EXPORT_SYMBOL(cpu_down);
+
+#else
+#define takedown_cpu NULL
#endif /*CONFIG_HOTPLUG_CPU*/
/**
@@ -772,11 +945,16 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+ int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
while (st->state < target) {
st->state++;
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ /*
+ * STARTING must not fail!
+ */
+ WARN_ON_ONCE(ret);
}
}
@@ -794,7 +972,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
- complete(&st->done);
+ complete_ap_thread(st, true);
}
/* Requires cpu_add_remove_lock to be held */
@@ -829,7 +1007,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
cpuhp_tasks_frozen = tasks_frozen;
- st->target = target;
+ cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1296,6 +1474,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
struct cpuhp_step *sp = cpuhp_get_step(state);
int ret;
+ /*
+ * If there's nothing to do, we done.
+ * Relies on the union for multi_instance.
+ */
if ((bringup && !sp->startup.single) ||
(!bringup && !sp->teardown.single))
return 0;
@@ -1307,9 +1489,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
if (cpuhp_is_ap_state(state))
ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
BUG_ON(ret && !bringup);
return ret;
@@ -1641,9 +1823,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
}
static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
+
+static ssize_t write_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+ struct cpuhp_step *sp;
+ int fail, ret;
+
+ ret = kstrtoint(buf, 10, &fail);
+ if (ret)
+ return ret;
+
+ /*
+ * Cannot fail STARTING/DYING callbacks.
+ */
+ if (cpuhp_is_atomic_state(fail))
+ return -EINVAL;
+
+ /*
+ * Cannot fail anything that doesn't have callbacks.
+ */
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(fail);
+ if (!sp->startup.single && !sp->teardown.single)
+ ret = -EINVAL;
+ mutex_unlock(&cpuhp_state_mutex);
+ if (ret)
+ return ret;
+
+ st->fail = fail;
+
+ return count;
+}
+
+static ssize_t show_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->fail);
+}
+
+static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
&dev_attr_target.attr,
+ &dev_attr_fail.attr,
NULL
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5ee62714f9a6..04989fb769f0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
- if (cgrp == event->cgrp)
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
@@ -8955,6 +8955,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
static void free_pmu_context(struct pmu *pmu)
{
+ /*
+ * Static contexts such as perf_sw_context have a global lifetime
+ * and may be shared between different PMUs. Avoid freeing them
+ * when a single PMU is going away.
+ */
+ if (pmu->task_ctx_nr > perf_invalid_context)
+ return;
+
mutex_lock(&pmus_lock);
free_percpu(pmu->pmu_cpu_context);
mutex_unlock(&pmus_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index 3481ababd06a..cf28528842bc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1600,18 +1600,19 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
int signo = 0;
+
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err) {
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
if (!infop)
return err;
+ if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+ goto Efault;
+
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
@@ -1723,21 +1724,23 @@ COMPAT_SYSCALL_DEFINE5(waitid,
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err && uru) {
- /* kernel_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- err = copy_to_user(uru, &ru, sizeof(ru));
- else
- err = put_compat_rusage(&ru, uru);
- if (err)
- return -EFAULT;
+ if (uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
}
if (!infop)
return err;
+ if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+ goto Efault;
+
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
+#ifdef CONFIG_DEBUG_KMEMLEAK
+ /* Clear stale pointers from reused stack. */
+ memset(s->addr, 0, THREAD_SIZE);
+#endif
tsk->stack_vm_area = s;
return s->addr;
}
@@ -946,6 +950,24 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct,
+ async_put_work);
+
+ __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+ if (atomic_dec_and_test(&mm->mm_users)) {
+ INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+#endif
+
/**
* set_mm_exe_file - change a reference to the mm's executable file
*
diff --git a/kernel/futex.c b/kernel/futex.c
index 3d38eaf05492..0518a0bfc746 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
- *
- * Must be called with the hb lock held.
*/
static void put_pi_state(struct futex_pi_state *pi_state)
{
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ struct task_struct *owner;
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ owner = pi_state->owner;
+ if (owner) {
+ raw_spin_lock(&owner->pi_lock);
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&owner->pi_lock);
+ }
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
}
- if (current->pi_state_cache)
+ if (current->pi_state_cache) {
kfree(pi_state);
- else {
+ } else {
/*
* pi_state->list is already empty.
* clear pi_state->owner.
@@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
spin_lock(&hb->lock);
-
- raw_spin_lock_irq(&curr->pi_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
/*
* We dropped the pi-lock, so re-check whether this
* task still owns the PI-state:
*/
if (head->next != next) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
continue;
}
@@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr)
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
- raw_spin_unlock_irq(&curr->pi_lock);
+ raw_spin_unlock(&curr->pi_lock);
get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
pi_state->owner = p;
raw_spin_unlock_irq(&p->pi_lock);
@@ -2878,6 +2888,7 @@ retry:
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ /* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state);
put_pi_state(pi_state);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd93824..5a2ef92c2782 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
+ irq_do_set_affinity(d, aff, false);
ret = __irq_startup(desc);
- irq_set_affinity_locked(d, aff, false);
break;
case IRQ_STARTUP_ABORT:
return 0;
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
static inline bool irq_needs_fixup(struct irq_data *d)
{
const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+ unsigned int cpu = smp_processor_id();
- return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ /*
+ * The cpumask_empty() check is a workaround for interrupt chips,
+ * which do not implement effective affinity, but the architecture has
+ * enabled the config switch. Use the general affinity mask instead.
+ */
+ if (cpumask_empty(m))
+ m = irq_data_get_affinity_mask(d);
+
+ /*
+ * Sanity check. If the mask is not empty when excluding the outgoing
+ * CPU then it must contain at least one online CPU. The outgoing CPU
+ * has been removed from the online mask already.
+ */
+ if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+ cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+ /*
+ * If this happens then there was a missed IRQ fixup at some
+ * point. Warn about it and enforce fixup.
+ */
+ pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+ cpumask_pr_args(m), d->irq, cpu);
+ return true;
+ }
+#endif
+ return cpumask_test_cpu(cpu, m);
}
static bool migrate_one_irq(struct irq_desc *desc)
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f7086b78ad6e..5270a54b9fa4 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
/* Calc pointer to the next generic chip */
tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
}
- d->name = name;
return 0;
}
EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e84b7056bb08..ac4644e92b49 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
struct irq_desc *desc;
struct irq_domain *domain;
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
int i;
seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -1453,7 +1453,7 @@ out_free_desc:
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
{
- void **slot;
+ void __rcu **slot;
if (d->hwirq < d->domain->revmap_size)
return; /* Not using radix tree. */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 573dc52b0806..4bff6a10ae8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
}
+static void irq_validate_effective_affinity(struct irq_data *data)
+{
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+ if (!cpumask_empty(m))
+ return;
+ pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+ chip->name, data->irq);
+#endif
+}
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_chip *chip = irq_data_get_irq_chip(data);
int ret;
+ if (!chip || !chip->irq_set_affinity)
+ return -EINVAL;
+
ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
cpumask_copy(desc->irq_common_data.affinity, mask);
case IRQ_SET_MASK_OK_NOCOPY:
+ irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
ret = 0;
}
@@ -1643,6 +1660,10 @@ const void *free_irq(unsigned int irq, void *dev_id)
#endif
action = __free_irq(irq, dev_id);
+
+ if (!action)
+ return NULL;
+
devname = action->name;
kfree(action);
return devname;
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..055bb2962a0b 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
if (filp_epoll) {
filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
fput(filp_epoll);
- } else
+ }
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..bf8c8fd72589 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch)
}
EXPORT_SYMBOL_GPL(klp_register_patch);
+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+ struct klp_patch *limit)
+{
+ struct klp_patch *patch;
+ struct klp_object *obj;
+
+ list_for_each_entry(patch, &klp_patches, list) {
+ if (patch == limit)
+ break;
+
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+ continue;
+
+ /*
+ * Only unpatch the module if the patch is enabled or
+ * is in transition.
+ */
+ if (patch->enabled || patch == klp_transition_patch) {
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+ klp_unpatch_object(obj);
+ }
+
+ klp_free_object_loaded(obj);
+ break;
+ }
+ }
+}
+
int klp_module_coming(struct module *mod)
{
int ret;
@@ -894,7 +929,7 @@ err:
pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
patch->mod->name, obj->mod->name, obj->mod->name);
mod->klp_alive = false;
- klp_free_object_loaded(obj);
+ klp_cleanup_module_patches_limited(mod, patch);
mutex_unlock(&klp_mutex);
return ret;
@@ -902,9 +937,6 @@ err:
void klp_module_going(struct module *mod)
{
- struct klp_patch *patch;
- struct klp_object *obj;
-
if (WARN_ON(mod->state != MODULE_STATE_GOING &&
mod->state != MODULE_STATE_COMING))
return;
@@ -917,25 +949,7 @@ void klp_module_going(struct module *mod)
*/
mod->klp_alive = false;
- list_for_each_entry(patch, &klp_patches, list) {
- klp_for_each_object(patch, obj) {
- if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
- continue;
-
- /*
- * Only unpatch the module if the patch is enabled or
- * is in transition.
- */
- if (patch->enabled || patch == klp_transition_patch) {
- pr_notice("reverting patch '%s' on unloading module '%s'\n",
- patch->mod->name, obj->mod->name);
- klp_unpatch_object(obj);
- }
-
- klp_free_object_loaded(obj);
- break;
- }
- }
+ klp_cleanup_module_patches_limited(mod, NULL);
mutex_unlock(&klp_mutex);
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 44c8d0d17170..e36e652d996f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next, int distance, struct stack_trace *trace,
int (*save)(struct stack_trace *trace))
{
+ struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
- int ret;
struct lock_list this;
- struct lock_list *uninitialized_var(target_entry);
+ int ret;
/*
* Prove that the new <prev> -> <next> dependency would not
@@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
this.class = hlock_class(next);
this.parent = NULL;
ret = check_noncircular(&this, hlock_class(prev), &target_entry);
- if (unlikely(!ret))
+ if (unlikely(!ret)) {
+ if (!trace->entries) {
+ /*
+ * If @save fails here, the printing might trigger
+ * a WARN but because of the !nr_entries it should
+ * not do bad things.
+ */
+ save(trace);
+ }
return print_circular_bug(&this, target_entry, next, prev, trace);
+ }
else if (unlikely(ret < 0))
return print_bfs_bug(ret);
@@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return print_bfs_bug(ret);
- if (save && !save(trace))
+ if (!trace->entries && !save(trace))
return 0;
/*
@@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (!ret)
return 0;
- /*
- * Debugging printouts:
- */
- if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
- graph_unlock();
- printk("\n new dependency: ");
- print_lock_name(hlock_class(prev));
- printk(KERN_CONT " => ");
- print_lock_name(hlock_class(next));
- printk(KERN_CONT "\n");
- dump_stack();
- if (!graph_lock())
- return 0;
- }
return 2;
}
@@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
struct held_lock *hlock;
- struct stack_trace trace;
- int (*save)(struct stack_trace *trace) = save_trace;
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .max_entries = 0,
+ .entries = NULL,
+ .skip = 0,
+ };
/*
* Debugging checks.
@@ -2018,18 +2017,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
*/
if (hlock->read != 2 && hlock->check) {
int ret = check_prev_add(curr, hlock, next,
- distance, &trace, save);
+ distance, &trace, save_trace);
if (!ret)
return 0;
/*
- * Stop saving stack_trace if save_trace() was
- * called at least once:
- */
- if (save && ret == 2)
- save = NULL;
-
- /*
* Stop after the first non-trylock entry,
* as non-trylock entries have added their
* own direct dependencies already, so this
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f660666ab8..1fefe6dcafd7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
DEFINE_WAKE_Q(wake_q);
/*
+ * __rwsem_down_write_failed_common(sem)
+ * rwsem_optimistic_spin(sem)
+ * osq_unlock(sem->osq)
+ * ...
+ * atomic_long_add_return(&sem->count)
+ *
+ * - VS -
+ *
+ * __up_write()
+ * if (atomic_long_sub_return_release(&sem->count) < 0)
+ * rwsem_wake(sem)
+ * osq_is_locked(&sem->osq)
+ *
+ * And __up_write() must observe !osq_is_locked() when it observes the
+ * atomic_long_add_return() in order to not miss a wakeup.
+ *
+ * This boils down to:
+ *
+ * [S.rel] X = 1 [RmW] r0 = (Y += 0)
+ * MB RMB
+ * [RmW] Y += 1 [L] r1 = X
+ *
+ * exists (r0=1 /\ r1=0)
+ */
+ smp_rmb();
+
+ /*
* If a spinner is present, it is not necessary to do the wakeup.
* Try to do wakeup only if the trylock succeeds to minimize
* spinlock contention which may introduce too much delay in the
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6bcbfbf1a8fd..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
pgprot_t pgprot = PAGE_KERNEL;
struct dev_pagemap *pgmap;
struct page_map *page_map;
- int error, nid, is_ram;
+ int error, nid, is_ram, i = 0;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
list_del(&page->lru);
page->pgmap = pgmap;
percpu_ref_get(ref);
+ if (!(++i % 1024))
+ cond_resched();
}
devres_add(dev, page_map);
return __va(res->start);
diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
} \
int param_get_##name(char *buffer, const struct kernel_param *kp) \
{ \
- return scnprintf(buffer, PAGE_SIZE, format, \
+ return scnprintf(buffer, PAGE_SIZE, format "\n", \
*((type *)kp->arg)); \
} \
const struct kernel_param_ops param_ops_##name = { \
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
int param_get_charp(char *buffer, const struct kernel_param *kp)
{
- return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
}
EXPORT_SYMBOL(param_get_charp);
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
int param_get_bool(char *buffer, const struct kernel_param *kp)
{
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
}
EXPORT_SYMBOL(param_get_bool);
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
int param_get_invbool(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
}
EXPORT_SYMBOL(param_get_invbool);
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
struct kernel_param p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ /* Replace \n with comma */
if (i)
- buffer[off++] = ',';
+ buffer[off - 1] = ',';
p.arg = arr->elem + arr->elemsize * i;
check_kparam_locked(p.mod);
ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
int param_get_string(char *buffer, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- return strlcpy(buffer, kps->string, kps->maxlen);
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
}
EXPORT_SYMBOL(param_get_string);
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
kernel_param_lock(mk->mod);
count = attribute->param->ops->get(buf, attribute->param);
kernel_param_unlock(mk->mod);
- if (count > 0) {
- strcat(buf, "\n");
- ++count;
- }
return count;
}
@@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
/*
* add_sysfs_param - add a parameter to sysfs
* @mk: struct module_kobject
- * @kparam: the actual parameter definition to add to sysfs
+ * @kp: the actual parameter definition to add to sysfs
* @name: name of parameter
*
* Create a kobject if for a (per-module) parameter if mp NULL, and
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3e2b4f519009..ccd2d20e6b06 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -120,22 +120,26 @@ static void s2idle_loop(void)
* frozen processes + suspended devices + idle processors.
* Thus s2idle_enter() should be called right after
* all devices have been suspended.
+ *
+ * Wakeups during the noirq suspend of devices may be spurious,
+ * so prevent them from terminating the loop right away.
*/
error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
if (!error)
s2idle_enter();
+ else if (error == -EBUSY && pm_wakeup_pending())
+ error = 0;
- dpm_noirq_resume_devices(PMSG_RESUME);
- if (error && (error != -EBUSY || !pm_wakeup_pending())) {
- dpm_noirq_end();
- break;
- }
-
- if (s2idle_ops && s2idle_ops->wake)
+ if (!error && s2idle_ops && s2idle_ops->wake)
s2idle_ops->wake();
+ dpm_noirq_resume_devices(PMSG_RESUME);
+
dpm_noirq_end();
+ if (error)
+ break;
+
if (s2idle_ops && s2idle_ops->sync)
s2idle_ops->sync();
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
/**
* call_srcu() - Queue a callback for invocation after an SRCU grace period
* @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
* @func: function to be invoked after the SRCU grace period
*
* The callback function will be invoked some time after a full SRCU
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
}
/**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
* Must be called after rcu_sync_init() and before first use.
*
* Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
/**
* rcu_sync_func() - Callback function managing reader access to fastpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
*
* This function is passed to one of the call_rcu() functions by
* rcu_sync_exit(), so that it is invoked after a grace period following the
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* rcu_sync_exit(). Otherwise, set all state back to idle so that readers
* can again use their fastpaths.
*/
-static void rcu_sync_func(struct rcu_head *rcu)
+static void rcu_sync_func(struct rcu_head *rhp)
{
- struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+ struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
unsigned long flags;
BUG_ON(rsp->gp_state != GP_PASSED);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0c44c7b42e6d..3e3650e94ae6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -884,7 +884,7 @@ void rcu_irq_exit(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
/* Page faults can happen in NMI handlers, so check... */
- if (READ_ONCE(rdtp->dynticks_nmi_nesting))
+ if (rdtp->dynticks_nmi_nesting)
return;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -1022,7 +1022,7 @@ void rcu_irq_enter(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
/* Page faults can happen in NMI handlers, so check... */
- if (READ_ONCE(rdtp->dynticks_nmi_nesting))
+ if (rdtp->dynticks_nmi_nesting)
return;
oldval = rdtp->dynticks_nesting;
@@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
* read-side critical sections have completed. call_rcu_sched() assumes
* that the read-side critical sections end on enabling of preemption
* or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
- * - anything that disables preemption.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
*
* These may be nested.
*
@@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
- * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
+ * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ *
+ * These may be nested.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..d17c5da523a0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p)
put_task_stack(p);
}
+static inline bool
+state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(p->state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ * TASK_KILLABLE).
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ return false;
+
+ return true;
+}
+
+
void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
@@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter)
*/
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
- if (!state_filter || (p->state & state_filter))
+ if (state_filter_match(state_filter, p))
sched_show_task(p);
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 01217fb5a5de..2f93e4a2d9f6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg)
}
#endif
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
return 1;
}
-struct llc_stats {
- unsigned long nr_running;
- unsigned long load;
- unsigned long capacity;
- int has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ * scheduling latency of the CPUs. This seems to work
+ * for the overloaded case.
+ */
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
{
- struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
- if (!sds)
- return false;
+ if (idle_cpu(this_cpu))
+ return true;
- stats->nr_running = READ_ONCE(sds->nr_running);
- stats->load = READ_ONCE(sds->load);
- stats->capacity = READ_ONCE(sds->capacity);
- stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+ if (sync && cpu_rq(this_cpu)->nr_running == 1)
+ return true;
- return true;
+ return false;
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
{
- struct llc_stats prev_stats, this_stats;
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- if (!get_llc_stats(&prev_stats, prev_cpu) ||
- !get_llc_stats(&this_stats, this_cpu))
- return false;
+ this_eff_load = target_load(this_cpu, sd->wake_idx);
+ prev_eff_load = source_load(prev_cpu, sd->wake_idx);
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current LLC.
- */
if (sync) {
unsigned long current_load = task_h_load(current);
- /* in this case load hits 0 and this LLC is considered 'idle' */
- if (current_load > this_stats.load)
+ if (current_load > this_eff_load)
return true;
- this_stats.load -= current_load;
+ this_eff_load -= current_load;
}
- /*
- * The has_capacity stuff is not SMT aware, but by trying to balance
- * the nr_running on both ends we try and fill the domain at equal
- * rates, thereby first consuming cores before siblings.
- */
-
- /* if the old cache has capacity, stay there */
- if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
- return false;
-
- /* if this cache has capacity, come here */
- if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
- return true;
-
- /*
- * Check to see if we can move the load without causing too much
- * imbalance.
- */
task_load = task_h_load(p);
- this_eff_load = 100;
- this_eff_load *= prev_stats.capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_stats.capacity;
+ this_eff_load += task_load;
+ if (sched_feat(WA_BIAS))
+ this_eff_load *= 100;
+ this_eff_load *= capacity_of(prev_cpu);
- this_eff_load *= this_stats.load + task_load;
- prev_eff_load *= prev_stats.load - task_load;
+ prev_eff_load -= task_load;
+ if (sched_feat(WA_BIAS))
+ prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
return this_eff_load <= prev_eff_load;
}
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine;
+ bool affine = false;
- /*
- * Default to no affine wakeups; wake_affine() should not effect a task
- * placement the load-balancer feels inclined to undo. The conservative
- * option is therefore to not move tasks when they wake up.
- */
- affine = false;
+ if (sched_feat(WA_IDLE) && !affine)
+ affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
- /*
- * If the wakeup is across cache domains, try to evaluate if movement
- * makes sense, otherwise rely on select_idle_siblings() to do
- * placement inside the cache domain.
- */
- if (!cpus_share_cache(prev_cpu, this_cpu))
- affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_WEIGHT) && !affine)
+ affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
-
- if (!shared)
- return;
-
- /*
- * Since these are sums over groups they can contain some CPUs
- * multiple times for the NUMA domains.
- *
- * Currently only wake_affine_llc() and find_busiest_group()
- * uses these numbers, only the last is affected by this problem.
- *
- * XXX fix that.
- */
- WRITE_ONCE(shared->nr_running, sds->total_running);
- WRITE_ONCE(shared->load, sds->total_load);
- WRITE_ONCE(shared->capacity, sds->total_capacity);
}
/**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
int cpu, balance_cpu = -1;
/*
+ * Ensure the balancing environment is consistent; can happen
+ * when the softirq triggers 'during' hotplug.
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+ return 0;
+
+ /*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a92fddc22747..dd7908743dab 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,7 @@
#include <linux/membarrier.h>
#include <linux/tick.h>
#include <linux/cpumask.h>
+#include <linux/atomic.h>
#include "sched.h" /* for cpu_rq(). */
@@ -26,21 +27,26 @@
* except MEMBARRIER_CMD_QUERY.
*/
#define MEMBARRIER_CMD_BITMASK \
- (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+ (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
}
-static void membarrier_private_expedited(void)
+static int membarrier_private_expedited(void)
{
int cpu;
bool fallback = false;
cpumask_var_t tmpmask;
+ if (!(atomic_read(&current->mm->membarrier_state)
+ & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+ return -EPERM;
+
if (num_online_cpus() == 1)
- return;
+ return 0;
/*
* Matches memory barriers around rq->curr modification in
@@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
+ return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+
+ /*
+ * We need to consider threads belonging to different thread
+ * groups, which use the same mm. (CLONE_VM but not
+ * CLONE_THREAD).
+ */
+ if (atomic_read(&mm->membarrier_state)
+ & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+ return;
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ &mm->membarrier_state);
}
/**
@@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
synchronize_sched();
return 0;
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- membarrier_private_expedited();
+ return membarrier_private_expedited();
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ membarrier_register_private_expedited();
return 0;
default:
return -EINVAL;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bb3a38005b9c..0ae832e13b97 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags,
return 0;
}
-void __get_seccomp_filter(struct seccomp_filter *filter)
+static void __get_seccomp_filter(struct seccomp_filter *filter)
{
/* Reference count is bounded by the number of total processes. */
refcount_inc(&filter->usage);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1d71c051a951..5043e7433f4b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
* by the client, but only by calling this function.
* This function can only be called on a registered smp_hotplug_thread.
*/
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
- const struct cpumask *new)
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *new)
{
struct cpumask *old = plug_thread->cpumask;
- cpumask_var_t tmp;
+ static struct cpumask tmp;
unsigned int cpu;
- if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
- return -ENOMEM;
-
- get_online_cpus();
+ lockdep_assert_cpus_held();
mutex_lock(&smpboot_threads_lock);
/* Park threads that were exclusively enabled on the old mask. */
- cpumask_andnot(tmp, old, new);
- for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ cpumask_andnot(&tmp, old, new);
+ for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_park_thread(plug_thread, cpu);
/* Unpark threads that are exclusively enabled on the new mask. */
- cpumask_andnot(tmp, new, old);
- for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ cpumask_andnot(&tmp, new, old);
+ for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_unpark_thread(plug_thread, cpu);
cpumask_copy(old, new);
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
-
- free_cpumask_var(tmp);
-
- return 0;
}
-EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..d9c31bc2eaea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
#ifdef CONFIG_SCHEDSTATS
{
@@ -871,9 +872,9 @@ static struct ctl_table kern_table[] = {
#if defined(CONFIG_LOCKUP_DETECTOR)
{
.procname = "watchdog",
- .data = &watchdog_user_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_watchdog,
.extra1 = &zero,
.extra2 = &one,
@@ -889,16 +890,12 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "nmi_watchdog",
- .data = &nmi_watchdog_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &nmi_watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = NMI_WATCHDOG_SYSCTL_PERM,
.proc_handler = proc_nmi_watchdog,
.extra1 = &zero,
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
.extra2 = &one,
-#else
- .extra2 = &zero,
-#endif
},
{
.procname = "watchdog_cpumask",
@@ -910,9 +907,9 @@ static struct ctl_table kern_table[] = {
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
{
.procname = "soft_watchdog",
- .data = &soft_watchdog_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &soft_watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_soft_watchdog,
.extra1 = &zero,
.extra2 = &one,
@@ -2187,8 +2184,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
if (write) {
if (*lvalp > UINT_MAX)
return -EINVAL;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6abfafd7f173..8319e09e15b9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
-static unsigned long save_global_trampoline;
-static unsigned long save_global_flags;
-
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * Function graph does not allocate the trampoline, but
- * other global_ops do. We need to reset the ALLOC_TRAMP flag
- * if one was used.
- */
- global_ops.trampoline = save_global_trampoline;
- if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
- global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
-#endif
-
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..c738e764e2a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
return !trace_seq_has_overflowed(s);
}
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
- int bit = state ? __ffs(state) + 1 : 0;
-
- return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
/**
* ftrace_find_event - find a registered event
* @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..0c331978b1a6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
+ entry->prev_state = __get_task_state(prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
- entry->next_state = next->state;
+ entry->next_state = __get_task_state(next);
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
+ entry->prev_state = __get_task_state(curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
+ entry->next_state = __get_task_state(wakee);
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f5d52024f6b7..6bcb854909c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,20 +29,29 @@
#include <linux/kvm_para.h>
#include <linux/kthread.h>
-/* Watchdog configuration */
-static DEFINE_MUTEX(watchdog_proc_mutex);
-
-int __read_mostly nmi_watchdog_enabled;
+static DEFINE_MUTEX(watchdog_mutex);
#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
- NMI_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT 1
#else
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT 0
#endif
+unsigned long __read_mostly watchdog_enabled;
+int __read_mostly watchdog_user_enabled = 1;
+int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
+int __read_mostly soft_watchdog_user_enabled = 1;
+int __read_mostly watchdog_thresh = 10;
+int __read_mostly nmi_watchdog_available;
+
+struct cpumask watchdog_allowed_mask __read_mostly;
+
+struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-/* boot commands */
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
@@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic =
* kernel command line parameters are parsed, because otherwise it is not
* possible to override this in hardlockup_panic_setup().
*/
-void hardlockup_detector_disable(void)
+void __init hardlockup_detector_disable(void)
{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 0;
}
static int __init hardlockup_panic_setup(char *str)
@@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 0;
else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 1;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif
-
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-int __read_mostly soft_watchdog_enabled;
-#endif
-
-int __read_mostly watchdog_user_enabled;
-int __read_mostly watchdog_thresh = 10;
-
-#ifdef CONFIG_SMP
-int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+# ifdef CONFIG_SMP
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#endif
-struct cpumask watchdog_cpumask __read_mostly;
-unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-/*
- * The 'watchdog_running' variable is set to 1 when the watchdog threads
- * are registered/started and is set to 0 when the watchdog threads are
- * unregistered/stopped, so it is an indicator whether the threads exist.
- */
-static int __read_mostly watchdog_running;
-/*
- * If a subsystem has a need to deactivate the watchdog temporarily, it
- * can use the suspend/resume interface to achieve this. The content of
- * the 'watchdog_suspended' variable reflects this state. Existing threads
- * are parked/unparked by the lockup_detector_{suspend|resume} functions
- * (see comment blocks pertaining to those functions for further details).
- *
- * 'watchdog_suspended' also prevents threads from being registered/started
- * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
- * of 'watchdog_running' cannot change while the watchdog is deactivated
- * temporarily (see related code in 'proc' handlers).
- */
-int __read_mostly watchdog_suspended;
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/*
* These functions can be overridden if an architecture implements its
@@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended;
*/
int __weak watchdog_nmi_enable(unsigned int cpu)
{
+ hardlockup_detector_perf_enable();
return 0;
}
+
void __weak watchdog_nmi_disable(unsigned int cpu)
{
+ hardlockup_detector_perf_disable();
}
-/*
- * watchdog_nmi_reconfigure can be implemented to be notified after any
- * watchdog configuration change. The arch hardlockup watchdog should
- * respond to the following variables:
- * - nmi_watchdog_enabled
+/* Return 0, if a NMI watchdog is available. Error code otherwise */
+int __weak __init watchdog_nmi_probe(void)
+{
+ return hardlockup_detector_perf_init();
+}
+
+/**
+ * watchdog_nmi_stop - Stop the watchdog for reconfiguration
+ *
+ * The reconfiguration steps are:
+ * watchdog_nmi_stop();
+ * update_variables();
+ * watchdog_nmi_start();
+ */
+void __weak watchdog_nmi_stop(void) { }
+
+/**
+ * watchdog_nmi_start - Start the watchdog after reconfiguration
+ *
+ * Counterpart to watchdog_nmi_stop().
+ *
+ * The following variables have been updated in update_variables() and
+ * contain the currently valid configuration:
+ * - watchdog_enabled
* - watchdog_thresh
* - watchdog_cpumask
- * - sysctl_hardlockup_all_cpu_backtrace
- * - hardlockup_panic
- * - watchdog_suspended
*/
-void __weak watchdog_nmi_reconfigure(void)
+void __weak watchdog_nmi_start(void) { }
+
+/**
+ * lockup_detector_update_enable - Update the sysctl enable bit
+ *
+ * Caller needs to make sure that the NMI/perf watchdogs are off, so this
+ * can't race with watchdog_nmi_disable().
+ */
+static void lockup_detector_update_enable(void)
{
+ watchdog_enabled = 0;
+ if (!watchdog_user_enabled)
+ return;
+ if (nmi_watchdog_available && nmi_watchdog_user_enabled)
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ if (soft_watchdog_user_enabled)
+ watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
}
-
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
- for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+/* Global variables, exported for sysctl */
+unsigned int __read_mostly softlockup_panic =
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+static bool softlockup_threads_initialized __read_mostly;
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
-unsigned int __read_mostly softlockup_panic =
- CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-
static int __init softlockup_panic_setup(char *str)
{
softlockup_panic = simple_strtoul(str, NULL, 0);
-
return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- watchdog_enabled = 0;
+ watchdog_user_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
static int __init nosoftlockup_setup(char *str)
{
- watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
+ soft_watchdog_user_enabled = 0;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
- sysctl_softlockup_all_cpu_backtrace =
- !!simple_strtol(str, NULL, 0);
+ sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int __init hardlockup_all_cpu_backtrace_setup(char *str)
-{
- sysctl_hardlockup_all_cpu_backtrace =
- !!simple_strtol(str, NULL, 0);
- return 1;
-}
-__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
-#endif
#endif
+static void __lockup_detector_cleanup(void);
+
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
* lockups can have false positives under extreme conditions. So we generally
@@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void)
int cpu;
/*
- * this is done lockless
- * do we care if a 0 races with a timestamp?
- * all it means is the softlock check starts one cycle later
+ * watchdog_mutex cannpt be taken here, as this might be called
+ * from (soft)interrupt context, so the access to
+ * watchdog_allowed_cpumask might race with a concurrent update.
+ *
+ * The watchdog time stamp can race against a concurrent real
+ * update as well, the only side effect might be a cycle delay for
+ * the softlockup check.
*/
- for_each_watchdog_cpu(cpu)
+ for_each_cpu(cpu, &watchdog_allowed_mask)
per_cpu(watchdog_touch_ts, cpu) = 0;
wq_watchdog_touch(-1);
}
@@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void)
__this_cpu_inc(hrtimer_interrupts);
}
-static int watchdog_enable_all_cpus(void);
-static void watchdog_disable_all_cpus(void);
-
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
- if (atomic_read(&watchdog_park_in_progress) != 0)
+ if (!watchdog_enabled)
return HRTIMER_NORESTART;
/* kick the hardlockup detector */
@@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
static void watchdog_enable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
- /* kick off the timer for the hardlockup detector */
+ /*
+ * Start the timer first to prevent the NMI watchdog triggering
+ * before the timer has a chance to fire.
+ */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
-
- /* Enable the perf event */
- watchdog_nmi_enable(cpu);
-
- /* done here because hrtimer_start can only pin to smp_processor_id() */
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
- /* initialize timestamp */
- watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+ /* Initialize timestamp */
__touch_watchdog();
+ /* Enable the perf event */
+ if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
+ watchdog_nmi_enable(cpu);
+
+ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
}
static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
watchdog_set_prio(SCHED_NORMAL, 0);
- hrtimer_cancel(hrtimer);
- /* disable the perf event */
+ /*
+ * Disable the perf event first. That prevents that a large delay
+ * between disabling the timer and disabling the perf event causes
+ * the perf NMI to detect a false positive.
+ */
watchdog_nmi_disable(cpu);
+ hrtimer_cancel(hrtimer);
}
static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu)
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
-
- /*
- * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
- * failure path. Check for failures that can occur asynchronously -
- * for example, when CPUs are on-lined - and shut down the hardware
- * perf event on each CPU accordingly.
- *
- * The only non-obvious place this bit can be cleared is through
- * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
- * pr_info here would be too noisy as it would result in a message
- * every few seconds if the hardlockup was disabled but the softlockup
- * enabled.
- */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- watchdog_nmi_disable(cpu);
}
static struct smp_hotplug_thread watchdog_threads = {
@@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = {
.unpark = watchdog_enable,
};
-/*
- * park all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function returns an error if kthread_park() of a watchdog thread
- * fails. In this situation, the watchdog threads of some CPUs can already
- * be parked and the watchdog threads of other CPUs can still be runnable.
- * Callers are expected to handle this special condition as appropriate in
- * their context.
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static int watchdog_park_threads(void)
+static void softlockup_update_smpboot_threads(void)
{
- int cpu, ret = 0;
+ lockdep_assert_held(&watchdog_mutex);
- atomic_set(&watchdog_park_in_progress, 1);
+ if (!softlockup_threads_initialized)
+ return;
- for_each_watchdog_cpu(cpu) {
- ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
- if (ret)
- break;
- }
-
- atomic_set(&watchdog_park_in_progress, 0);
-
- return ret;
+ smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+ &watchdog_allowed_mask);
}
-/*
- * unpark all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static void watchdog_unpark_threads(void)
+/* Temporarily park all watchdog threads */
+static void softlockup_park_all_threads(void)
{
- int cpu;
-
- for_each_watchdog_cpu(cpu)
- kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ cpumask_clear(&watchdog_allowed_mask);
+ softlockup_update_smpboot_threads();
}
-static int update_watchdog_all_cpus(void)
+/* Unpark enabled threads */
+static void softlockup_unpark_threads(void)
{
- int ret;
-
- ret = watchdog_park_threads();
- if (ret)
- return ret;
-
- watchdog_unpark_threads();
-
- return 0;
+ cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+ softlockup_update_smpboot_threads();
}
-static int watchdog_enable_all_cpus(void)
+static void lockup_detector_reconfigure(void)
{
- int err = 0;
-
- if (!watchdog_running) {
- err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
- &watchdog_cpumask);
- if (err)
- pr_err("Failed to create watchdog threads, disabled\n");
- else
- watchdog_running = 1;
- } else {
- /*
- * Enable/disable the lockup detectors or
- * change the sample period 'on the fly'.
- */
- err = update_watchdog_all_cpus();
-
- if (err) {
- watchdog_disable_all_cpus();
- pr_err("Failed to update lockup detectors, disabled\n");
- }
- }
-
- if (err)
- watchdog_enabled = 0;
-
- return err;
+ cpus_read_lock();
+ watchdog_nmi_stop();
+ softlockup_park_all_threads();
+ set_sample_period();
+ lockup_detector_update_enable();
+ if (watchdog_enabled && watchdog_thresh)
+ softlockup_unpark_threads();
+ watchdog_nmi_start();
+ cpus_read_unlock();
+ /*
+ * Must be called outside the cpus locked section to prevent
+ * recursive locking in the perf code.
+ */
+ __lockup_detector_cleanup();
}
-static void watchdog_disable_all_cpus(void)
+/*
+ * Create the watchdog thread infrastructure and configure the detector(s).
+ *
+ * The threads are not unparked as watchdog_allowed_mask is empty. When
+ * the threads are sucessfully initialized, take the proper locks and
+ * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ */
+static __init void lockup_detector_setup(void)
{
- if (watchdog_running) {
- watchdog_running = 0;
- smpboot_unregister_percpu_thread(&watchdog_threads);
- }
-}
+ int ret;
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
-{
- return smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask);
-}
-#endif
+ /*
+ * If sysctl is off and watchdog got disabled on the command line,
+ * nothing to do here.
+ */
+ lockup_detector_update_enable();
-#else /* SOFTLOCKUP */
-static int watchdog_park_threads(void)
-{
- return 0;
-}
+ if (!IS_ENABLED(CONFIG_SYSCTL) &&
+ !(watchdog_enabled && watchdog_thresh))
+ return;
-static void watchdog_unpark_threads(void)
-{
-}
+ ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_allowed_mask);
+ if (ret) {
+ pr_err("Failed to initialize soft lockup detector threads\n");
+ return;
+ }
-static int watchdog_enable_all_cpus(void)
-{
- return 0;
+ mutex_lock(&watchdog_mutex);
+ softlockup_threads_initialized = true;
+ lockup_detector_reconfigure();
+ mutex_unlock(&watchdog_mutex);
}
-static void watchdog_disable_all_cpus(void)
+#else /* CONFIG_SOFTLOCKUP_DETECTOR */
+static inline int watchdog_park_threads(void) { return 0; }
+static inline void watchdog_unpark_threads(void) { }
+static inline int watchdog_enable_all_cpus(void) { return 0; }
+static inline void watchdog_disable_all_cpus(void) { }
+static void lockup_detector_reconfigure(void)
{
+ cpus_read_lock();
+ watchdog_nmi_stop();
+ lockup_detector_update_enable();
+ watchdog_nmi_start();
+ cpus_read_unlock();
}
-
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
+static inline void lockup_detector_setup(void)
{
- return 0;
+ lockup_detector_reconfigure();
}
-#endif
+#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
-static void set_sample_period(void)
+static void __lockup_detector_cleanup(void)
{
+ lockdep_assert_held(&watchdog_mutex);
+ hardlockup_detector_perf_cleanup();
}
-#endif /* SOFTLOCKUP */
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
+/**
+ * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
+ *
+ * Caller must not hold the cpu hotplug rwsem.
*/
-int lockup_detector_suspend(void)
+void lockup_detector_cleanup(void)
{
- int ret = 0;
-
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
- /*
- * Multiple suspend requests can be active in parallel (counted by
- * the 'watchdog_suspended' variable). If the watchdog threads are
- * running, the first caller takes care that they will be parked.
- * The state of 'watchdog_running' cannot change while a suspend
- * request is active (see related code in 'proc' handlers).
- */
- if (watchdog_running && !watchdog_suspended)
- ret = watchdog_park_threads();
-
- if (ret == 0)
- watchdog_suspended++;
- else {
- watchdog_disable_all_cpus();
- pr_err("Failed to suspend lockup detectors, disabled\n");
- watchdog_enabled = 0;
- }
-
- watchdog_nmi_reconfigure();
-
- mutex_unlock(&watchdog_proc_mutex);
-
- return ret;
+ mutex_lock(&watchdog_mutex);
+ __lockup_detector_cleanup();
+ mutex_unlock(&watchdog_mutex);
}
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
+/**
+ * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
+ *
+ * Special interface for parisc. It prevents lockup detector warnings from
+ * the default pm_poweroff() function which busy loops forever.
*/
-void lockup_detector_resume(void)
+void lockup_detector_soft_poweroff(void)
{
- mutex_lock(&watchdog_proc_mutex);
-
- watchdog_suspended--;
- /*
- * The watchdog threads are unparked if they were previously running
- * and if there is no more active suspend request.
- */
- if (watchdog_running && !watchdog_suspended)
- watchdog_unpark_threads();
-
- watchdog_nmi_reconfigure();
-
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ watchdog_enabled = 0;
}
#ifdef CONFIG_SYSCTL
-/*
- * Update the run state of the lockup detectors.
- */
-static int proc_watchdog_update(void)
+/* Propagate any changes to the watchdog threads */
+static void proc_watchdog_update(void)
{
- int err = 0;
-
- /*
- * Watchdog threads won't be started if they are already active.
- * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
- * care of this. If those threads are already active, the sample
- * period will be updated and the lockup detectors will be enabled
- * or disabled 'on the fly'.
- */
- if (watchdog_enabled && watchdog_thresh)
- err = watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
-
- watchdog_nmi_reconfigure();
-
- return err;
-
+ /* Remove impossible cpus to keep sysctl output clean. */
+ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
+ lockup_detector_reconfigure();
}
/*
* common function for watchdog, nmi_watchdog and soft_watchdog parameter
*
- * caller | table->data points to | 'which' contains the flag(s)
- * -------------------|-----------------------|-----------------------------
- * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
- * | | with SOFT_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ * caller | table->data points to | 'which'
+ * -------------------|----------------------------|--------------------------
+ * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED |
+ * | | SOFT_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old, new;
- int *watchdog_param = (int *)table->data;
+ int err, old, *param = table->data;
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
+ mutex_lock(&watchdog_mutex);
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
-
- /*
- * If the parameter is being read return the state of the corresponding
- * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
- * run state of the lockup detectors.
- */
if (!write) {
- *watchdog_param = (watchdog_enabled & which) != 0;
+ /*
+ * On read synchronize the userspace interface. This is a
+ * racy snapshot.
+ */
+ *param = (watchdog_enabled & which) != 0;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
} else {
+ old = READ_ONCE(*param);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (err)
- goto out;
-
- /*
- * There is a race window between fetching the current value
- * from 'watchdog_enabled' and storing the new value. During
- * this race window, watchdog_nmi_enable() can sneak in and
- * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
- * The 'cmpxchg' detects this race and the loop retries.
- */
- do {
- old = watchdog_enabled;
- /*
- * If the parameter value is not zero set the
- * corresponding bit(s), else clear it(them).
- */
- if (*watchdog_param)
- new = old | which;
- else
- new = old & ~which;
- } while (cmpxchg(&watchdog_enabled, old, new) != old);
-
- /*
- * Update the run state of the lockup detectors. There is _no_
- * need to check the value returned by proc_watchdog_update()
- * and to restore the previous value of 'watchdog_enabled' as
- * both lockup detectors are disabled if proc_watchdog_update()
- * returns an error.
- */
- if (old == new)
- goto out;
-
- err = proc_watchdog_update();
+ if (!err && old != READ_ONCE(*param))
+ proc_watchdog_update();
}
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
@@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write,
int proc_nmi_watchdog(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ if (!nmi_watchdog_available && write)
+ return -ENOTSUPP;
return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
int proc_watchdog_thresh(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old, new;
-
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
+ int err, old;
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
+ mutex_lock(&watchdog_mutex);
- old = ACCESS_ONCE(watchdog_thresh);
+ old = READ_ONCE(watchdog_thresh);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (err || !write)
- goto out;
-
- /*
- * Update the sample period. Restore on failure.
- */
- new = ACCESS_ONCE(watchdog_thresh);
- if (old == new)
- goto out;
+ if (!err && write && old != READ_ONCE(watchdog_thresh))
+ proc_watchdog_update();
- set_sample_period();
- err = proc_watchdog_update();
- if (err) {
- watchdog_thresh = old;
- set_sample_period();
- }
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
@@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
{
int err;
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
-
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
+ mutex_lock(&watchdog_mutex);
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
- if (!err && write) {
- /* Remove impossible cpus to keep sysctl output cleaner. */
- cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
- cpu_possible_mask);
-
- if (watchdog_running) {
- /*
- * Failure would be due to being unable to allocate
- * a temporary cpumask, so we are likely not in a
- * position to do much else to make things better.
- */
- if (watchdog_update_cpus() != 0)
- pr_err("cpumask update failed\n");
- }
+ if (!err && write)
+ proc_watchdog_update();
- watchdog_nmi_reconfigure();
- }
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
-
#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
{
- set_sample_period();
-
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
pr_info("Disabling watchdog on nohz_full cores by default\n");
@@ -951,6 +783,7 @@ void __init lockup_detector_init(void)
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#endif
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
+ if (!watchdog_nmi_probe())
+ nmi_watchdog_available = true;
+ lockup_detector_setup();
}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 3a09ea1b1d3d..71a62ceacdc8 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,8 +21,10 @@
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static struct cpumask dead_events_mask;
static unsigned long hardlockup_allcpu_dumped;
+static unsigned int watchdog_cpus;
void arch_touch_nmi_watchdog(void)
{
@@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = {
/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
- if (atomic_read(&watchdog_park_in_progress) != 0)
- return;
-
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
@@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long firstcpu_err;
-static atomic_t watchdog_cpus;
-
-int watchdog_nmi_enable(unsigned int cpu)
+static int hardlockup_detector_event_create(void)
{
+ unsigned int cpu = smp_processor_id();
struct perf_event_attr *wd_attr;
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
- int firstcpu = 0;
-
- /* nothing to do if the hard lockup detector is disabled */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto out;
-
- /* is it already setup and enabled? */
- if (event && event->state > PERF_EVENT_STATE_OFF)
- goto out;
-
- /* it is setup but not enabled */
- if (event != NULL)
- goto out_enable;
-
- if (atomic_inc_return(&watchdog_cpus) == 1)
- firstcpu = 1;
+ struct perf_event *evt;
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
- event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+ watchdog_overflow_callback, NULL);
+ if (IS_ERR(evt)) {
+ pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
+ PTR_ERR(evt));
+ return PTR_ERR(evt);
+ }
+ this_cpu_write(watchdog_ev, evt);
+ return 0;
+}
- /* save the first cpu's error for future comparision */
- if (firstcpu && IS_ERR(event))
- firstcpu_err = PTR_ERR(event);
+/**
+ * hardlockup_detector_perf_enable - Enable the local event
+ */
+void hardlockup_detector_perf_enable(void)
+{
+ if (hardlockup_detector_event_create())
+ return;
- if (!IS_ERR(event)) {
- /* only print for the first cpu initialized */
- if (firstcpu || firstcpu_err)
- pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
- goto out_save;
- }
+ if (!watchdog_cpus++)
+ pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
- /*
- * Disable the hard lockup detector if _any_ CPU fails to set up
- * set up the hardware perf event. The watchdog() function checks
- * the NMI_WATCHDOG_ENABLED bit periodically.
- *
- * The barriers are for syncing up watchdog_enabled across all the
- * cpus, as clear_bit() does not use barriers.
- */
- smp_mb__before_atomic();
- clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
- smp_mb__after_atomic();
-
- /* skip displaying the same error again */
- if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
- return PTR_ERR(event);
-
- /* vary the KERN level based on the returned errno */
- if (PTR_ERR(event) == -EOPNOTSUPP)
- pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
- else if (PTR_ERR(event) == -ENOENT)
- pr_warn("disabled (cpu%i): hardware events not enabled\n",
- cpu);
- else
- pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
- cpu, PTR_ERR(event));
-
- pr_info("Shutting down hard lockup detector on all cpus\n");
-
- return PTR_ERR(event);
-
- /* success path */
-out_save:
- per_cpu(watchdog_ev, cpu) = event;
-out_enable:
- perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
- return 0;
+ perf_event_enable(this_cpu_read(watchdog_ev));
}
-void watchdog_nmi_disable(unsigned int cpu)
+/**
+ * hardlockup_detector_perf_disable - Disable the local event
+ */
+void hardlockup_detector_perf_disable(void)
{
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
+ struct perf_event *event = this_cpu_read(watchdog_ev);
if (event) {
perf_event_disable(event);
+ cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
+ watchdog_cpus--;
+ }
+}
+
+/**
+ * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
+ *
+ * Called from lockup_detector_cleanup(). Serialized by the caller.
+ */
+void hardlockup_detector_perf_cleanup(void)
+{
+ int cpu;
+
+ for_each_cpu(cpu, &dead_events_mask) {
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ /*
+ * Required because for_each_cpu() reports unconditionally
+ * CPU0 as set on UP kernels. Sigh.
+ */
+ if (event)
+ perf_event_release_kernel(event);
per_cpu(watchdog_ev, cpu) = NULL;
+ }
+ cpumask_clear(&dead_events_mask);
+}
+
+/**
+ * hardlockup_detector_perf_stop - Globally stop watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_stop(void)
+{
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ for_each_online_cpu(cpu) {
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event)
+ perf_event_disable(event);
+ }
+}
- /* should be in cleanup, but blocks oprofile */
- perf_event_release_kernel(event);
+/**
+ * hardlockup_detector_perf_restart - Globally restart watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_restart(void)
+{
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ return;
+
+ for_each_online_cpu(cpu) {
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event)
+ perf_event_enable(event);
+ }
+}
+
+/**
+ * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ */
+int __init hardlockup_detector_perf_init(void)
+{
+ int ret = hardlockup_detector_event_create();
- /* watchdog_nmi_enable() expects this to be zero initially. */
- if (atomic_dec_and_test(&watchdog_cpus))
- firstcpu_err = 0;
+ if (ret) {
+ pr_info("Perf NMI watchdog permanently disabled\n");
+ } else {
+ perf_event_release_kernel(this_cpu_read(watchdog_ev));
+ this_cpu_write(watchdog_ev, NULL);
}
+ return ret;
}