From d136122f58458479fd8926020ba2937de61d7f65 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2020 17:20:21 +0200 Subject: sched: Fix race against ptrace_freeze_trace() There is apparently one site that violates the rule that only current and ttwu() will modify task->state, namely ptrace_{,un}freeze_traced() will change task->state for a remote task. Oleg explains: "TASK_TRACED/TASK_STOPPED was always protected by siglock. In particular, ttwu(__TASK_TRACED) must be always called with siglock held. That is why ptrace_freeze_traced() assumes it can safely do s/TASK_TRACED/__TASK_TRACED/ under spin_lock(siglock)." This breaks the ordering scheme introduced by commit: dbfb089d360b ("sched: Fix loadavg accounting race") Specifically, the reload not matching no longer implies we don't have to block. Simply things by noting that what we need is a LOAD->STORE ordering and this can be provided by a control dependency. So replace: prev_state = prev->state; raw_spin_lock(&rq->lock); smp_mb__after_spinlock(); /* SMP-MB */ if (... && prev_state && prev_state == prev->state) deactivate_task(); with: prev_state = prev->state; if (... && prev_state) /* CTRL-DEP */ deactivate_task(); Since that already implies the 'prev->state' load must be complete before allowing the 'prev->on_rq = 0' store to become visible. Fixes: dbfb089d360b ("sched: Fix loadavg accounting race") Reported-by: Jiri Slaby Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Tested-by: Paul Gortmaker Tested-by: Christian Brauner --- kernel/sched/core.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e15543cb8481..5dece9b34e25 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4119,9 +4119,6 @@ static void __sched notrace __schedule(bool preempt) local_irq_disable(); rcu_note_context_switch(preempt); - /* See deactivate_task() below. */ - prev_state = prev->state; - /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) @@ -4145,11 +4142,16 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; + /* - * We must re-load prev->state in case ttwu_remote() changed it - * before we acquired rq->lock. + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ - if (!preempt && prev_state && prev_state == prev->state) { + prev_state = prev->state; + if (!preempt && prev_state) { if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { @@ -4163,10 +4165,12 @@ static void __sched notrace __schedule(bool preempt) /* * __schedule() ttwu() - * prev_state = prev->state; if (READ_ONCE(p->on_rq) && ...) - * LOCK rq->lock goto out; - * smp_mb__after_spinlock(); smp_acquire__after_ctrl_dep(); - * p->on_rq = 0; p->state = TASK_WAKING; + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. * * After this, schedule() must not care about p->state any more. */ -- cgit v1.2.3 From 062d3f95b630113e1156a31f376ad36e25da29a7 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 23 Jul 2020 21:10:42 +0100 Subject: sched: Warn if garbage is passed to default_wake_function() Since the default_wake_function() passes its flags onto try_to_wake_up(), warn if those flags collide with internal values. Given that the supplied flags are garbage, no repair can be done but at least alert the user to the damage they are causing. In the belief that these errors should be picked up during testing, the warning is only compiled in under CONFIG_SCHED_DEBUG. Signed-off-by: Chris Wilson Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200723201042.18861-1-chris@chris-wilson.co.uk --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5dece9b34e25..2142c6767682 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4485,6 +4485,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); -- cgit v1.2.3