From c5105d764e0214bcc4c6d40d7ba231d01b2e9dda Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 27 Nov 2019 16:37:28 +0800 Subject: sched/clock: Use static_branch_likely() with sched_clock_running sched_clock_running is enabled early at bootup stage and never disabled. So hint that to the compiler by using static_branch_likely() rather than static_branch_unlikely(). The branch probability mis-annotation was introduced in the original commit that converted the plain sched_clock_running flag to a static key: 46457ea464f5 ("sched/clock: Use static key for sched_clock_running") Steve further notes: | Looks like the confusion was the moving of the "!": | | - if (unlikely(!sched_clock_running)) | + if (!static_branch_unlikely(&sched_clock_running)) | | Where, it was unlikely that !sched_clock_running would be true, but | because the "!" was moved outside the "unlikely()" it makes the test | "likely()". That is, if we added an intermediate step, it would have | been: | | if (!likely(sched_clock_running)) | | which would have prevented the mistake that this patch fixes. [ mingo: Edited the changelog. 
] Signed-off-by: Zhenzhong Duan Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: mgorman@suse.de Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/1574843848-26825-1-git-send-email-zhenzhong.duan@oracle.com Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 1152259a4ca0..12bca64dff73 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return sched_clock(); preempt_disable_notrace(); @@ -393,7 +393,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -460,7 +460,7 @@ void __init sched_clock_init(void) u64 sched_clock_cpu(int cpu) { - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return 0; return sched_clock(); -- cgit v1.2.3 From 1b40cd56f3bcffcfedb43bc30bd431b52240fb3b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:18 +0200 Subject: sched/rt, locking: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the Kconfig dependency to use CONFIG_PREEMPTION. 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20191015191821.11479-32-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index e0852dc333ac..3de8fd11873b 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -101,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK # unlock and unlock_irq functions are inlined when: # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y # or -# - DEBUG_SPINLOCK=n and PREEMPT=n +# - DEBUG_SPINLOCK=n and PREEMPTION=n # # unlock_bh and unlock_irqrestore functions are inlined when: # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y @@ -139,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -168,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE config INLINE_READ_UNLOCK def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK + depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK config INLINE_READ_UNLOCK_BH def_bool y @@ -176,7 +176,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -205,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE config INLINE_WRITE_UNLOCK def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK + depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK config INLINE_WRITE_UNLOCK_BH def_bool y @@ -213,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ + depends on !PREEMPTION || 
ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y -- cgit v1.2.3 From 025f50f3866486a5278afa91f0d3b6b780141050 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:21 +0200 Subject: sched/rt, workqueue: Use PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Update the comment to use PREEMPTION because it is true for both preemption models. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Link: https://lore.kernel.org/r/20191015191821.11479-35-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc88fd939f4e..bf57dc717b38 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2280,7 +2280,7 @@ __acquires(&pool->lock) } /* - * The following prevents a kworker from hogging CPU on !PREEMPT + * The following prevents a kworker from hogging CPU on !PREEMPTION * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in -- cgit v1.2.3 From 7c2e8bbd87db661122e92d71a394dd7bb3ada4d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 3 Dec 2019 17:01:05 +0100 Subject: sched: Spare resched IPI when prio changes on a single fair task The runqueue of a fair task being remotely reniced is going to get a resched IPI in order to reassess which task should be the current running on the CPU. However that evaluation is useless if the fair task is running alone, in which case we can spare that IPI, preventing nohz_full CPUs from being disturbed. 
Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20191203160106.18806-2-frederic@kernel.org --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08a233e97a01..846f50bd0c0b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10322,6 +10322,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; + if (rq->cfs.nr_running == 1) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on -- cgit v1.2.3 From 5443a0be6121d557e12951537e10159e4c61035d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 3 Dec 2019 17:01:06 +0100 Subject: sched: Use fair:prio_changed() instead of ad-hoc implementation set_user_nice() implements its own version of fair::prio_changed() and therefore misses a specific optimization towards nohz_full CPUs that avoids sending a resched IPI to a reniced task running alone. Use the proper callback instead. 
Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20191203160106.18806-3-frederic@kernel.org --- kernel/sched/core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 90e4b00ace89..15508c202bf5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4540,17 +4540,17 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (queued) { + if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_curr(rq); - } if (running) set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); + out_unlock: task_rq_unlock(rq, p, &rf); } -- cgit v1.2.3 From cde65194502778665c1b52afc5722cf7dbfaa399 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 10 Dec 2019 20:19:03 +0100 Subject: sched/wait: fix ___wait_var_event(exclusive) init_wait_var_entry() forgets to initialize wq_entry->flags. Currently not a problem, we don't have wait_var_event_exclusive(). 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Vincent Guittot Cc: Ingo Molnar Cc: Felipe Balbi Cc: Linus Torvalds Cc: Miklos Szeredi Cc: Juri Lelli Link: https://lkml.kernel.org/r/20191210191902.GB14449@redhat.com --- kernel/sched/wait_bit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 45eba18a2898..02ce292b9bc0 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int .bit_nr = -1, }, .wq_entry = { + .flags = flags, .private = current, .func = var_wake_function, .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), -- cgit v1.2.3 From 45178ac0cea853fe0e405bf11e101bdebea57b15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Dec 2019 09:34:54 +0100 Subject: cpu/hotplug, stop_machine: Fix stop_machine vs hotplug order Paul reported a very sporadic, rcutorture induced, workqueue failure. When the planets align, the workqueue rescuer's self-migrate fails and then triggers a WARN for running a work on the wrong CPU. Tejun then figured that set_cpus_allowed_ptr()'s stop_one_cpu() call could be ignored! When stopper->enabled is false, stop_machine will insta complete the work, without actually doing the work. Worse, it will not WARN about this (we really should fix this). It turns out there is a small window where a freshly online'ed CPU is marked 'online' but doesn't yet have the stopper task running: BP AP bringup_cpu() __cpu_up(cpu, idle) --> start_secondary() ... cpu_startup_entry() bringup_wait_for_ap() wait_for_ap_thread() <-- cpuhp_online_idle() while (1) do_idle() ... available to run kthreads ... stop_machine_unpark() stopper->enable = true; Close this by moving the stop_machine_unpark() into cpuhp_online_idle(), such that the stopper thread is ready before we start the idle loop and schedule. Reported-by: "Paul E. 
McKenney" Debugged-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Tested-by: "Paul E. McKenney" --- kernel/cpu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a59cc980adad..e7f79674824d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -525,8 +525,7 @@ static int bringup_wait_for_ap(unsigned int cpu) if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; - /* Unpark the stopper thread and the hotplug thread of the target cpu */ - stop_machine_unpark(cpu); + /* Unpark the hotplug thread of the target cpu */ kthread_unpark(st->thread); /* @@ -1089,8 +1088,8 @@ void notify_cpu_starting(unsigned int cpu) /* * Called from the idle task. Wake up the controlling task which brings the - * stopper and the hotplug thread of the upcoming CPU up and then delegates - * the rest of the online bringup to the hotplug thread. + * hotplug thread of the upcoming CPU up and then delegates the rest of the + * online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { @@ -1100,6 +1099,12 @@ void cpuhp_online_idle(enum cpuhp_state state) if (state != CPUHP_AP_ONLINE_IDLE) return; + /* + * Unpart the stopper thread before we start the idle loop (and start + * scheduling); this ensures the stopper task is always available. + */ + stop_machine_unpark(smp_processor_id()); + st->state = CPUHP_AP_ONLINE_IDLE; complete_ap_thread(st, true); } -- cgit v1.2.3 From 60588bfa223ff675b95f866249f90616613fbe31 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 13 Dec 2019 10:45:30 +0800 Subject: sched/fair: Optimize select_idle_cpu select_idle_cpu() will scan the LLC domain for idle CPUs, it's always expensive. so the next commit : 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()") introduces a way to limit how many CPUs we scan. But it consume some CPUs out of 'nr' that are not allowed for the task and thus waste our attempts. 
The function always returns nr_cpumask_bits, and we can't find a CPU on which our task is allowed to run. The cpumask may be too big for the stack, so, similar to select_idle_core(), use the per-CPU 'select_idle_mask' to prevent stack overflow. Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()") Signed-off-by: Cheng Jian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20191213024530.28052-1-cj.chengjian@huawei.com --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 846f50bd0c0b..280d54ccb4be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5828,6 +5828,7 @@ static inline int select_idle_smt(struct task_struct *p, int target) */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; u64 time, cost; @@ -5859,11 +5860,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return si_cpu; - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - continue; if (available_idle_cpu(cpu)) break; if (si_cpu == -1 && sched_idle_cpu(cpu)) -- cgit v1.2.3 From d040e0734fb3dedfe24c3d94f5a32b4812eca610 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Fri, 13 Dec 2019 11:45:40 +0800 Subject: sched/fair: Skip calculating @contrib without load Because of the: if (!load) runnable = running = 0; clause in ___update_load_sum(), all the actual users of @contrib in accumulate_sum(): if (load) sa->load_sum += load * contrib; if (runnable) sa->runnable_load_sum += runnable * contrib; 
if (running) sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; don't happen, and therefore we don't care what @contrib actually is and calculating it is pointless. If we count the times when @load equals zero and when it does not, as below: if (load) { load_is_not_zero_count++; contrib = __accumulate_pelt_segments(periods, 1024 - sa->period_contrib,delta); } else load_is_zero_count++; As we can see, load_is_zero_count is much bigger than load_is_not_zero_count, and the gap is gradually widening: load_is_zero_count: 6016044 times load_is_not_zero_count: 244316 times 19:50:43 up 1 min, 1 user, load average: 0.09, 0.06, 0.02 load_is_zero_count: 7956168 times load_is_not_zero_count: 261472 times 19:51:42 up 2 min, 1 user, load average: 0.03, 0.05, 0.01 load_is_zero_count: 10199896 times load_is_not_zero_count: 278364 times 19:52:51 up 3 min, 1 user, load average: 0.06, 0.05, 0.01 load_is_zero_count: 14333700 times load_is_not_zero_count: 318424 times 19:54:53 up 5 min, 1 user, load average: 0.01, 0.03, 0.00 Perhaps we can gain some performance advantage by skipping these unnecessary calculations. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org> Link: https://lkml.kernel.org/r/1576208740-35609-1-git-send-email-rocking@linux.alibaba.com --- kernel/sched/pelt.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a96db50d40e0..bd006b79b360 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * Step 2 */ delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); + if (load) { + /* + * This relies on the: + * + * if (!load) + * runnable = running = 0; + * + * clause from ___update_load_sum(); this results in + * the below usage of @contrib to dissapear entirely, + * so no point in calculating it. 
+ */ + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } } sa->period_contrib = delta; @@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, * this happens during idle_balance() which calls - * update_blocked_averages() + * update_blocked_averages(). + * + * Also see the comment in accumulate_sum(). */ if (!load) runnable = running = 0; -- cgit v1.2.3 From a5e37de90e67ac1072a9a44bd0cec9f5e98ded08 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 14 Dec 2019 19:51:07 +0000 Subject: stop_machine: remove try_stop_cpus helper try_stop_cpus is not used after this: commit c190c3b16c0f ("rcu: Switch synchronize_sched_expedited() to stop_one_cpu()") So remove it. Signed-off-by: Yangtao Li Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191214195107.26480-1-tiny.windzz@gmail.com --- include/linux/stop_machine.h | 7 ------- kernel/stop_machine.c | 30 ------------------------------ 2 files changed, 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index f9a0c6189852..648298f877da 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -33,7 +33,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); @@ -90,12 +89,6 @@ static inline int stop_cpus(const struct cpumask *cpumask, return -ENOENT; } -static inline int try_stop_cpus(const struct cpumask *cpumask, - cpu_stop_fn_t 
fn, void *arg) -{ - return stop_cpus(cpumask, fn, arg); -} - #endif /* CONFIG_SMP */ /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1fe34a9fabc2..5d68ec4c4015 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -453,36 +453,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) return ret; } -/** - * try_stop_cpus - try to stop multiple cpus - * @cpumask: cpus to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Identical to stop_cpus() except that it fails with -EAGAIN if - * someone else is already using the facility. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * -EAGAIN if someone else is already stopping cpus, -ENOENT if - * @fn(@arg) was not executed at all because all cpus in @cpumask were - * offline; otherwise, 0 if all executions of @fn returned 0, any non - * zero return value if any returned non zero. - */ -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) -{ - int ret; - - /* static works are used, process one request at a time */ - if (!mutex_trylock(&stop_cpus_mutex)) - return -EAGAIN; - ret = __stop_cpus(cpumask, fn, arg); - mutex_unlock(&stop_cpus_mutex); - return ret; -} - static int cpu_stop_should_run(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); -- cgit v1.2.3 From 53a23364b6b0c679a8ecfc48e74d652f18e3631f Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 19 Dec 2019 09:03:14 -0500 Subject: sched/core: Remove unused variable from set_user_nice() This commit left behind an unused variable: 5443a0be6121 ("sched: Use fair:prio_changed() instead of ad-hoc implementation") left behind an unused variable. 
kernel/sched/core.c: In function 'set_user_nice': kernel/sched/core.c:4507:16: warning: variable 'delta' set but not used int old_prio, delta; ^~~~~ Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5443a0be6121 ("sched: Use fair:prio_changed() instead of ad-hoc implementation") Link: https://lkml.kernel.org/r/20191219140314.1252-1-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15508c202bf5..1f6c094520e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4504,7 +4504,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio, delta; + int old_prio; struct rq_flags rf; struct rq *rq; @@ -4538,7 +4538,6 @@ void set_user_nice(struct task_struct *p, long nice) set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); - delta = p->prio - old_prio; if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); -- cgit v1.2.3 From 17346452b25b98acfb395d2a82ec2e4ad0cb7a01 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 14 Nov 2019 16:19:27 +0530 Subject: sched/fair: Make sched-idle CPU selection consistent throughout There are instances where we keep searching for an idle CPU despite already having a sched-idle CPU (in find_idlest_group_cpu(), select_idle_smt() and select_idle_cpu() and then there are places where we don't necessarily do that and return a sched-idle CPU as soon as we find one (in select_idle_sibling()). This looks a bit inconsistent and it may be worth having the same policy everywhere. 
On the other hand, choosing a sched-idle CPU over an idle one shall be beneficial from both a performance and a power point of view as well, as we don't need to get the CPU online from a deep idle state which wastes quite a lot of time and energy and delays the scheduling of the newly woken up task. This patch tries to simplify code around sched-idle CPU selection and make it consistent throughout. Testing is done with the help of rt-app on a hikey board (ARM64 octa-core, 2 clusters, 0-3 and 4-7). The cpufreq governor was set to performance to avoid any side effects from CPU frequency. Following are the tests performed: Test 1: 1-cfs-task: A single SCHED_NORMAL task is pinned to CPU5 which runs for 2333 us out of 7777 us (so gives time for the cluster to go in deep idle state). Test 2: 1-cfs-1-idle-task: A single SCHED_NORMAL task is pinned on CPU5 and a single SCHED_IDLE task is pinned on CPU6 (to make sure cluster 1 doesn't go in deep idle state). Test 3: 1-cfs-8-idle-task: A single SCHED_NORMAL task is pinned on CPU5 and eight SCHED_IDLE tasks are created which run forever (not pinned anywhere, so they run on all CPUs). Checked with kernelshark that as soon as the NORMAL task sleeps, the SCHED_IDLE task starts running on CPU5. And here are the results on mean latency (in us), using the "st" tool. $ st 1-cfs-task/rt-app-cfs_thread-0.log N min max sum mean stddev 642 90 592 197180 307.134 109.906 $ st 1-cfs-1-idle-task/rt-app-cfs_thread-0.log N min max sum mean stddev 642 67 311 113850 177.336 41.4251 $ st 1-cfs-8-idle-task/rt-app-cfs_thread-0.log N min max sum mean stddev 643 29 173 41364 64.3297 13.2344 The mean latency when we need to: - wakeup from deep idle state is 307 us. - wakeup from shallow idle state is 177 us. - preempt a SCHED_IDLE task is 64 us. 
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/b90cbcce608cef4e02a7bbfe178335f76d201bab.1573728344.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8da0222924cf..1f34fa9732d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5588,7 +5588,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; - int shallowest_idle_cpu = -1, si_cpu = -1; + int shallowest_idle_cpu = -1; int i; /* Check if we have any choice: */ @@ -5597,6 +5597,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (sched_idle_cpu(i)) + return i; + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5619,12 +5622,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else if (shallowest_idle_cpu == -1 && si_cpu == -1) { - if (sched_idle_cpu(i)) { - si_cpu = i; - continue; - } - + } else if (shallowest_idle_cpu == -1) { load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; @@ -5633,11 +5631,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } - if (shallowest_idle_cpu != -1) - return shallowest_idle_cpu; - if (si_cpu != -1) - return si_cpu; - return least_loaded_cpu; + return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; } static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, @@ -5790,7 +5784,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int */ static int select_idle_smt(struct task_struct *p, int target) { - int cpu, si_cpu = -1; + int cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5798,13 +5792,11 @@ static int select_idle_smt(struct task_struct *p, int target) for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } - return si_cpu; + return -1; } #else /* CONFIG_SCHED_SMT */ @@ -5834,7 +5826,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 time, cost; s64 delta; int this = smp_processor_id(); - int cpu, nr = INT_MAX, si_cpu = -1; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5864,11 +5856,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) - return si_cpu; - if (available_idle_cpu(cpu)) + return -1; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } time = cpu_clock(this) - time; -- cgit v1.2.3 From 59fe675248ffc37d4167e9ec6920a2f3d5ec67bb Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:47 +0000 Subject: sched/uclamp: Remove uclamp_util() The sole user of uclamp_util(), schedutil_cpu_util(), was made to use uclamp_util_with() instead in commit: af24bde8df20 ("sched/uclamp: Add uclamp support to energy_compute()") From then on, uclamp_util() has remained unused. Being a simple wrapper around uclamp_util_with(), we can get rid of it and win back a few lines. 
Tested-By: Dietmar Eggemann Suggested-by: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 280a3c735935..d9b24513d71d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2324,21 +2324,12 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util, return clamp(util, min_util, max_util); } - -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -{ - return uclamp_util_with(rq, util, NULL); -} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, struct task_struct *p) { return util; } -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -{ - return util; -} #endif /* CONFIG_UCLAMP_TASK */ #ifdef arch_scale_freq_capacity -- cgit v1.2.3 From 686516b55e98edf18c2a02d36aaaa6f4c0f6c39c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:48 +0000 Subject: sched/uclamp: Make uclamp util helpers use and return UL values Vincent pointed out recently that the canonical type for utilization values is 'unsigned long'. Internally uclamp uses 'unsigned int' values for cache optimization, but this doesn't have to be exported to its users. Make the uclamp helpers that deal with utilization use and return unsigned long values. 
Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/sched.h | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1f6c094520e0..e7b08d52db93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -919,17 +919,17 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; /* Task currently refcounted: use back-annotated (effective) value */ if (p->uclamp[clamp_id].active) - return p->uclamp[clamp_id].value; + return (unsigned long)p->uclamp[clamp_id].value; uc_eff = uclamp_eff_get(p, clamp_id); - return uc_eff.value; + return (unsigned long)uc_eff.value; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d9b24513d71d..b478474ea847 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2300,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline -unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) +unsigned long uclamp_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { - unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned int max_util = 
READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2325,8 +2325,8 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util, return clamp(util, min_util, max_util); } #else /* CONFIG_UCLAMP_TASK */ -static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) +static inline unsigned long uclamp_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { return util; } -- cgit v1.2.3 From d2b58a286e89824900d501db0be1d4f6aed474fc Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:49 +0000 Subject: sched/uclamp: Rename uclamp_util_with() into uclamp_rq_util_with() The current helper returns (CPU) rq utilization with uclamp restrictions taken into account. A uclamp task utilization helper would be quite helpful, but this requires some renaming. Prepare the code for the introduction of a uclamp_task_util() by renaming the existing uclamp_util_with() to uclamp_rq_util_with(). 
Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-4-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/sched.h | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9b8916fd00a2..7fbaee24c824 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -238,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, */ util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) - util = uclamp_util_with(rq, util, p); + util = uclamp_rq_util_with(rq, util, p); dl_util = cpu_util_dl(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b478474ea847..1a88dc8ad11b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2303,8 +2303,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline -unsigned long uclamp_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); @@ -2325,8 +2325,9 @@ unsigned long uclamp_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } #else /* CONFIG_UCLAMP_TASK */ -static inline unsigned long uclamp_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct 
task_struct *p) { return util; } -- cgit v1.2.3 From a7008c07a568278ed2763436404752a98004c7ff Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:50 +0000 Subject: sched/fair: Make task_fits_capacity() consider uclamp restrictions task_fits_capacity() drives CPU selection at wakeup time, and is also used to detect misfit tasks. Right now it does so by comparing task_util_est() with a CPU's capacity, but doesn't take into account uclamp restrictions. There's a few interesting uses that can come out of doing this. For instance, a low uclamp.max value could prevent certain tasks from being flagged as misfit tasks, so they could merrily remain on low-capacity CPUs. Similarly, a high uclamp.min value would steer tasks towards high capacity CPUs at wakeup (and, should that fail, later steered via misfit balancing), so such "boosted" tasks would favor CPUs of higher capacity. Introduce uclamp_task_util() and make task_fits_capacity() use it. Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-5-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f34fa9732d8..26c59bc5b2ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3711,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct 
task_struct *p) +{ + return task_util_est(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -3822,7 +3836,7 @@ done: static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return fits_capacity(task_util_est(p), capacity); + return fits_capacity(uclamp_task_util(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) -- cgit v1.2.3 From 1d42509e475cdc8542aa5b3e03a7e845244f4f57 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:51 +0000 Subject: sched/fair: Make EAS wakeup placement consider uclamp restrictions task_fits_capacity() has just been made uclamp-aware, and find_energy_efficient_cpu() needs to go through the same treatment. Things are somewhat different here however - using the task max clamp isn't sufficient. Consider the following setup: The target runqueue, rq: rq.cpu_capacity_orig = 512 rq.cfs.avg.util_avg = 200 rq.uclamp.max = 768 // the max p.uclamp.max of all enqueued p's is 768 The waking task, p (not yet enqueued on rq): p.util_est = 600 p.uclamp.max = 100 Now, consider the following code which doesn't use the rq clamps: util = uclamp_task_util(p); // Does the task fit in the spare CPU capacity? cpu = cpu_of(rq); fits_capacity(util, cpu_capacity(cpu) - cpu_util(cpu)) This would lead to: util = 100; fits_capacity(100, 512 - 200) fits_capacity() would return true. However, enqueuing p on that CPU *will* cause it to become overutilized since rq clamp values are max-aggregated, so we'd remain with rq.uclamp.max = 768 which comes from the other tasks already enqueued on rq. Thus, we could select a high enough frequency to reach beyond 0.8 * 512 utilization (== overutilized) after enqueuing p on rq. What find_energy_efficient_cpu() needs here is uclamp_rq_util_with() which lets us peek at the future utilization landscape, including rq-wide uclamp values. 
Make find_energy_efficient_cpu() use uclamp_rq_util_with() for its fits_capacity() check. This is in line with what compute_energy() ends up using for estimating utilization. Tested-By: Dietmar Eggemann Suggested-by: Quentin Perret Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-6-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26c59bc5b2ed..2d170b5da0e3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6273,9 +6273,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - /* Skip CPUs that will be overutilized. */ util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); + spare_cap = cpu_cap - util; + + /* + * Skip CPUs that cannot satisfy the capacity request. + * IOW, placing the task there would make the CPU + * overutilized. Take uclamp into account to see how + * much capacity we can get out of the CPU; this is + * aligned with schedutil_cpu_util(). 
+ */ + util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) continue; @@ -6290,7 +6299,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Find the CPU with the maximum spare capacity in * the performance domain */ - spare_cap = cpu_cap - util; if (spare_cap > max_spare_cap) { max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; -- cgit v1.2.3 From 804d402fb6f6487b825aae8cf42fda6426c62867 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Wed, 9 Oct 2019 11:46:11 +0100 Subject: sched/rt: Make RT capacity-aware Capacity Awareness refers to the fact that on heterogeneous systems (like Arm big.LITTLE), the capacity of the CPUs is not uniform, hence when placing tasks we need to be aware of this difference of CPU capacities. In such scenarios we want to ensure that the selected CPU has enough capacity to meet the requirement of the running task. Enough capacity means here that capacity_orig_of(cpu) >= task.requirement. The definition of task.requirement is dependent on the scheduling class. For CFS, utilization is used to select a CPU that has >= capacity value than the cfs_task.util. capacity_orig_of(cpu) >= cfs_task.util DL isn't capacity aware at the moment but can make use of the bandwidth reservation to implement that in a similar manner CFS uses utilization. The following patchset implements that: https://lore.kernel.org/lkml/20190506044836.2914-1-luca.abeni@santannapisa.it/ capacity_orig_of(cpu)/SCHED_CAPACITY >= dl_deadline/dl_runtime For RT we don't have a per task utilization signal and we lack any information in general about what performance requirement the RT task needs. But with the introduction of uclamp, RT tasks can now control that by setting uclamp_min to guarantee a minimum performance point. ATM the uclamp value are only used for frequency selection; but on heterogeneous systems this is not enough and we need to ensure that the capacity of the CPU is >= uclamp_min. 
Which is what is implemented here. capacity_orig_of(cpu) >= rt_task.uclamp_min Note that by default uclamp.min is 1024, which means that RT tasks will always be biased towards the big CPUs, which makes for a better, more predictable behavior for the default case. Must stress that the bias acts as a hint rather than a definite placement strategy. For example, if all big cores are busy executing other RT tasks we can't guarantee that a new RT task will be placed there. On non-heterogeneous systems the original behavior of RT should be retained. Similarly if uclamp is not selected in the config. [ mingo: Minor edits to comments. ] Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191009104611.15363-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 25 ++++++++++++++-- kernel/sched/cpupri.h | 4 ++- kernel/sched/rt.c | 83 +++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 94 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b7abca987d94..1a2719e1350a 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -46,6 +46,8 @@ static int convert_prio(int prio) * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * @fitness_fn: A pointer to a function to do custom checks whether the CPU + * fits a specific criteria so that we only return those CPUs. * * Note: This function returns the recommended CPUs as calculated during the * current invocation.
By the time the call returns, the CPUs may have in @@ -57,7 +59,8 @@ static int convert_prio(int prio) * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)) { int idx = 0; int task_pri = convert_prio(p->prio); @@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, continue; if (lowest_mask) { + int cpu; + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); /* @@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * condition, simply act as though we never hit this * priority level and continue on. */ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) + if (cpumask_empty(lowest_mask)) + continue; + + if (!fitness_fn) + return 1; + + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + } + + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) continue; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 7dc20a3232e7..32dd520db11f 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -18,7 +18,9 @@ struct cpupri { }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e591d40fd645..4043abe45459 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -437,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return rt_se->on_rq; } +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness 
of task @p to run on @cpu taking into account the uclamp + * settings. + * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. + * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; + + /* Only heterogeneous systems can benefit from this check */ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return true; + + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); + + cpu_cap = capacity_orig_of(cpu); + + return cpu_cap >= min(min_cap, max_cap); +} +#else +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + return true; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) @@ -1391,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool test; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1422,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. + * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. 
*/ - if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + test = curr && + unlikely(rt_task(curr)) && + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); /* @@ -1449,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) return; /* * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) return; /* @@ -1601,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, p->cpus_ptr) && + rt_task_fits_capacity(p, cpu)) return 1; return 0; @@ -1643,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, + rt_task_fits_capacity)) return -1; /* No targets found */ /* @@ -2147,12 +2195,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) + bool need_to_push = !task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed 
< 2 || + rq->curr->prio <= p->prio); + + if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) push_rt_tasks(rq); } @@ -2224,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + bool need_to_push = rq->rt.overloaded || + !rt_task_fits_capacity(p, cpu_of(rq)); + + if (p->nr_cpus_allowed > 1 && need_to_push) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) -- cgit v1.2.3 From db5793c5993d265fe6644b6638fcb0758f6b5347 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 18 Dec 2019 05:31:25 +0000 Subject: watchdog: Remove soft_lockup_hrtimer_cnt and related code After commit 9cf57731b63e ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work"), the percpu soft_lockup_hrtimer_cnt is not used any more, so remove it and related code. Signed-off-by: Jisheng Zhang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191218131720.4146aea2@xhacker.debian --- kernel/watchdog.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f41334ef0971..0621301ae8cf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -173,7 +173,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -350,8 +349,6 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); */ static int softlockup_fn(void *data) { - __this_cpu_write(soft_lockup_hrtimer_cnt, - 
__this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); complete(this_cpu_ptr(&softlockup_completion)); -- cgit v1.2.3 From 5f68eb19b5716f8cf3ccfa833cffd1522813b0e8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 20 Dec 2019 12:04:53 +0100 Subject: sched/fair : Improve update_sd_pick_busiest for spare capacity case Similarly to calculate_imbalance() and find_busiest_group(), using the number of idle CPUs when there is only 1 CPU in the group is not efficient because we can't make a difference between a CPU running 1 task and a CPU running dozens of small tasks competing for the same CPU but not enough to overload it. More generally speaking, we should use the number of running tasks when there is the same number of idle CPUs in a group instead of blindly select the 1st one. When the groups have spare capacity and the same number of idle CPUs, we compare the number of running tasks to select the busiest group. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1576839893-26930-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/fair.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2d170b5da0e3..35c105759dfa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8181,14 +8181,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_has_spare: /* - * Select not overloaded group with lowest number of - * idle cpus. We could also compare the spare capacity - * which is more stable but it can end up that the - * group has less spare capacity but finally more idle + * Select not overloaded group with lowest number of idle cpus + * and highest number of running tasks. We could also compare + * the spare capacity which is more stable but it can end up + * that the group has less spare capacity but finally more idle * CPUs which means less opportunity to pull tasks. 
*/ - if (sgs->idle_cpus >= busiest->idle_cpus) + if (sgs->idle_cpus > busiest->idle_cpus) return false; + else if ((sgs->idle_cpus == busiest->idle_cpus) && + (sgs->sum_nr_running <= busiest->sum_nr_running)) + return false; + break; } -- cgit v1.2.3 From 323af6deaf70f204880caf94678350802682e0dc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 8 Jan 2020 13:57:04 +0530 Subject: sched/fair: Load balance aggressively for SCHED_IDLE CPUs The fair scheduler performs periodic load balance on every CPU to check if it can pull some tasks from other busy CPUs. The duration of this periodic load balance is set to sd->balance_interval for the idle CPUs and is calculated by multiplying the sd->balance_interval with the sd->busy_factor (set to 32 by default) for the busy CPUs. The multiplication is done for busy CPUs to avoid doing load balance too often and rather spend more time executing actual task. While that is the right thing to do for the CPUs busy with SCHED_OTHER or SCHED_BATCH tasks, it may not be the optimal thing for CPUs running only SCHED_IDLE tasks. With the recent enhancements in the fair scheduler around SCHED_IDLE CPUs, we now prefer to enqueue a newly-woken task to a SCHED_IDLE CPU instead of other busy or idle CPUs. The same reasoning should be applied to the load balancer as well to make it migrate tasks more aggressively to a SCHED_IDLE CPU, as that will reduce the scheduling latency of the migrated (SCHED_OTHER) tasks. This patch makes minimal changes to the fair scheduler to do the next load balance soon after the last non SCHED_IDLE task is dequeued from a runqueue, i.e. making the CPU SCHED_IDLE. Also the sd->busy_factor is ignored while calculating the balance_interval for such CPUs. This is done to avoid delaying the periodic load balance by few hundred milliseconds for SCHED_IDLE CPUs. 
This is tested on ARM64 Hikey620 platform (octa-core) with the help of rt-app and it is verified, using kernel traces, that the newly SCHED_IDLE CPU does load balancing shortly after it becomes SCHED_IDLE and pulls tasks from other busy CPUs. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/e485827eb8fe7db0943d6f3f6e0f5a4a70272781.1578471925.git.viresh.kumar@linaro.org --- kernel/sched/fair.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 35c105759dfa..d292883694b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5210,6 +5210,18 @@ static inline void update_overutilized_status(struct rq *rq) static inline void update_overutilized_status(struct rq *rq) { } #endif +/* Runqueue only has SCHED_IDLE tasks enqueued */ +static int sched_idle_rq(struct rq *rq) +{ + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + +static int sched_idle_cpu(int cpu) +{ + return sched_idle_rq(cpu_rq(cpu)); +} + /* * The enqueue_task method is called before nr_running is * increased. 
Here we update the fair scheduling stats and @@ -5324,6 +5336,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5370,6 +5383,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5392,15 +5409,6 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/* CPU only has SCHED_IDLE tasks enqueued */ -static int sched_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && - rq->nr_running); -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); @@ -9546,6 +9554,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; + int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -9582,7 +9591,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { @@ -9598,9 +9607,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? 
CPU_IDLE : CPU_NOT_IDLE; + busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); } if (need_serialize) spin_unlock(&balancing); -- cgit v1.2.3 From 7226017ad37a888915628e59a84a2d1e57b40707 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 24 Dec 2019 11:54:04 +0000 Subject: sched/uclamp: Fix a bug in propagating uclamp value in new cgroups When a new cgroup is created, the effective uclamp value wasn't updated with a call to cpu_util_update_eff() that looks at the hierarchy and updates to the most restrictive values. Fix it by ensuring to call cpu_util_update_eff() when a new cgroup becomes online. Without this change, the newly created cgroup uses the default root_task_group uclamp values, which is 1024 for both uclamp_{min, max}, which will cause the rq to be clamped to max, hence cause the system to run at max frequency. The problem was observed on Ubuntu server and was reproduced on Debian and Buildroot rootfs. By default, Ubuntu and Debian create a cpu controller cgroup hierarchy and add all tasks to it - which creates enough noise to keep the rq uclamp value at max most of the time. Imitating this behavior makes the problem visible in Buildroot too which otherwise looks fine since it's a minimal userspace.
Fixes: 0b60ba2dd342 ("sched/uclamp: Propagate parent clamps") Reported-by: Doug Smythies Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Doug Smythies Link: https://lore.kernel.org/lkml/000701d5b965$361b6c60$a2524520$@net/ --- kernel/sched/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7b08d52db93..d0270b14c132 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7099,6 +7099,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) if (parent) sched_online_group(tg, parent); + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* Propagate the effective uclamp value for the new group */ + cpu_util_update_eff(css); +#endif + return 0; } -- cgit v1.2.3 From dcd6dffb0a75741471297724640733fa4e958d72 Mon Sep 17 00:00:00 2001 From: Li Guanglei Date: Wed, 25 Dec 2019 15:44:04 +0800 Subject: sched/core: Fix size of rq::uclamp initialization rq::uclamp is an array of struct uclamp_rq, make sure we clear the whole thing. 
Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Signed-off-by: Li Guanglei Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Link: https://lkml.kernel.org/r/1577259844-12677-1-git-send-email-guangleix.li@gmail.com --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d0270b14c132..fc1dfc007604 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1253,7 +1253,8 @@ static void __init init_uclamp(void) mutex_init(&uclamp_mutex); for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + memset(&cpu_rq(cpu)->uclamp, 0, + sizeof(struct uclamp_rq)*UCLAMP_CNT); cpu_rq(cpu)->uclamp_flags = 0; } -- cgit v1.2.3 From 02d4ac5885a18d326b500b94808f0956dcce2832 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 26 Dec 2019 16:52:24 +0800 Subject: sched/debug: Reset watchdog on all CPUs while processing sysrq-t Lengthy output of sysrq-t may take a lot of time on slow serial console with lots of processes and CPUs. So we need to reset NMI-watchdog to avoid spurious lockup messages, and we also reset softlockup watchdogs on all other CPUs since another CPU might be blocked waiting for us to process an IPI or stop_machine. Add to sysrq_sched_debug_show() as what we did in show_state_filter().
Signed-off-by: Wei Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20191226085224.48942-1-liwei391@huawei.com --- kernel/sched/debug.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f7e4579e746c..879d3ccf3806 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void) int cpu; sched_debug_header(NULL); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + /* + * Need to reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI or stop_machine. + */ + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); print_cpu(NULL, cpu); - + } } /* -- cgit v1.2.3 From 35f4cd96f5551dc1b2641159e7bb7bf91de6600f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 28 Dec 2019 16:19:12 +0000 Subject: stop_machine: Make stop_cpus() static The function stop_cpus() is only used internally by stop_machine to stop multiple CPUs. Make it static.
Signed-off-by: Yangtao Li Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191228161912.24082-1-tiny.windzz@gmail.com --- include/linux/stop_machine.h | 9 --------- kernel/stop_machine.c | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 648298f877da..76d8b09384a7 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -32,7 +32,6 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); -int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); @@ -81,14 +80,6 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } -static inline int stop_cpus(const struct cpumask *cpumask, - cpu_stop_fn_t fn, void *arg) -{ - if (cpumask_test_cpu(raw_smp_processor_id(), cpumask)) - return stop_one_cpu(raw_smp_processor_id(), fn, arg); - return -ENOENT; -} - #endif /* CONFIG_SMP */ /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 5d68ec4c4015..865bb0228ab6 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask, * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. 
*/ -int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; -- cgit v1.2.3 From 9dec1b6949ae9509cdc3edb2d75fda39c9db9fa2 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 2 Jan 2020 18:07:52 +0800 Subject: sched/cputime: move rq parameter in irqtime_account_process_tick Every call to irqtime_account_process_tick() is made from an interrupt, and every caller fetches rq = this_rq() only to pass it in as a parameter. This is unnecessary and increases the code size a little bit. It is better to get the rq inside irqtime_account_process_tick() itself. Size of cputime.o: base 578792 bytes, with this patch 577888 bytes. Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1577959674-255537-1-git-send-email-alex.shi@linux.alibaba.com --- kernel/sched/cputime.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d43318a489f2..cff3e656566d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) * softirq as those do not count in task exec_runtime any more.
*/ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int ticks) + int ticks) { u64 other, cputime = TICK_NSEC * ticks; @@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); - } else if (p == rq->idle) { + } else if (p == this_rq()->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); @@ -392,14 +392,12 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, static void irqtime_account_idle_ticks(int ticks) { - struct rq *rq = this_rq(); - - irqtime_account_process_tick(current, 0, rq, ticks); + irqtime_account_process_tick(current, 0, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int nr_ticks) { } + int nr_ticks) { } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -473,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) void account_process_tick(struct task_struct *p, int user_tick) { u64 cputime, steal; - struct rq *rq = this_rq(); if (vtime_accounting_enabled_this_cpu()) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq, 1); + irqtime_account_process_tick(p, user_tick, 1); return; } @@ -493,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, cputime); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); -- cgit v1.2.3 From fe71bbb21ee14160f73f81b113d71145327a1c0d Mon Sep 17 00:00:00 2001 From: Peng Wang Date: 
Fri, 3 Jan 2020 19:44:00 +0800 Subject: sched/fair: calculate delta runnable load only when it's needed Move the code of calculation for delta_sum/delta_avg to where it is really needed to be done. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200103114400.17668-1-rocking@linux.alibaba.com --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d292883694b7..32c5421b6a25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { + delta_sum = runnable_load_sum - + se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } + + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) -- cgit v1.2.3 From 4c58f57fa6e93318a0899f70d8b99fe6bac22ce8 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Sat, 4 Jan 2020 21:08:28 +0800 Subject: sched/fair: Fix sgc->{min,max}_capacity calculation for SD_OVERLAP commit bf475ce0a3dd ("sched/fair: Add per-CPU min capacity to sched_group_capacity") introduced per-cpu min_capacity. commit e3d6d0cb66f2 ("sched/fair: Add sched_group per-CPU max capacity") introduced per-cpu max_capacity. 
In the SD_OVERLAP case, the local variable 'capacity' represents the sum of CPU capacity of all CPUs in the first sched group (sg) of the sched domain (sd). It is erroneously used to calculate sg's min and max CPU capacity. To fix this use capacity_of(cpu) instead of 'capacity'. The code which achieves this via cpu_rq(cpu)->sd->groups->sgc->capacity (for rq->sd != NULL) can be removed since it delivers the same value as capacity_of(cpu) which is currently only used for the (!rq->sd) case (see update_cpu_capacity()). An sg of the lowest sd (rq->sd or sd->child == NULL) represents a single CPU (and hence sg->sgc->capacity == capacity_of(cpu)). Signed-off-by: Peng Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20200104130828.GA7718@iZj6chx1xj0e0buvshuecpZ --- kernel/sched/fair.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 32c5421b6a25..e84723c5c661 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7802,29 +7802,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_span(sdg)) { - struct sched_group_capacity *sgc; - struct rq *rq = cpu_rq(cpu); + unsigned long cpu_cap = capacity_of(cpu); - /* - * build_sched_domains() -> init_sched_groups_capacity() - * gets here before we've attached the domains to the - * runqueues. - * - * Use capacity_of(), which is set irrespective of domains - * in update_cpu_capacity(). - * - * This avoids capacity from being 0 and - * causing divide-by-zero issues on boot. 
- */ - if (unlikely(!rq->sd)) { - capacity += capacity_of(cpu); - } else { - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; - } - - min_capacity = min(capacity, min_capacity); - max_capacity = max(capacity, max_capacity); + capacity += cpu_cap; + min_capacity = min(cpu_cap, min_capacity); + max_capacity = max(cpu_cap, max_capacity); } } else { /* -- cgit v1.2.3 From 3d817689a62cf71bbb290af18cd26cf9764f38fe Mon Sep 17 00:00:00 2001 From: Wang Long Date: Wed, 18 Dec 2019 20:38:18 +0800 Subject: sched/psi: create /proc/pressure and /proc/pressure/{io|memory|cpu} only when psi enabled When CONFIG_PSI_DEFAULT_DISABLED is set to Y or psi=0 is passed on the command line, I think we should not create /proc/pressure and /proc/pressure/{io|memory|cpu}. In the future, users may determine whether the psi feature is enabled by checking for the existence of the /proc/pressure directory or the /proc/pressure/{io|memory|cpu} files. Signed-off-by: Wang Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/1576672698-32504-1-git-send-email-w@laoqinren.net --- kernel/sched/psi.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ce8f6748678a..db7b50bba3f1 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1280,10 +1280,12 @@ static const struct file_operations psi_cpu_fops = { static int __init psi_proc_init(void) { - proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_fops); - proc_create("pressure/memory", 0, NULL, &psi_memory_fops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + if (psi_enable) { + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + } return 0; } module_init(psi_proc_init); -- cgit v1.2.3 From a4f9a0e51bbf89cb461b1985a1a570e6b87da3b5 Mon Sep 17
00:00:00 2001 From: Vincent Guittot Date: Wed, 15 Jan 2020 11:20:20 +0100 Subject: sched/fair: Remove redundant call to cpufreq_update_util() With commit bef69dd87828 ("sched/cpufreq: Move the cfs_rq_util_change() call to cpufreq_update_util()") update_load_avg() has become the central point for calling cpufreq (not including the update of blocked load). This change helps to simplify further the number of calls to cpufreq_update_util() and to remove last redundant ones. With update_load_avg(), we are now sure that cpufreq_update_util() will be called after every task attachment to a cfs_rq and especially after propagating this event down to the util_avg of the root cfs_rq, which is the level that is used by cpufreq governors like schedutil to set the frequency of a CPU. The SCHED_CPUFREQ_MIGRATION flag forces an early call to cpufreq when the migration happens in a cgroup whereas util_avg of root cfs_rq is not yet updated and this call is duplicated with the one that happens immediately after when the migration event reaches the root cfs_rq. The dedicated flag SCHED_CPUFREQ_MIGRATION is now useless and can be removed. The interface of attach_entity_load_avg() can also be simplified accordingly. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rafael J. 
Wysocki Link: https://lkml.kernel.org/r/1579083620-24943-1-git-send-email-vincent.guittot@linaro.org --- include/linux/sched/cpufreq.h | 1 - kernel/sched/fair.c | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index cc6bcc1e96bc..3ed5aa18593f 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -9,7 +9,6 @@ */ #define SCHED_CPUFREQ_IOWAIT (1U << 0) -#define SCHED_CPUFREQ_MIGRATION (1U << 1) #ifdef CONFIG_CPU_FREQ struct cpufreq_policy; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e84723c5c661..ebf50955fe8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3521,7 +3521,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. 
*/ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3557,7 +3557,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq, flags); + cfs_rq_util_change(cfs_rq, 0); trace_pelt_cfs_tp(cfs_rq); } @@ -3615,7 +3615,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * * IOW we're enqueueing a task on a new CPU. */ - attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); } else if (decayed) { @@ -3872,7 +3872,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -10436,7 +10436,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } -- cgit v1.2.3 From 3e0de271fff77abb933f1b69c213854c3eda9125 Mon Sep 17 00:00:00 2001 From: Hewenliang Date: Thu, 9 Jan 2020 21:56:04 -0500 Subject: idle: fix spelling mistake "iterrupts" -> "interrupts" There is a spelling mistake in a comment in cpuidle_idle_call(). Fix it.
Signed-off-by: Hewenliang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20200110025604.34373-1-hewenliang4@huawei.com --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ffa959e91227..b743bf38f08f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -158,7 +158,7 @@ static void cpuidle_idle_call(void) /* * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only - * activity happens here and in iterrupts (if any). In that case bypass + * activity happens here and in interrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state * available. Possibly also suspend the local tick and the entire * timekeeping to prevent timer interrupts from kicking us out of idle -- cgit v1.2.3 From ccf74128d66ce937876184ad55db2e0276af08d3 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Jan 2020 16:09:15 +0000 Subject: sched/topology: Assert non-NUMA topology masks don't (partially) overlap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit topology.c::get_group() relies on the assumption that non-NUMA domains do not partially overlap. Zeng Tao pointed out in [1] that such topology descriptions, while completely bogus, can end up being exposed to the scheduler. In his example (8 CPUs, 2-node system), we end up with: MC span for CPU3 == 3-7 MC span for CPU4 == 4-7 The first pass through get_group(3, sdd@MC) will result in the following sched_group list: 3 -> 4 -> 5 -> 6 -> 7 ^ / `----------------' And a later pass through get_group(4, sdd@MC) will "corrupt" that to: 3 -> 4 -> 5 -> 6 -> 7 ^ / `-----------' which will completely break things like 'while (sg != sd->groups)' when using CPU3's base sched_domain. 
There already are some architecture-specific checks in place such as x86/kernel/smpboot.c::topology.sane(), but this is something we can detect in the core scheduler, so it seems worthwhile to do so. Warn and abort the construction of the sched domains if such a broken topology description is detected. Note that this is somewhat expensive (O(t.c²), 't' non-NUMA topology levels and 'c' CPUs) and could be gated under SCHED_DEBUG if deemed necessary. Testing ======= Dietmar managed to reproduce this using the following qemu incantation: $ qemu-system-aarch64 -kernel ./Image -hda ./qemu-image-aarch64.img \ -append 'root=/dev/vda console=ttyAMA0 loglevel=8 sched_debug' -smp \ cores=8 --nographic -m 512 -cpu cortex-a53 -machine virt -numa \ node,cpus=0-2,nodeid=0 -numa node,cpus=3-7,nodeid=1 alongside the following drivers/base/arch_topology.c hack (AIUI wouldn't be needed if '-smp cores=X, sockets=Y' would work with qemu): 8<--- @@ -465,6 +465,9 @@ void update_siblings_masks(unsigned int cpuid) if (cpuid_topo->package_id != cpu_topo->package_id) continue; + if ((cpu < 4 && cpuid > 3) || (cpu > 3 && cpuid < 4)) + continue; + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); 8<--- [1]: https://lkml.kernel.org/r/1577088979-8545-1-git-send-email-prime.zeng@hisilicon.com Reported-by: Zeng Tao Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200115160915.22575-1-valentin.schneider@arm.com --- kernel/sched/topology.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6ec1e595b1d4..dfb64c08a407 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1879,6 +1879,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve return sd; } +/* + * Ensure topology masks are sane, i.e. 
there are no conflicts (overlaps) for + * any two given CPUs at this (non-NUMA) topology level. + */ +static bool topology_span_sane(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, int cpu) +{ + int i; + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) + return true; + + /* + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. + */ + for_each_cpu(i, cpu_map) { + if (i == cpu) + continue; + /* + * We should 'and' all those masks with 'cpu_map' to exactly + * match the topology we're about to build, but that can only + * remove CPUs, which only lessens our ability to detect + * overlaps + */ + if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && + cpumask_intersects(tl->mask(cpu), tl->mask(i))) + return false; + } + + return true; +} + /* * Find the sched_domain_topology_level where all CPU capacities are visible * for all CPUs. @@ -1975,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att has_asym = true; } + if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) + goto error; + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); if (tl == sched_domain_topology) -- cgit v1.2.3 From afa70d941f663c69c9a64ec1021bbcfa82f0e54a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 20 Jan 2020 11:29:05 +0530 Subject: sched/fair: Define sched_idle_cpu() only for SMP configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sched_idle_cpu() isn't used for non SMP configuration and with a recent change, we have started getting following warning: kernel/sched/fair.c:5221:12: warning: ‘sched_idle_cpu’ defined but not used [-Wunused-function] Fix that by defining sched_idle_cpu() only for SMP configurations. 
Fixes: 323af6deaf70 ("sched/fair: Load balance aggressively for SCHED_IDLE CPUs") Reported-by: Stephen Rothwell Signed-off-by: Viresh Kumar Signed-off-by: Ingo Molnar Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Link: https://lore.kernel.org/r/f0554f590687478b33914a4aff9f0e6a62886d44.1579499907.git.viresh.kumar@linaro.org --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ebf50955fe8a..fe4e0d775375 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5218,10 +5218,12 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } +#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } +#endif /* * The enqueue_task method is called before nr_running is -- cgit v1.2.3