diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-28 10:07:09 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-28 10:07:09 -0800 |
commit | c677124e631d97130e4ff7db6e10acdfb7a82321 (patch) | |
tree | 514d9ed975d58dc077ee73a2469583ccc95a2aa1 /kernel | |
parent | c0e809e244804d428bcd976eaf9369f60508ea8a (diff) | |
parent | afa70d941f663c69c9a64ec1021bbcfa82f0e54a (diff) | |
download | linux-c677124e631d97130e4ff7db6e10acdfb7a82321.tar.bz2 |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"These were the main changes in this cycle:
- More -rt motivated separation of CONFIG_PREEMPT and
CONFIG_PREEMPTION.
- Add more low level scheduling topology sanity checks and warnings
to filter out nonsensical topologies that break scheduling.
- Extend uclamp constraints to influence wakeup CPU placement
- Make the RT scheduler more aware of asymmetric topologies and CPU
capacities, via uclamp metrics, if CONFIG_UCLAMP_TASK=y
- Make idle CPU selection more consistent
- Various fixes, smaller cleanups, updates and enhancements - please
see the git log for details"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (58 commits)
sched/fair: Define sched_idle_cpu() only for SMP configurations
sched/topology: Assert non-NUMA topology masks don't (partially) overlap
idle: fix spelling mistake "iterrupts" -> "interrupts"
sched/fair: Remove redundant call to cpufreq_update_util()
sched/psi: create /proc/pressure and /proc/pressure/{io|memory|cpu} only when psi enabled
sched/fair: Fix sgc->{min,max}_capacity calculation for SD_OVERLAP
sched/fair: calculate delta runnable load only when it's needed
sched/cputime: move rq parameter in irqtime_account_process_tick
stop_machine: Make stop_cpus() static
sched/debug: Reset watchdog on all CPUs while processing sysrq-t
sched/core: Fix size of rq::uclamp initialization
sched/uclamp: Fix a bug in propagating uclamp value in new cgroups
sched/fair: Load balance aggressively for SCHED_IDLE CPUs
sched/fair : Improve update_sd_pick_busiest for spare capacity case
watchdog: Remove soft_lockup_hrtimer_cnt and related code
sched/rt: Make RT capacity-aware
sched/fair: Make EAS wakeup placement consider uclamp restrictions
sched/fair: Make task_fits_capacity() consider uclamp restrictions
sched/uclamp: Rename uclamp_util_with() into uclamp_rq_util_with()
sched/uclamp: Make uclamp util helpers use and return UL values
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.locks | 12 | ||||
-rw-r--r-- | kernel/cpu.c | 13 | ||||
-rw-r--r-- | kernel/sched/clock.c | 6 | ||||
-rw-r--r-- | kernel/sched/core.c | 34 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 2 | ||||
-rw-r--r-- | kernel/sched/cpupri.c | 25 | ||||
-rw-r--r-- | kernel/sched/cpupri.h | 4 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 15 | ||||
-rw-r--r-- | kernel/sched/debug.c | 11 | ||||
-rw-r--r-- | kernel/sched/fair.c | 171 | ||||
-rw-r--r-- | kernel/sched/idle.c | 2 | ||||
-rw-r--r-- | kernel/sched/pelt.c | 20 | ||||
-rw-r--r-- | kernel/sched/psi.c | 10 | ||||
-rw-r--r-- | kernel/sched/rt.c | 83 | ||||
-rw-r--r-- | kernel/sched/sched.h | 24 | ||||
-rw-r--r-- | kernel/sched/topology.c | 39 | ||||
-rw-r--r-- | kernel/sched/wait_bit.c | 1 | ||||
-rw-r--r-- | kernel/stop_machine.c | 32 | ||||
-rw-r--r-- | kernel/workqueue.c | 2 |
19 files changed, 315 insertions, 191 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index e0852dc333ac..3de8fd11873b 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -101,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK # unlock and unlock_irq functions are inlined when: # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y # or -# - DEBUG_SPINLOCK=n and PREEMPT=n +# - DEBUG_SPINLOCK=n and PREEMPTION=n # # unlock_bh and unlock_irqrestore functions are inlined when: # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y @@ -139,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -168,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE config INLINE_READ_UNLOCK def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK + depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK config INLINE_READ_UNLOCK_BH def_bool y @@ -176,7 +176,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -205,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE config INLINE_WRITE_UNLOCK def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK + depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK config INLINE_WRITE_UNLOCK_BH def_bool y @@ -213,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y diff --git a/kernel/cpu.c b/kernel/cpu.c index 4dc279ed3b2d..9c706af713fb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -525,8 +525,7 @@ static int bringup_wait_for_ap(unsigned int cpu) if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; - /* Unpark the stopper thread and the hotplug thread of the target cpu */ - stop_machine_unpark(cpu); + /* Unpark the hotplug thread of the target cpu */ kthread_unpark(st->thread); /* @@ -1089,8 +1088,8 @@ void notify_cpu_starting(unsigned int cpu) /* * Called from the idle task. Wake up the controlling task which brings the - * stopper and the hotplug thread of the upcoming CPU up and then delegates - * the rest of the online bringup to the hotplug thread. + * hotplug thread of the upcoming CPU up and then delegates the rest of the + * online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { @@ -1100,6 +1099,12 @@ void cpuhp_online_idle(enum cpuhp_state state) if (state != CPUHP_AP_ONLINE_IDLE) return; + /* + * Unpart the stopper thread before we start the idle loop (and start + * scheduling); this ensures the stopper task is always available. + */ + stop_machine_unpark(smp_processor_id()); + st->state = CPUHP_AP_ONLINE_IDLE; complete_ap_thread(st, true); } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 1152259a4ca0..12bca64dff73 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return sched_clock(); preempt_disable_notrace(); @@ -393,7 +393,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -460,7 +460,7 @@ void __init sched_clock_init(void) u64 sched_clock_cpu(int cpu) { - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return 0; return sched_clock(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 90e4b00ace89..fc1dfc007604 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -919,17 +919,17 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; /* Task currently refcounted: use back-annotated (effective) value */ if (p->uclamp[clamp_id].active) - return p->uclamp[clamp_id].value; + return (unsigned long)p->uclamp[clamp_id].value; uc_eff = uclamp_eff_get(p, clamp_id); - return uc_eff.value; + return (unsigned long)uc_eff.value; } /* @@ -1253,7 +1253,8 @@ static void __init init_uclamp(void) mutex_init(&uclamp_mutex); for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + memset(&cpu_rq(cpu)->uclamp, 0, + sizeof(struct uclamp_rq)*UCLAMP_CNT); cpu_rq(cpu)->uclamp_flags = 0; } @@ -4504,7 +4505,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio, delta; + int old_prio; struct rq_flags rf; struct rq *rq; @@ -4538,19 +4539,18 @@ void set_user_nice(struct task_struct *p, long nice) set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); - delta = p->prio - old_prio; - if (queued) { + if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_curr(rq); - } if (running) set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); + out_unlock: task_rq_unlock(rq, p, &rf); } @@ -7100,6 +7100,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) if (parent) sched_online_group(tg, parent); + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* Propagate the effective uclamp value for the new group */ + cpu_util_update_eff(css); +#endif + return 0; } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9b8916fd00a2..7fbaee24c824 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -238,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, */ util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) - util = uclamp_util_with(rq, util, p); + util = uclamp_rq_util_with(rq, util, p); dl_util = cpu_util_dl(rq); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b7abca987d94..1a2719e1350a 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -46,6 +46,8 @@ static int convert_prio(int prio) * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * @fitness_fn: A pointer to a function to do custom checks whether the CPU + * fits a specific criteria so that we only return those CPUs. * * Note: This function returns the recommended CPUs as calculated during the * current invocation. By the time the call returns, the CPUs may have in @@ -57,7 +59,8 @@ static int convert_prio(int prio) * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)) { int idx = 0; int task_pri = convert_prio(p->prio); @@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, continue; if (lowest_mask) { + int cpu; + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); /* @@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * condition, simply act as though we never hit this * priority level and continue on. */ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) + if (cpumask_empty(lowest_mask)) + continue; + + if (!fitness_fn) + return 1; + + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + } + + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) continue; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 7dc20a3232e7..32dd520db11f 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -18,7 +18,9 @@ struct cpupri { }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d43318a489f2..cff3e656566d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int ticks) + int ticks) { u64 other, cputime = TICK_NSEC * ticks; @@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); - } else if (p == rq->idle) { + } else if (p == this_rq()->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); @@ -392,14 +392,12 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, static void irqtime_account_idle_ticks(int ticks) { - struct rq *rq = this_rq(); - - irqtime_account_process_tick(current, 0, rq, ticks); + irqtime_account_process_tick(current, 0, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int nr_ticks) { } + int nr_ticks) { } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -473,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) void account_process_tick(struct task_struct *p, int user_tick) { u64 cputime, steal; - struct rq *rq = this_rq(); if (vtime_accounting_enabled_this_cpu()) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq, 1); + irqtime_account_process_tick(p, user_tick, 1); return; } @@ -493,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, cputime); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f7e4579e746c..879d3ccf3806 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void) int cpu; sched_debug_header(NULL); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + /* + * Need to reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI or stop_machine. + */ + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); print_cpu(NULL, cpu); - + } } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ba749f579714..fe4e0d775375 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { + delta_sum = runnable_load_sum - + se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } + + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3520,7 +3521,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3556,7 +3557,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq, flags); + cfs_rq_util_change(cfs_rq, 0); trace_pelt_cfs_tp(cfs_rq); } @@ -3614,7 +3615,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * * IOW we're enqueueing a task on a new CPU. */ - attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); } else if (decayed) { @@ -3711,6 +3712,20 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -3822,7 +3837,7 @@ done: static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return fits_capacity(task_util_est(p), capacity); + return fits_capacity(uclamp_task_util(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -3857,7 +3872,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -5196,6 +5211,20 @@ static inline void update_overutilized_status(struct rq *rq) static inline void update_overutilized_status(struct rq *rq) { } #endif +/* Runqueue only has SCHED_IDLE tasks enqueued */ +static int sched_idle_rq(struct rq *rq) +{ + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + +#ifdef CONFIG_SMP +static int sched_idle_cpu(int cpu) +{ + return sched_idle_rq(cpu_rq(cpu)); +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5310,6 +5339,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5356,6 +5386,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5378,15 +5412,6 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/* CPU only has SCHED_IDLE tasks enqueued */ -static int sched_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && - rq->nr_running); -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); @@ -5588,7 +5613,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; - int shallowest_idle_cpu = -1, si_cpu = -1; + int shallowest_idle_cpu = -1; int i; /* Check if we have any choice: */ @@ -5597,6 +5622,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (sched_idle_cpu(i)) + return i; + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5619,12 +5647,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else if (shallowest_idle_cpu == -1 && si_cpu == -1) { - if (sched_idle_cpu(i)) { - si_cpu = i; - continue; - } - + } else if (shallowest_idle_cpu == -1) { load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; @@ -5633,11 +5656,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } - if (shallowest_idle_cpu != -1) - return shallowest_idle_cpu; - if (si_cpu != -1) - return si_cpu; - return least_loaded_cpu; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, @@ -5790,7 +5809,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int */ static int select_idle_smt(struct task_struct *p, int target) { - int cpu, si_cpu = -1; + int cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5798,13 +5817,11 @@ static int select_idle_smt(struct task_struct *p, int target) for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } - return si_cpu; + return -1; } #else /* CONFIG_SCHED_SMT */ @@ -5828,12 +5845,13 @@ static inline int select_idle_smt(struct task_struct *p, int target) */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; u64 time, cost; s64 delta; int this = smp_processor_id(); - int cpu, nr = INT_MAX, si_cpu = -1; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5859,15 +5877,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) - return si_cpu; - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - continue; - if (available_idle_cpu(cpu)) + return -1; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } time = cpu_clock(this) - time; @@ -6268,9 +6284,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - /* Skip CPUs that will be overutilized. */ util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); + spare_cap = cpu_cap - util; + + /* + * Skip CPUs that cannot satisfy the capacity request. + * IOW, placing the task there would make the CPU + * overutilized. Take uclamp into account to see how + * much capacity we can get out of the CPU; this is + * aligned with schedutil_cpu_util(). + */ + util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) continue; @@ -6285,7 +6310,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Find the CPU with the maximum spare capacity in * the performance domain */ - spare_cap = cpu_cap - util; if (spare_cap > max_spare_cap) { max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; @@ -7780,29 +7804,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_span(sdg)) { - struct sched_group_capacity *sgc; - struct rq *rq = cpu_rq(cpu); + unsigned long cpu_cap = capacity_of(cpu); - /* - * build_sched_domains() -> init_sched_groups_capacity() - * gets here before we've attached the domains to the - * runqueues. - * - * Use capacity_of(), which is set irrespective of domains - * in update_cpu_capacity(). - * - * This avoids capacity from being 0 and - * causing divide-by-zero issues on boot. - */ - if (unlikely(!rq->sd)) { - capacity += capacity_of(cpu); - } else { - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; - } - - min_capacity = min(capacity, min_capacity); - max_capacity = max(capacity, max_capacity); + capacity += cpu_cap; + min_capacity = min(cpu_cap, min_capacity); + max_capacity = max(cpu_cap, max_capacity); } } else { /* @@ -8168,14 +8174,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_has_spare: /* - * Select not overloaded group with lowest number of - * idle cpus. We could also compare the spare capacity - * which is more stable but it can end up that the - * group has less spare capacity but finally more idle + * Select not overloaded group with lowest number of idle cpus + * and highest number of running tasks. We could also compare + * the spare capacity which is more stable but it can end up + * that the group has less spare capacity but finally more idle * CPUs which means less opportunity to pull tasks. */ - if (sgs->idle_cpus >= busiest->idle_cpus) + if (sgs->idle_cpus > busiest->idle_cpus) + return false; + else if ((sgs->idle_cpus == busiest->idle_cpus) && + (sgs->sum_nr_running <= busiest->sum_nr_running)) return false; + break; } @@ -9529,6 +9539,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; + int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -9565,7 +9576,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { @@ -9581,9 +9592,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); } if (need_serialize) spin_unlock(&balancing); @@ -10333,6 +10345,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; + if (rq->cfs.nr_running == 1) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -10423,7 +10438,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ffa959e91227..b743bf38f08f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -158,7 +158,7 @@ static void cpuidle_idle_call(void) /* * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only - * activity happens here and in iterrupts (if any). In that case bypass + * activity happens here and in interrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state * available. Possibly also suspend the local tick and the entire * timekeeping to prevent timer interrupts from kicking us out of idle diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a96db50d40e0..bd006b79b360 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * Step 2 */ delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); + if (load) { + /* + * This relies on the: + * + * if (!load) + * runnable = running = 0; + * + * clause from ___update_load_sum(); this results in + * the below usage of @contrib to dissapear entirely, + * so no point in calculating it. + */ + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } } sa->period_contrib = delta; @@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, * this happens during idle_balance() which calls - * update_blocked_averages() + * update_blocked_averages(). + * + * Also see the comment in accumulate_sum(). */ if (!load) runnable = running = 0; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ce8f6748678a..db7b50bba3f1 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1280,10 +1280,12 @@ static const struct file_operations psi_cpu_fops = { static int __init psi_proc_init(void) { - proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_fops); - proc_create("pressure/memory", 0, NULL, &psi_memory_fops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + if (psi_enable) { + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + } return 0; } module_init(psi_proc_init); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e591d40fd645..4043abe45459 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -437,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return rt_se->on_rq; } +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness of task @p to run on @cpu taking into account the uclamp + * settings. + * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. + * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; + + /* Only heterogeneous systems can benefit from this check */ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return true; + + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); + + cpu_cap = capacity_orig_of(cpu); + + return cpu_cap >= min(min_cap, max_cap); +} +#else +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + return true; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) @@ -1391,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool test; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1422,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. + * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. */ - if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + test = curr && + unlikely(rt_task(curr)) && + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); /* @@ -1449,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) return; /* * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) return; /* @@ -1601,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, p->cpus_ptr) && + rt_task_fits_capacity(p, cpu)) return 1; return 0; @@ -1643,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, + rt_task_fits_capacity)) return -1; /* No targets found */ /* @@ -2147,12 +2195,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) + bool need_to_push = !task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio); + + if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) push_rt_tasks(rq); } @@ -2224,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + bool need_to_push = rq->rt.overloaded || + !rt_task_fits_capacity(p, cpu_of(rq)); + + if (p->nr_cpus_allowed > 1 && need_to_push) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 280a3c735935..1a88dc8ad11b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2300,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline -unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { - unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2324,18 +2324,10 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util, return clamp(util, min_util, max_util); } - -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -{ - return uclamp_util_with(rq, util, NULL); -} #else /* CONFIG_UCLAMP_TASK */ -static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) -{ - return util; -} -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { return util; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6ec1e595b1d4..dfb64c08a407 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1880,6 +1880,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve } /* + * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for + * any two given CPUs at this (non-NUMA) topology level. + */ +static bool topology_span_sane(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, int cpu) +{ + int i; + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) + return true; + + /* + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. + */ + for_each_cpu(i, cpu_map) { + if (i == cpu) + continue; + /* + * We should 'and' all those masks with 'cpu_map' to exactly + * match the topology we're about to build, but that can only + * remove CPUs, which only lessens our ability to detect + * overlaps + */ + if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && + cpumask_intersects(tl->mask(cpu), tl->mask(i))) + return false; + } + + return true; +} + +/* * Find the sched_domain_topology_level where all CPU capacities are visible * for all CPUs. */ @@ -1975,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att has_asym = true; } + if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) + goto error; + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); if (tl == sched_domain_topology) diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 45eba18a2898..02ce292b9bc0 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int .bit_nr = -1, }, .wq_entry = { + .flags = flags, .private = current, .func = var_wake_function, .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1fe34a9fabc2..865bb0228ab6 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask, * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. */ -int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; @@ -453,36 +453,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) return ret; } -/** - * try_stop_cpus - try to stop multiple cpus - * @cpumask: cpus to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Identical to stop_cpus() except that it fails with -EAGAIN if - * someone else is already using the facility. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * -EAGAIN if someone else is already stopping cpus, -ENOENT if - * @fn(@arg) was not executed at all because all cpus in @cpumask were - * offline; otherwise, 0 if all executions of @fn returned 0, any non - * zero return value if any returned non zero. - */ -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) -{ - int ret; - - /* static works are used, process one request at a time */ - if (!mutex_trylock(&stop_cpus_mutex)) - return -EAGAIN; - ret = __stop_cpus(cpumask, fn, arg); - mutex_unlock(&stop_cpus_mutex); - return ret; -} - static int cpu_stop_should_run(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4bdfa270a37b..301db4406bc3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2280,7 +2280,7 @@ __acquires(&pool->lock) } /* - * The following prevents a kworker from hogging CPU on !PREEMPT + * The following prevents a kworker from hogging CPU on !PREEMPTION * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in |