summaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-08-01 11:49:06 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-08-01 11:49:06 -0700
commitb167fdffe9e737007cbf7c691cde5fa489ca58d7 (patch)
tree05f36cf3d602cf7e69d22e2737dd4e8fb9849bfa /kernel/sched/core.c
parent0dd1cabe8a4a568252ca70f7530c3ca10e728513 (diff)
parentc17a6ff9321355487d7d5ccaa7d406a0ea06b6c4 (diff)
downloadlinux-b167fdffe9e737007cbf7c691cde5fa489ca58d7.tar.bz2
Merge tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Load-balancing improvements: - Improve NUMA balancing on AMD Zen systems for affine workloads. - Improve the handling of reduced-capacity CPUs in load-balancing. - Energy Model improvements: fix & refine all the energy fairness metrics (PELT), and remove the conservative threshold requiring 6% energy savings to migrate a task. Doing this improves power efficiency for most workloads, and also increases the reliability of energy-efficiency scheduling. - Optimize/tweak select_idle_cpu() to spend (much) less time searching for an idle CPU on overloaded systems. There's reports of several milliseconds spent there on large systems with large workloads ... [ Since the search logic changed, there might be behavioral side effects. ] - Improve NUMA imbalance behavior. On certain systems with spare capacity, initial placement of tasks is non-deterministic, and such an artificial placement imbalance can persist for a long time, hurting (and sometimes helping) performance. The fix is to make fork-time task placement consistent with runtime NUMA balancing placement. Note that some performance regressions were reported against this, caused by workloads that are not memory bandwith limited, which benefit from the artificial locality of the placement bug(s). Mel Gorman's conclusion, with which we concur, was that consistency is better than random workload benefits from non-deterministic bugs: "Given there is no crystal ball and it's a tradeoff, I think it's better to be consistent and use similar logic at both fork time and runtime even if it doesn't have universal benefit." - Improve core scheduling by fixing a bug in sched_core_update_cookie() that caused unnecessary forced idling. - Improve wakeup-balancing by allowing same-LLC wakeup of idle CPUs for newly woken tasks. - Fix a newidle balancing bug that introduced unnecessary wakeup latencies. ABI improvements/fixes: - Do not check capabilities and do not issue capability check denial messages when a scheduler syscall doesn't require privileges. (Such as increasing niceness.) - Add forced-idle accounting to cgroups too. - Fix/improve the RSEQ ABI to not just silently accept unknown flags. (No existing tooling is known to have learned to rely on the previous behavior.) - Depreciate the (unused) RSEQ_CS_FLAG_NO_RESTART_ON_* flags. Optimizations: - Optimize & simplify leaf_cfs_rq_list() - Micro-optimize set_nr_{and_not,if}_polling() via try_cmpxchg(). Misc fixes & cleanups: - Fix the RSEQ self-tests on RISC-V and Glibc 2.35 systems. - Fix a full-NOHZ bug that can in some cases result in the tick not being re-enabled when the last SCHED_RT task is gone from a runqueue but there's still SCHED_OTHER tasks around. - Various PREEMPT_RT related fixes. - Misc cleanups & smaller fixes" * tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits) rseq: Kill process when unknown flags are encountered in ABI structures rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* flags sched/core: Fix the bug that task won't enqueue into core tree when update cookie nohz/full, sched/rt: Fix missed tick-reenabling bug in dequeue_task_rt() sched/core: Always flush pending blk_plug sched/fair: fix case with reduced capacity CPU sched/core: Use try_cmpxchg in set_nr_{and_not,if}_polling sched/core: add forced idle accounting for cgroups sched/fair: Remove the energy margin in feec() sched/fair: Remove task_util from effective utilization in feec() sched/fair: Use the same cpumask per-PD throughout find_energy_efficient_cpu() sched/fair: Rename select_idle_mask to select_rq_mask sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util() sched/fair: Decay task PELT values during wakeup migration sched/fair: Provide u64 read for 32-bits arch helper sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of util_avg sched: only perform capability check on privileged operation sched: Remove unused function group_first_cpu() sched/fair: Remove redundant word " *" selftests/rseq: check if libc rseq support is registered ...
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c215
1 files changed, 124 insertions, 91 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da0bf6fe9ecd..5555e49c4e12 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -873,15 +873,11 @@ static inline void hrtick_rq_init(struct rq *rq)
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
- typeof(*_ptr) _old, _val = *_ptr; \
+ typeof(*_ptr) _val = *_ptr; \
\
- for (;;) { \
- _old = cmpxchg(_ptr, _val, _val | _mask); \
- if (_old == _val) \
- break; \
- _val = _old; \
- } \
- _old; \
+ do { \
+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
+ _val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
@@ -890,7 +886,7 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
@@ -905,30 +901,28 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+ typeof(ti->flags) val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
- if (old == val)
+ if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
break;
- val = old;
}
return true;
}
#else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
@@ -3808,7 +3802,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(int cpu)
{
/*
* Do not complicate things with the async wake_list while the CPU is
@@ -3824,13 +3818,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
+ if (cpu == smp_processor_id())
+ return false;
+
/*
- * If the task is descheduling and the only running task on the
- * CPU then use the wakelist to offload the task activation to
- * the soon-to-be-idle CPU as the current CPU is likely busy.
- * nr_running is checked to avoid unnecessary task stacking.
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
*/
- if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ if (!cpu_rq(cpu)->nr_running)
return true;
return false;
@@ -3838,10 +3840,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
- if (WARN_ON_ONCE(cpu == smp_processor_id()))
- return false;
-
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) {
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -4163,7 +4162,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* scheduling.
*/
if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
goto unlock;
/*
@@ -4753,7 +4752,8 @@ static inline void prepare_task(struct task_struct *next)
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
*
- * See the ttwu() WF_ON_CPU case and its ordering comment.
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
#endif
@@ -6500,8 +6500,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
io_wq_worker_sleeping(tsk);
}
- if (tsk_is_pi_blocked(tsk))
- return;
+ /*
+ * spinlock and rwlock must not flush block requests. This will
+ * deadlock if the callback attempts to acquire a lock which is
+ * already acquired.
+ */
+ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
@@ -6998,17 +7002,29 @@ out_unlock:
EXPORT_SYMBOL(set_user_nice);
/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
@@ -7137,12 +7153,14 @@ struct task_struct *idle_task(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum cpu_util_type type,
+ enum cpu_util_type type,
struct task_struct *p)
{
- unsigned long dl_util, util, irq;
+ unsigned long dl_util, util, irq, max;
struct rq *rq = cpu_rq(cpu);
+ max = arch_scale_cpu_capacity(cpu);
+
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
@@ -7222,10 +7240,9 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
return min(max, util);
}
-unsigned long sched_cpu_util(int cpu, unsigned long max)
+unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
- ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
@@ -7287,6 +7304,69 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -7328,58 +7408,11 @@ recheck:
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
@@ -9531,7 +9564,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
void __init sched_init(void)
{
@@ -9580,7 +9613,7 @@ void __init sched_init(void)
for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+ per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_CPUMASK_OFFSTACK */