summaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-10 09:10:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-10 09:10:28 -0700
commit30c999937f69abf935b0228b8411713737377d9e (patch)
tree1ae97b8df4d61e83bdba2388b2254bb407418385 /kernel/sched/core.c
parent493ffd6605b2d3d4dc7008ab927dba319f36671f (diff)
parentfdf756f7127185eeffe00e918e66dfee797f3625 (diff)
downloadlinux-30c999937f69abf935b0228b8411713737377d9e.tar.bz2
Merge tag 'sched-core-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Debuggability: - Change most occurances of BUG_ON() to WARN_ON_ONCE() - Reorganize & fix TASK_ state comparisons, turn it into a bitmap - Update/fix misc scheduler debugging facilities Load-balancing & regular scheduling: - Improve the behavior of the scheduler in presence of lot of SCHED_IDLE tasks - in particular they should not impact other scheduling classes. - Optimize task load tracking, cleanups & fixes - Clean up & simplify misc load-balancing code Freezer: - Rewrite the core freezer to behave better wrt thawing and be simpler in general, by replacing PF_FROZEN with TASK_FROZEN & fixing/adjusting all the fallout. Deadline scheduler: - Fix the DL capacity-aware code - Factor out dl_task_is_earliest_deadline() & replenish_dl_new_period() - Relax/optimize locking in task_non_contending() Cleanups: - Factor out the update_current_exec_runtime() helper - Various cleanups, simplifications" * tag 'sched-core-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits) sched: Fix more TASK_state comparisons sched: Fix TASK_state comparisons sched/fair: Move call to list_last_entry() in detach_tasks sched/fair: Cleanup loop_max and loop_break sched/fair: Make sure to try to detach at least one movable task sched: Show PF_flag holes freezer,sched: Rewrite core freezer logic sched: Widen TAKS_state literals sched/wait: Add wait_event_state() sched/completion: Add wait_for_completion_state() sched: Add TASK_ANY for wait_task_inactive() sched: Change wait_task_inactive()s match_state freezer,umh: Clean up freezer/initrd interaction freezer: Have {,un}lock_system_sleep() save/restore flags sched: Rename task_running() to task_on_cpu() sched/fair: Cleanup for SIS_PROP sched/fair: Default to false in test_idle_cores() sched/fair: Remove useless check in select_idle_core() sched/fair: Avoid double search on same cpu sched/fair: Remove redundant check in select_idle_smt() ...
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c103
1 files changed, 27 insertions, 76 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 60fdc0faf1c9..8cd1b5a8f613 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -143,11 +143,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-#ifdef CONFIG_PREEMPT_RT
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif
+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
@@ -482,8 +478,7 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
* p->se.load, p->rt_priority,
* p->dl.dl_{runtime, deadline, period, flags, bw, density}
* - sched_setnuma(): p->numa_preferred_nid
- * - sched_move_task()/
- * cpu_cgroup_fork(): p->sched_task_group
+ * - sched_move_task(): p->sched_task_group
* - uclamp_update_active() p->uclamp*
*
* p->state <- TASK_*:
@@ -2329,7 +2324,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
- BUG_ON(task_cpu(p) != new_cpu);
+ WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
@@ -2779,7 +2774,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+ if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
/*
* MIGRATE_ENABLE gets here because 'p == current', but for
* anything else we cannot do is_migration_disabled(), punt
@@ -3255,12 +3250,12 @@ out:
/*
* wait_task_inactive - wait for a thread to unschedule.
*
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change. If it changes, i.e. @p might have woken up,
- * then return zero. When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count). If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
@@ -3291,12 +3286,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
+ * But we don't care, since "task_on_cpu()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
- while (task_running(rq, p)) {
- if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+ while (task_on_cpu(rq, p)) {
+ if (!(READ_ONCE(p->__state) & match_state))
return 0;
cpu_relax();
}
@@ -3308,10 +3303,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
*/
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
- running = task_running(rq, p);
+ running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || READ_ONCE(p->__state) == match_state)
+ if (READ_ONCE(p->__state) & match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -6430,7 +6425,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
- !(prev->flags & PF_FROZEN);
+ !(prev_state & TASK_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
@@ -8650,7 +8645,7 @@ again:
if (curr->sched_class != p->sched_class)
goto out_unlock;
- if (task_running(p_rq, p) || !task_is_running(p))
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p);
@@ -8862,7 +8857,7 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
free, task_pid_nr(p), ppid,
read_task_thread_flags(p));
@@ -8890,7 +8885,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p)
* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
* TASK_KILLABLE).
*/
- if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
+ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
return false;
return true;
@@ -9602,9 +9597,6 @@ LIST_HEAD(task_groups);
static struct kmem_cache *task_group_cache __read_mostly;
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
-
void __init sched_init(void)
{
unsigned long ptr = 0;
@@ -9648,14 +9640,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
@@ -10164,7 +10148,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk, int type)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -10180,7 +10164,7 @@ static void sched_change_group(struct task_struct *tsk, int type)
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
- tsk->sched_class->task_change_group(tsk, type);
+ tsk->sched_class->task_change_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -10211,7 +10195,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, TASK_MOVE_GROUP);
+ sched_change_group(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -10289,53 +10273,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_unregister_group(tg);
}
-/*
- * This is called before wake_up_new_task(), therefore we really only
- * have to set its group bits, all the other stuff does not apply.
- */
-static void cpu_cgroup_fork(struct task_struct *task)
-{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(task, &rf);
-
- update_rq_clock(rq);
- sched_change_group(task, TASK_SET_GROUP);
-
- task_rq_unlock(rq, task, &rf);
-}
-
+#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
- int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#endif
- /*
- * Serialize against wake_up_new_task() such that if it's
- * running, we're sure to observe its full state.
- */
- raw_spin_lock_irq(&task->pi_lock);
- /*
- * Avoid calling sched_move_task() before wake_up_new_task()
- * has happened. This would lead to problems with PELT, due to
- * move wanting to detach+attach while we're not attached yet.
- */
- if (READ_ONCE(task->__state) == TASK_NEW)
- ret = -EINVAL;
- raw_spin_unlock_irq(&task->pi_lock);
-
- if (ret)
- break;
}
- return ret;
+ return 0;
}
+#endif
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
@@ -11171,8 +11121,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
- .fork = cpu_cgroup_fork,
+#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
+#endif
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,