summaryrefslogtreecommitdiffstats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c303
1 files changed, 264 insertions, 39 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c..4cc56c91e06e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4280,14 +4280,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
}
#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max)
{
- return clamp(task_util_est(p),
- uclamp_eff_value(p, UCLAMP_MIN),
- uclamp_eff_value(p, UCLAMP_MAX));
+ return clamp(task_util_est(p), uclamp_min, uclamp_max);
}
#else
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max)
{
return task_util_est(p);
}
@@ -4426,10 +4428,139 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p,
- unsigned long capacity)
+static inline int util_fits_cpu(unsigned long util,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max,
+ int cpu)
{
- return fits_capacity(uclamp_task_util(p), capacity);
+ unsigned long capacity_orig, capacity_orig_thermal;
+ unsigned long capacity = capacity_of(cpu);
+ bool fits, uclamp_max_fits;
+
+ /*
+ * Check if the real util fits without any uclamp boost/cap applied.
+ */
+ fits = fits_capacity(util, capacity);
+
+ if (!uclamp_is_used())
+ return fits;
+
+ /*
+ * We must use capacity_orig_of() for comparing against uclamp_min and
+ * uclamp_max. We only care about capacity pressure (by using
+ * capacity_of()) for comparing against the real util.
+ *
+ * If a task is boosted to 1024 for example, we don't want a tiny
+ * pressure to skew the check whether it fits a CPU or not.
+ *
+ * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+ * should fit a little cpu even if there's some pressure.
+ *
+ * Only exception is for thermal pressure since it has a direct impact
+ * on available OPP of the system.
+ *
+ * We honour it for uclamp_min only as a drop in performance level
+ * could result in not getting the requested minimum performance level.
+ *
+ * For uclamp_max, we can tolerate a drop in performance level as the
+ * goal is to cap the task. So it's okay if it's getting less.
+ *
+ * In case of capacity inversion we should honour the inverted capacity
+ * for both uclamp_min and uclamp_max all the time.
+ */
+ capacity_orig = cpu_in_capacity_inversion(cpu);
+ if (capacity_orig) {
+ capacity_orig_thermal = capacity_orig;
+ } else {
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+ }
+
+ /*
+ * We want to force a task to fit a cpu as implied by uclamp_max.
+ * But we do have some corner cases to cater for..
+ *
+ *
+ * C=z
+ * | ___
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | |
+ * | | | | | | | (util somewhere in this region)
+ * | | | | | | |
+ * | | | | | | |
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * In the above example if a task is capped to a specific performance
+ * point, y, then when:
+ *
+ * * util = 80% of x then it does not fit on cpu0 and should migrate
+ * to cpu1
+ * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * uclamp_max request.
+ *
+ * which is what we're enforcing here. A task always fits if
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+ * the normal upmigration rules should withhold still.
+ *
+ * Only exception is when we are on max capacity, then we need to be
+ * careful not to block overutilized state. This is so because:
+ *
+ * 1. There's no concept of capping at max_capacity! We can't go
+ * beyond this performance level anyway.
+ * 2. The system is being saturated when we're operating near
+ * max capacity, it doesn't make sense to block overutilized.
+ */
+ uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+ uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+ fits = fits || uclamp_max_fits;
+
+ /*
+ *
+ * C=z
+ * | ___ (region a, capped, util >= uclamp_max)
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+ * | | | | | | |
+ * | | | | | | | (region c, boosted, util < uclamp_min)
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * a) If util > uclamp_max, then we're capped, we don't care about
+ * actual fitness value here. We only care if uclamp_max fits
+ * capacity without taking margin/pressure into account.
+ * See comment above.
+ *
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
+ * fits_capacity() rules apply. Except we need to ensure that we
+ * enforce we remain within uclamp_max, see comment above.
+ *
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+ * need to take into account the boosted value fits the CPU without
+ * taking margin/pressure into account.
+ *
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
+ * just need to consider an extra check for case (c) after ensuring we
+ * handle the case uclamp_min > uclamp_max.
+ */
+ uclamp_min = min(uclamp_min, uclamp_max);
+ if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
+ fits = fits && (uclamp_min <= capacity_orig_thermal);
+
+ return fits;
+}
+
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
+{
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+ return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4442,7 +4573,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return;
}
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -5862,7 +5993,10 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
+ unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
static inline void update_overutilized_status(struct rq *rq)
@@ -6654,21 +6788,23 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long task_util, best_cap = 0;
+ unsigned long task_util, util_min, util_max, best_cap = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- task_util = uclamp_task_util(p);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (fits_capacity(task_util, cpu_cap))
+ if (util_fits_cpu(task_util, util_min, util_max, cpu))
return cpu;
if (cpu_cap > best_cap) {
@@ -6680,10 +6816,13 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
-static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
+static inline bool asym_fits_cpu(unsigned long util,
+ unsigned long util_min,
+ unsigned long util_max,
+ int cpu)
{
if (sched_asym_cpucap_active())
- return fits_capacity(task_util, capacity_of(cpu));
+ return util_fits_cpu(util, util_min, util_max, cpu);
return true;
}
@@ -6695,7 +6834,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
bool has_idle_core = false;
struct sched_domain *sd;
- unsigned long task_util;
+ unsigned long task_util, util_min, util_max;
int i, recent_used_cpu;
/*
@@ -6704,7 +6843,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
- task_util = uclamp_task_util(p);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
}
/*
@@ -6713,7 +6854,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
- asym_fits_capacity(task_util, target))
+ asym_fits_cpu(task_util, util_min, util_max, target))
return target;
/*
@@ -6721,7 +6862,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
- asym_fits_capacity(task_util, prev))
+ asym_fits_cpu(task_util, util_min, util_max, prev))
return prev;
/*
@@ -6736,7 +6877,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
in_task() &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1 &&
- asym_fits_capacity(task_util, prev)) {
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
return prev;
}
@@ -6748,7 +6889,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
- asym_fits_capacity(task_util, recent_used_cpu)) {
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
return recent_used_cpu;
}
@@ -7044,6 +7185,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
struct sched_domain *sd;
@@ -7068,7 +7211,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = prev_cpu;
sync_entity_load_avg(&p->se);
- if (!task_util_est(p))
+ if (!uclamp_task_util(p, p_util_min, p_util_max))
goto unlock;
eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -7076,7 +7219,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long cpu_cap, cpu_thermal_cap, util;
unsigned long cur_delta, max_spare_cap = 0;
- bool compute_prev_delta = false;
+ unsigned long rq_util_min, rq_util_max;
+ unsigned long util_min, util_max;
+ unsigned long prev_spare_cap = 0;
int max_spare_cap_cpu = -1;
unsigned long base_energy;
@@ -7112,26 +7257,45 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* much capacity we can get out of the CPU; this is
* aligned with sched_cpu_util().
*/
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
- if (!fits_capacity(util, cpu_cap))
+ if (uclamp_is_used()) {
+ if (uclamp_rq_is_idle(cpu_rq(cpu))) {
+ util_min = p_util_min;
+ util_max = p_util_max;
+ } else {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. Ie: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+ }
+ if (!util_fits_cpu(util, util_min, util_max, cpu))
continue;
lsub_positive(&cpu_cap, util);
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
- compute_prev_delta = true;
+ prev_spare_cap = cpu_cap;
} else if (cpu_cap > max_spare_cap) {
/*
* Find the CPU with the maximum spare capacity
- * in the performance domain.
+ * among the remaining CPUs in the performance
+ * domain.
*/
max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
}
}
- if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+ if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
continue;
eenv_pd_busy_time(&eenv, cpus, p);
@@ -7139,7 +7303,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */
- if (compute_prev_delta) {
+ if (prev_spare_cap > 0) {
prev_delta = compute_energy(&eenv, pd, cpus, p,
prev_cpu);
/* CPU utilization has changed */
@@ -7150,7 +7314,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
- if (max_spare_cap_cpu >= 0) {
+ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
cur_delta = compute_energy(&eenv, pd, cpus, p,
max_spare_cap_cpu);
/* CPU utilization has changed */
@@ -8276,7 +8440,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_misfit:
/* This is not a misfit task */
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ if (task_fits_cpu(p, env->src_cpu))
goto next;
env->imbalance = 0;
@@ -8665,16 +8829,73 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
+ unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
+ struct rq *rq = cpu_rq(cpu);
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
+ rq->cpu_capacity_orig = capacity_orig;
if (!capacity)
capacity = 1;
- cpu_rq(cpu)->cpu_capacity = capacity;
- trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+ rq->cpu_capacity = capacity;
+
+ /*
+ * Detect if the performance domain is in capacity inversion state.
+ *
+ * Capacity inversion happens when another perf domain with equal or
+ * lower capacity_orig_of() ends up having higher capacity than this
+ * domain after subtracting thermal pressure.
+ *
+ * We only take into account thermal pressure in this detection as it's
+ * the only metric that actually results in *real* reduction of
+ * capacity due to performance points (OPPs) being dropped/become
+ * unreachable due to thermal throttling.
+ *
+ * We assume:
+ * * That all cpus in a perf domain have the same capacity_orig
+ * (same uArch).
+ * * Thermal pressure will impact all cpus in this perf domain
+ * equally.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
+ struct perf_domain *pd = rcu_dereference(rq->rd->pd);
+
+ rq->cpu_capacity_inverted = 0;
+
+ for (; pd; pd = pd->next) {
+ struct cpumask *pd_span = perf_domain_span(pd);
+ unsigned long pd_cap_orig, pd_cap;
+
+ cpu = cpumask_any(pd_span);
+ pd_cap_orig = arch_scale_cpu_capacity(cpu);
+
+ if (capacity_orig < pd_cap_orig)
+ continue;
+
+ /*
+ * handle the case of multiple perf domains have the
+ * same capacity_orig but one of them is under higher
+ * thermal pressure. We record it as capacity
+ * inversion.
+ */
+ if (capacity_orig == pd_cap_orig) {
+ pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
+
+ if (pd_cap > inv_cap) {
+ rq->cpu_capacity_inverted = inv_cap;
+ break;
+ }
+ } else if (pd_cap_orig > inv_cap) {
+ rq->cpu_capacity_inverted = inv_cap;
+ break;
+ }
+ }
+ }
+
+ trace_sched_cpu_capacity_tp(rq);
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
@@ -9281,6 +9502,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
memset(sgs, 0, sizeof(*sgs));
+ /* Assume that task can't fit any CPU of the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
+ sgs->group_misfit_task_load = 1;
+
for_each_cpu(i, sched_group_span(group)) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -9300,12 +9525,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
- }
+ /* Check if task fits in the CPU */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load &&
+ task_fits_cpu(p, i))
+ sgs->group_misfit_task_load = 0;
- /* Check if task fits in the group */
- if (sd->flags & SD_ASYM_CPUCAPACITY &&
- !task_fits_capacity(p, group->sgc->max_capacity)) {
- sgs->group_misfit_task_load = 1;
}
sgs->group_capacity = group->sgc->capacity;