diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 303 |
1 files changed, 264 insertions, 39 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e4a0b8bd941c..4cc56c91e06e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4280,14 +4280,16 @@ static inline unsigned long task_util_est(struct task_struct *p) } #ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long uclamp_task_util(struct task_struct *p, + unsigned long uclamp_min, + unsigned long uclamp_max) { - return clamp(task_util_est(p), - uclamp_eff_value(p, UCLAMP_MIN), - uclamp_eff_value(p, UCLAMP_MAX)); + return clamp(task_util_est(p), uclamp_min, uclamp_max); } #else -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long uclamp_task_util(struct task_struct *p, + unsigned long uclamp_min, + unsigned long uclamp_max) { return task_util_est(p); } @@ -4426,10 +4428,139 @@ done: trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, - unsigned long capacity) +static inline int util_fits_cpu(unsigned long util, + unsigned long uclamp_min, + unsigned long uclamp_max, + int cpu) { - return fits_capacity(uclamp_task_util(p), capacity); + unsigned long capacity_orig, capacity_orig_thermal; + unsigned long capacity = capacity_of(cpu); + bool fits, uclamp_max_fits; + + /* + * Check if the real util fits without any uclamp boost/cap applied. + */ + fits = fits_capacity(util, capacity); + + if (!uclamp_is_used()) + return fits; + + /* + * We must use capacity_orig_of() for comparing against uclamp_min and + * uclamp_max. We only care about capacity pressure (by using + * capacity_of()) for comparing against the real util. + * + * If a task is boosted to 1024 for example, we don't want a tiny + * pressure to skew the check whether it fits a CPU or not. + * + * Similarly if a task is capped to capacity_orig_of(little_cpu), it + * should fit a little cpu even if there's some pressure. + * + * Only exception is for thermal pressure since it has a direct impact + * on available OPP of the system. + * + * We honour it for uclamp_min only as a drop in performance level + * could result in not getting the requested minimum performance level. + * + * For uclamp_max, we can tolerate a drop in performance level as the + * goal is to cap the task. So it's okay if it's getting less. + * + * In case of capacity inversion we should honour the inverted capacity + * for both uclamp_min and uclamp_max all the time. + */ + capacity_orig = cpu_in_capacity_inversion(cpu); + if (capacity_orig) { + capacity_orig_thermal = capacity_orig; + } else { + capacity_orig = capacity_orig_of(cpu); + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); + } + + /* + * We want to force a task to fit a cpu as implied by uclamp_max. + * But we do have some corner cases to cater for.. + * + * + * C=z + * | ___ + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | + * | | | | | | | (util somewhere in this region) + * | | | | | | | + * | | | | | | | + * +---------------------------------------- + * cpu0 cpu1 cpu2 + * + * In the above example if a task is capped to a specific performance + * point, y, then when: + * + * * util = 80% of x then it does not fit on cpu0 and should migrate + * to cpu1 + * * util = 80% of y then it is forced to fit on cpu1 to honour + * uclamp_max request. + * + * which is what we're enforcing here. A task always fits if + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, + * the normal upmigration rules should withhold still. + * + * Only exception is when we are on max capacity, then we need to be + * careful not to block overutilized state. This is so because: + * + * 1. There's no concept of capping at max_capacity! We can't go + * beyond this performance level anyway. + * 2. The system is being saturated when we're operating near + * max capacity, it doesn't make sense to block overutilized. + */ + uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); + uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); + fits = fits || uclamp_max_fits; + + /* + * + * C=z + * | ___ (region a, capped, util >= uclamp_max) + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min + * | | | | | | | + * | | | | | | | (region c, boosted, util < uclamp_min) + * +---------------------------------------- + * cpu0 cpu1 cpu2 + * + * a) If util > uclamp_max, then we're capped, we don't care about + * actual fitness value here. We only care if uclamp_max fits + * capacity without taking margin/pressure into account. + * See comment above. + * + * b) If uclamp_min <= util <= uclamp_max, then the normal + * fits_capacity() rules apply. Except we need to ensure that we + * enforce we remain within uclamp_max, see comment above. + * + * c) If util < uclamp_min, then we are boosted. Same as (b) but we + * need to take into account the boosted value fits the CPU without + * taking margin/pressure into account. + * + * Cases (a) and (b) are handled in the 'fits' variable already. We + * just need to consider an extra check for case (c) after ensuring we + * handle the case uclamp_min > uclamp_max. + */ + uclamp_min = min(uclamp_min, uclamp_max); + if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) + fits = fits && (uclamp_min <= capacity_orig_thermal); + + return fits; +} + +static inline int task_fits_cpu(struct task_struct *p, int cpu) +{ + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); + unsigned long util = task_util_est(p); + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4442,7 +4573,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -5862,7 +5993,10 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)); + unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } static inline void update_overutilized_status(struct rq *rq) @@ -6654,21 +6788,23 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long task_util, best_cap = 0; + unsigned long task_util, util_min, util_max, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - task_util = uclamp_task_util(p); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (fits_capacity(task_util, cpu_cap)) + if (util_fits_cpu(task_util, util_min, util_max, cpu)) return cpu; if (cpu_cap > best_cap) { @@ -6680,10 +6816,13 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } -static inline bool asym_fits_capacity(unsigned long task_util, int cpu) +static inline bool asym_fits_cpu(unsigned long util, + unsigned long util_min, + unsigned long util_max, + int cpu) { if (sched_asym_cpucap_active()) - return fits_capacity(task_util, capacity_of(cpu)); + return util_fits_cpu(util, util_min, util_max, cpu); return true; } @@ -6695,7 +6834,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) { bool has_idle_core = false; struct sched_domain *sd; - unsigned long task_util; + unsigned long task_util, util_min, util_max; int i, recent_used_cpu; /* @@ -6704,7 +6843,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); - task_util = uclamp_task_util(p); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); } /* @@ -6713,7 +6854,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled(); if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + asym_fits_cpu(task_util, util_min, util_max, target)) return target; /* @@ -6721,7 +6862,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) + asym_fits_cpu(task_util, util_min, util_max, prev)) return prev; /* @@ -6736,7 +6877,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) in_task() && prev == smp_processor_id() && this_rq()->nr_running <= 1 && - asym_fits_capacity(task_util, prev)) { + asym_fits_cpu(task_util, util_min, util_max, prev)) { return prev; } @@ -6748,7 +6889,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && - asym_fits_capacity(task_util, recent_used_cpu)) { + asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { return recent_used_cpu; } @@ -7044,6 +7185,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; struct sched_domain *sd; @@ -7068,7 +7211,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = prev_cpu; sync_entity_load_avg(&p->se); - if (!task_util_est(p)) + if (!uclamp_task_util(p, p_util_min, p_util_max)) goto unlock; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -7076,7 +7219,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long cpu_cap, cpu_thermal_cap, util; unsigned long cur_delta, max_spare_cap = 0; - bool compute_prev_delta = false; + unsigned long rq_util_min, rq_util_max; + unsigned long util_min, util_max; + unsigned long prev_spare_cap = 0; int max_spare_cap_cpu = -1; unsigned long base_energy; @@ -7112,26 +7257,45 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * much capacity we can get out of the CPU; this is * aligned with sched_cpu_util(). */ - util = uclamp_rq_util_with(cpu_rq(cpu), util, p); - if (!fits_capacity(util, cpu_cap)) + if (uclamp_is_used()) { + if (uclamp_rq_is_idle(cpu_rq(cpu))) { + util_min = p_util_min; + util_max = p_util_max; + } else { + /* + * Open code uclamp_rq_util_with() except for + * the clamp() part. Ie: apply max aggregation + * only. util_fits_cpu() logic requires to + * operate on non clamped util but must use the + * max-aggregated uclamp_{min, max}. + */ + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); + } + } + if (!util_fits_cpu(util, util_min, util_max, cpu)) continue; lsub_positive(&cpu_cap, util); if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ - compute_prev_delta = true; + prev_spare_cap = cpu_cap; } else if (cpu_cap > max_spare_cap) { /* * Find the CPU with the maximum spare capacity - * in the performance domain. + * among the remaining CPUs in the performance + * domain. */ max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; } } - if (max_spare_cap_cpu < 0 && !compute_prev_delta) + if (max_spare_cap_cpu < 0 && prev_spare_cap == 0) continue; eenv_pd_busy_time(&eenv, cpus, p); @@ -7139,7 +7303,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) base_energy = compute_energy(&eenv, pd, cpus, p, -1); /* Evaluate the energy impact of using prev_cpu. */ - if (compute_prev_delta) { + if (prev_spare_cap > 0) { prev_delta = compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ @@ -7150,7 +7314,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using max_spare_cap_cpu. */ - if (max_spare_cap_cpu >= 0) { + if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { cur_delta = compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); /* CPU utilization has changed */ @@ -8276,7 +8440,7 @@ static int detach_tasks(struct lb_env *env) case migrate_misfit: /* This is not a misfit task */ - if (task_fits_capacity(p, capacity_of(env->src_cpu))) + if (task_fits_cpu(p, env->src_cpu)) goto next; env->imbalance = 0; @@ -8665,16 +8829,73 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { + unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; + struct rq *rq = cpu_rq(cpu); - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); + rq->cpu_capacity_orig = capacity_orig; if (!capacity) capacity = 1; - cpu_rq(cpu)->cpu_capacity = capacity; - trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + rq->cpu_capacity = capacity; + + /* + * Detect if the performance domain is in capacity inversion state. + * + * Capacity inversion happens when another perf domain with equal or + * lower capacity_orig_of() ends up having higher capacity than this + * domain after subtracting thermal pressure. + * + * We only take into account thermal pressure in this detection as it's + * the only metric that actually results in *real* reduction of + * capacity due to performance points (OPPs) being dropped/become + * unreachable due to thermal throttling. + * + * We assume: + * * That all cpus in a perf domain have the same capacity_orig + * (same uArch). + * * Thermal pressure will impact all cpus in this perf domain + * equally. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); + struct perf_domain *pd = rcu_dereference(rq->rd->pd); + + rq->cpu_capacity_inverted = 0; + + for (; pd; pd = pd->next) { + struct cpumask *pd_span = perf_domain_span(pd); + unsigned long pd_cap_orig, pd_cap; + + cpu = cpumask_any(pd_span); + pd_cap_orig = arch_scale_cpu_capacity(cpu); + + if (capacity_orig < pd_cap_orig) + continue; + + /* + * handle the case of multiple perf domains have the + * same capacity_orig but one of them is under higher + * thermal pressure. We record it as capacity + * inversion. + */ + if (capacity_orig == pd_cap_orig) { + pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); + + if (pd_cap > inv_cap) { + rq->cpu_capacity_inverted = inv_cap; + break; + } + } else if (pd_cap_orig > inv_cap) { + rq->cpu_capacity_inverted = inv_cap; + break; + } + } + } + + trace_sched_cpu_capacity_tp(rq); sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; @@ -9281,6 +9502,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, memset(sgs, 0, sizeof(*sgs)); + /* Assume that task can't fit any CPU of the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY) + sgs->group_misfit_task_load = 1; + for_each_cpu(i, sched_group_span(group)) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -9300,12 +9525,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (!nr_running && idle_cpu_without(i, p)) sgs->idle_cpus++; - } + /* Check if task fits in the CPU */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load && + task_fits_cpu(p, i)) + sgs->group_misfit_task_load = 0; - /* Check if task fits in the group */ - if (sd->flags & SD_ASYM_CPUCAPACITY && - !task_fits_capacity(p, group->sgc->max_capacity)) { - sgs->group_misfit_task_load = 1; } sgs->group_capacity = group->sgc->capacity; |