diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/context_tracking.c | 6 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 160 | ||||
-rw-r--r-- | kernel/sched/fair.c | 1195 | ||||
-rw-r--r-- | kernel/sched/features.h | 1 | ||||
-rw-r--r-- | kernel/sched/topology.c | 9 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 2 |
6 files changed, 833 insertions, 540 deletions
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index be01a4d627c9..0296b4bda8f1 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -25,8 +25,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/context_tracking.h> -DEFINE_STATIC_KEY_FALSE(context_tracking_enabled); -EXPORT_SYMBOL_GPL(context_tracking_enabled); +DEFINE_STATIC_KEY_FALSE(context_tracking_key); +EXPORT_SYMBOL_GPL(context_tracking_key); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); @@ -192,7 +192,7 @@ void __init context_tracking_cpu_set(int cpu) if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; - static_branch_inc(&context_tracking_enabled); + static_branch_inc(&context_tracking_key); } if (initialized) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 46ed4e1383e2..e0cd20693ef5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -405,27 +405,25 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ /* * Use precise platform statistics if available: */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + # ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_common_task_switch(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { if (is_idle_task(prev)) vtime_account_idle(prev); else - vtime_account_system(prev); + vtime_account_kernel(prev); vtime_flush(prev); arch_vtime_task_switch(prev); } # endif -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). Archs that + * vtime_account_kernel() and vtime_account_idle(). Archs that * have other meaning of the idle time (s390 only includes the * time spent by the CPU when it's in low power mode) must override * vtime_account(). @@ -436,7 +434,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) if (!in_interrupt() && is_idle_task(tsk)) vtime_account_idle(tsk); else - vtime_account_system(tsk); + vtime_account_kernel(tsk); } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ @@ -477,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick) u64 cputime, steal; struct rq *rq = this_rq(); - if (vtime_accounting_cpu_enabled()) + if (vtime_accounting_enabled_this_cpu()) return; if (sched_clock_irqtime) { @@ -711,8 +709,8 @@ static u64 get_vtime_delta(struct vtime *vtime) return delta - other; } -static void __vtime_account_system(struct task_struct *tsk, - struct vtime *vtime) +static void vtime_account_system(struct task_struct *tsk, + struct vtime *vtime) { vtime->stime += get_vtime_delta(vtime); if (vtime->stime >= TICK_NSEC) { @@ -731,7 +729,17 @@ static void vtime_account_guest(struct task_struct *tsk, } } -void vtime_account_system(struct task_struct *tsk) +static void __vtime_account_kernel(struct task_struct *tsk, + struct vtime *vtime) +{ + /* We might have scheduled out from guest path */ + if (vtime->state == VTIME_GUEST) + vtime_account_guest(tsk, vtime); + else + vtime_account_system(tsk, vtime); +} + +void vtime_account_kernel(struct task_struct *tsk) { struct vtime *vtime = &tsk->vtime; @@ -739,11 +747,7 @@ void vtime_account_system(struct task_struct *tsk) return; write_seqcount_begin(&vtime->seqcount); - /* We might have scheduled out from guest path */ - if (tsk->flags & PF_VCPU) - vtime_account_guest(tsk, vtime); - else - __vtime_account_system(tsk, vtime); + __vtime_account_kernel(tsk, vtime); write_seqcount_end(&vtime->seqcount); } @@ -752,7 +756,7 @@ void vtime_user_enter(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk, vtime); + vtime_account_system(tsk, vtime); vtime->state = VTIME_USER; write_seqcount_end(&vtime->seqcount); } @@ -782,8 +786,9 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. */ write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk, vtime); + vtime_account_system(tsk, vtime); tsk->flags |= PF_VCPU; + vtime->state = VTIME_GUEST; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -795,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); tsk->flags &= ~PF_VCPU; + vtime->state = VTIME_SYS; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -804,19 +810,30 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(get_vtime_delta(&tsk->vtime)); } -void arch_vtime_task_switch(struct task_struct *prev) +void vtime_task_switch_generic(struct task_struct *prev) { struct vtime *vtime = &prev->vtime; write_seqcount_begin(&vtime->seqcount); + if (vtime->state == VTIME_IDLE) + vtime_account_idle(prev); + else + __vtime_account_kernel(prev, vtime); vtime->state = VTIME_INACTIVE; + vtime->cpu = -1; write_seqcount_end(&vtime->seqcount); vtime = ¤t->vtime; write_seqcount_begin(&vtime->seqcount); - vtime->state = VTIME_SYS; + if (is_idle_task(current)) + vtime->state = VTIME_IDLE; + else if (current->flags & PF_VCPU) + vtime->state = VTIME_GUEST; + else + vtime->state = VTIME_SYS; vtime->starttime = sched_clock(); + vtime->cpu = smp_processor_id(); write_seqcount_end(&vtime->seqcount); } @@ -827,8 +844,9 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); - vtime->state = VTIME_SYS; + vtime->state = VTIME_IDLE; vtime->starttime = sched_clock(); + vtime->cpu = cpu; write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } @@ -846,7 +864,7 @@ u64 task_gtime(struct task_struct *t) seq = read_seqcount_begin(&vtime->seqcount); gtime = t->gtime; - if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) + if (vtime->state == VTIME_GUEST) gtime += vtime->gtime + vtime_delta(vtime); } while (read_seqcount_retry(&vtime->seqcount, seq)); @@ -877,20 +895,102 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) *utime = t->utime; *stime = t->stime; - /* Task is sleeping, nothing to add */ - if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) + /* Task is sleeping or idle, nothing to add */ + if (vtime->state < VTIME_SYS) continue; delta = vtime_delta(vtime); /* - * Task runs either in user or kernel space, add pending nohz time to - * the right place. + * Task runs either in user (including guest) or kernel space, + * add pending nohz time to the right place. */ - if (vtime->state == VTIME_USER || t->flags & PF_VCPU) - *utime += vtime->utime + delta; - else if (vtime->state == VTIME_SYS) + if (vtime->state == VTIME_SYS) *stime += vtime->stime + delta; + else + *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); } + +static int kcpustat_field_vtime(u64 *cpustat, + struct vtime *vtime, + enum cpu_usage_stat usage, + int cpu, u64 *val) +{ + unsigned int seq; + int err; + + do { + seq = read_seqcount_begin(&vtime->seqcount); + + /* + * We raced against context switch, fetch the + * kcpustat task again. + */ + if (vtime->cpu != cpu && vtime->cpu != -1) + return -EAGAIN; + + /* + * Two possible things here: + * 1) We are seeing the scheduling out task (prev) or any past one. + * 2) We are seeing the scheduling in task (next) but it hasn't + * passed though vtime_task_switch() yet so the pending + * cputime of the prev task may not be flushed yet. + * + * Case 1) is ok but 2) is not. So wait for a safe VTIME state. + */ + if (vtime->state == VTIME_INACTIVE) + return -EAGAIN; + + err = 0; + + *val = cpustat[usage]; + + if (vtime->state == VTIME_SYS) + *val += vtime->stime + vtime_delta(vtime); + + } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return 0; +} + +u64 kcpustat_field(struct kernel_cpustat *kcpustat, + enum cpu_usage_stat usage, int cpu) +{ + u64 *cpustat = kcpustat->cpustat; + struct rq *rq; + u64 val; + int err; + + if (!vtime_accounting_enabled_cpu(cpu)) + return cpustat[usage]; + + /* Only support sys vtime for now */ + if (usage != CPUTIME_SYSTEM) + return cpustat[usage]; + + rq = cpu_rq(cpu); + + for (;;) { + struct task_struct *curr; + struct vtime *vtime; + + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (WARN_ON_ONCE(!curr)) { + rcu_read_unlock(); + return cpustat[usage]; + } + + vtime = &curr->vtime; + err = kcpustat_field_vtime(cpustat, vtime, usage, cpu, &val); + rcu_read_unlock(); + + if (!err) + return val; + + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(kcpustat_field); #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 22a2fed29054..6e622f48c8de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1474,7 +1474,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long cpu_runnable_load(struct rq *rq); +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); + +static unsigned long cpu_runnable_load(struct rq *rq) +{ + return cfs_rq_runnable_load_avg(&rq->cfs); +} /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3764,10 +3769,21 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) return; /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + if (sched_feat(UTIL_EST_FASTUP)) { + if (ue.ewma < ue.enqueued) { + ue.ewma = ue.enqueued; + goto done; + } + } + + /* * Skip update of task's estimated utilization when its EWMA is * already ~1% close to its last activation value. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); last_ewma_diff = ue.enqueued - ue.ewma; if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) return; @@ -3800,6 +3816,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: WRITE_ONCE(p->se.avg.util_est, ue); } @@ -5370,9 +5387,9 @@ static int sched_idle_cpu(int cpu) rq->nr_running); } -static unsigned long cpu_runnable_load(struct rq *rq) +static unsigned long cpu_load(struct rq *rq) { - return cfs_rq_runnable_load_avg(&rq->cfs); + return cfs_rq_load_avg(&rq->cfs); } static unsigned long capacity_of(int cpu) @@ -5380,18 +5397,6 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = cpu_runnable_load(rq); - - if (nr_running) - return load_avg / nr_running; - - return 0; -} - static void record_wakee(struct task_struct *p) { /* @@ -5482,7 +5487,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); + this_eff_load = cpu_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5500,7 +5505,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); + prev_eff_load = cpu_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5538,149 +5543,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static unsigned long cpu_util_without(int cpu, struct task_struct *p); - -static unsigned long capacity_spare_without(int cpu, struct task_struct *p) -{ - return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - * - * Assumes p is allowed on at least one CPU in sd. - */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long this_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; - unsigned long most_spare = 0, this_spare = 0; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - do { - unsigned long load, avg_load, runnable_load; - unsigned long spare_cap, max_spare_cap; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_span(group), - p->cpus_ptr)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_span(group)); - - /* - * Tally up the load of all CPUs in the group and find - * the group containing the CPU with most spare capacity. - */ - avg_load = 0; - runnable_load = 0; - max_spare_cap = 0; - - for_each_cpu(i, sched_group_span(group)) { - load = cpu_runnable_load(cpu_rq(i)); - runnable_load += load; - - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - - spare_cap = capacity_spare_without(i, p); - - if (spare_cap > max_spare_cap) - max_spare_cap = spare_cap; - } - - /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (local_group) { - this_runnable_load = runnable_load; - this_avg_load = avg_load; - this_spare = max_spare_cap; - } else { - if (min_runnable_load > (runnable_load + imbalance)) { - /* - * The runnable load is significantly smaller - * so we can pick this new CPU: - */ - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - /* - * The runnable loads are close so take the - * blocked load into account through avg_load: - */ - min_avg_load = avg_load; - idlest = group; - } - - if (most_spare < max_spare_cap) { - most_spare = max_spare_cap; - most_spare_sg = group; - } - } - } while (group = group->next, group != sd->groups); - - /* - * The cross-over point between using spare capacity or least load - * is too conservative for high utilization tasks on partially - * utilized systems if we require spare_capacity > task_util(p), - * so we allow for some task stuffing by using - * spare_capacity > task_util(p)/2. - * - * Spare capacity can't be used for fork because the utilization has - * not been set yet, we must first select a rq to compute the initial - * utilization. - */ - if (sd_flag & SD_BALANCE_FORK) - goto skip_spare; - - if (this_spare > task_util(p) / 2 && - imbalance_scale*this_spare > 100*most_spare) - return NULL; - - if (most_spare > task_util(p) / 2) - return most_spare_sg; - -skip_spare: - if (!idlest) - return NULL; - - /* - * When comparing groups across NUMA domains, it's possible for the - * local domain to be very lightly loaded relative to the remote - * domains but "imbalance" skews the comparison making remote CPUs - * look much more favourable. When considering cross-domain, add - * imbalance to the runnable load on the remote node and consider - * staying local. - */ - if ((sd->flags & SD_NUMA) && - min_runnable_load + imbalance >= this_runnable_load) - return NULL; - - if (min_runnable_load > (this_runnable_load + imbalance)) - return NULL; - - if ((this_runnable_load < (min_runnable_load + imbalance)) && - (100*this_avg_load < imbalance_scale*min_avg_load)) - return NULL; - - return idlest; -} + int this_cpu, int sd_flag); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5729,7 +5594,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this continue; } - load = cpu_runnable_load(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; @@ -5753,7 +5618,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return prev_cpu; /* - * We need task's util for capacity_spare_without, sync it up to + * We need task's util for cpu_util_without, sync it up to * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) @@ -7079,11 +6944,26 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +/* + * group_type describes the group of CPUs at the moment of the load balance. + * The enum is ordered by pulling priority, with the group with lowest priority + * first so the groupe_type can be simply compared when selecting the busiest + * group. see update_sd_pick_busiest(). + */ enum group_type { - group_other = 0, + group_has_spare = 0, + group_fully_busy, group_misfit_task, + group_asym_packing, group_imbalanced, - group_overloaded, + group_overloaded +}; + +enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, + migrate_misfit }; #define LBF_ALL_PINNED 0x01 @@ -7116,7 +6996,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; - enum group_type src_grp_type; + enum migration_type migration_type; struct list_head tasks; }; @@ -7339,7 +7219,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance runnable load from + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. @@ -7347,8 +7227,8 @@ static const unsigned int sched_nr_migrate_break = 32; static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; + unsigned long util, load; struct task_struct *p; - unsigned long load; int detached = 0; lockdep_assert_held(&env->src_rq->lock); @@ -7381,19 +7261,46 @@ static int detach_tasks(struct lb_env *env) if (!can_migrate_task(p, env)) goto next; - load = task_h_load(p); + switch (env->migration_type) { + case migrate_load: + load = task_h_load(p); - if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) - goto next; + if (sched_feat(LB_MIN) && + load < 16 && !env->sd->nr_balance_failed) + goto next; - if ((load / 2) > env->imbalance) - goto next; + if (load/2 > env->imbalance) + goto next; + + env->imbalance -= load; + break; + + case migrate_util: + util = task_util_est(p); + + if (util > env->imbalance) + goto next; + + env->imbalance -= util; + break; + + case migrate_task: + env->imbalance--; + break; + + case migrate_misfit: + /* This is not a misfit task */ + if (task_fits_capacity(p, capacity_of(env->src_cpu))) + goto next; + + env->imbalance = 0; + break; + } detach_task(p, env); list_add(&p->se.group_node, &env->tasks); detached++; - env->imbalance -= load; #ifdef CONFIG_PREEMPTION /* @@ -7407,7 +7314,7 @@ static int detach_tasks(struct lb_env *env) /* * We only want to steal up to the prescribed amount of - * runnable load. + * load/util/tasks. */ if (env->imbalance <= 0) break; @@ -7666,14 +7573,14 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ - unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_no_capacity; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -7688,10 +7595,10 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ - unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* tasks should go to sibling first */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */ @@ -7702,19 +7609,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /* * Skimp on the clearing to avoid duplicate work. We can avoid clearing * local_stat because update_sg_lb_stats() does a full clear/assignment. - * We must however clear busiest_stat::avg_load because - * update_sd_pick_busiest() reads this before assignment. + * We must however set busiest_stat::group_type and + * busiest_stat::idle_cpus to the worst busiest group because + * update_sd_pick_busiest() reads these before assignment. */ *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, - .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { - .avg_load = 0UL, - .sum_nr_running = 0, - .group_type = group_other, + .idle_cpus = UINT_MAX, + .group_type = group_has_spare, }, }; } @@ -7902,13 +7808,13 @@ static inline int sg_imbalanced(struct sched_group *group) * any benefit for the load balance. */ static inline bool -group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) +group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -7923,13 +7829,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) * false. */ static inline bool -group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -7956,19 +7862,26 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) } static inline enum -group_type group_classify(struct sched_group *group, +group_type group_classify(unsigned int imbalance_pct, + struct sched_group *group, struct sg_lb_stats *sgs) { - if (sgs->group_no_capacity) + if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_asym_packing) + return group_asym_packing; + if (sgs->group_misfit_task_load) return group_misfit_task; - return group_other; + if (!group_has_capacity(imbalance_pct, sgs)) + return group_fully_busy; + + return group_has_spare; } static bool update_nohz_stats(struct rq *rq, bool force) @@ -8005,21 +7918,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sg_lb_stats *sgs, int *sg_status) { - int i, nr_running; + int i, nr_running, local_group; memset(sgs, 0, sizeof(*sgs)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - sgs->group_load += cpu_runnable_load(rq); + sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); - sgs->sum_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + if (nr_running > 1) *sg_status |= SG_OVERLOAD; @@ -8033,9 +7950,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* * No need to call idle_cpu() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) + if (!nr_running && idle_cpu(i)) { sgs->idle_cpus++; + /* Idle cpu can't have misfit task */ + continue; + } + if (local_group) + continue; + + /* Check for a misfit task on the cpu */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; @@ -8043,17 +7967,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Check if dst CPU is idle and preferred to this group */ + if (env->sd->flags & SD_ASYM_PACKING && + env->idle != CPU_NOT_IDLE && + sgs->sum_h_nr_running && + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + sgs->group_asym_packing = 1; + } - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; + sgs->group_capacity = group->sgc->capacity; sgs->group_weight = group->group_weight; - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; } /** @@ -8076,6 +8007,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* Make sure that there is at least one task to pull */ + if (!sgs->sum_h_nr_running) + return false; + /* * Don't try to pull misfit tasks we can't help. * We can use max_capacity here as reduction in capacity on some @@ -8084,7 +8019,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if (sgs->group_type == group_misfit_task && (!group_smaller_max_cpu_capacity(sg, sds->local) || - !group_has_capacity(env, &sds->local_stat))) + sds->local_stat.group_type != group_has_spare)) return false; if (sgs->group_type > busiest->group_type) @@ -8093,62 +8028,88 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; - if (sgs->avg_load <= busiest->avg_load) - return false; - - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. + * The candidate and the current busiest group are the same type of + * group. Let check which one is the busiest according to the type. */ - if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_min_cpu_capacity(sds->local, sg)) - return false; - /* - * If we have more than one misfit sg go with the biggest misfit. - */ - if (sgs->group_type == group_misfit_task && - sgs->group_misfit_task_load < busiest->group_misfit_task_load) + switch (sgs->group_type) { + case group_overloaded: + /* Select the overloaded group with highest avg_load. */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_imbalanced: + /* + * Select the 1st imbalanced group as we don't have any way to + * choose one more than another. + */ return false; -asym_packing: - /* This is the busiest node in its class. */ - if (!(env->sd->flags & SD_ASYM_PACKING)) - return true; + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; + break; - /* No ASYM_PACKING if target CPU is already busy */ - if (env->idle == CPU_NOT_IDLE) - return true; - /* - * ASYM_PACKING needs to move all the work to the highest - * prority CPUs in the group, therefore mark all groups - * of lower priority than ourself as busy. - */ - if (sgs->sum_nr_running && - sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { - if (!sds->busiest) - return true; + case group_misfit_task: + /* + * If we have more than one misfit sg go with the biggest + * misfit. + */ + if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + break; - /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, - sg->asym_prefer_cpu)) - return true; + case group_fully_busy: + /* + * Select the fully busy group with highest avg_load. In + * theory, there is no need to pull task from such kind of + * group because tasks have all compute capacity that they need + * but we can still improve the overall throughput by reducing + * contention when accessing shared HW resources. + * + * XXX for now avg_load is not computed and always 0 so we + * select the 1st one. + */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_has_spare: + /* + * Select not overloaded group with lowest number of + * idle cpus. We could also compare the spare capacity + * which is more stable but it can end up that the + * group has less spare capacity but finally more idle + * CPUs which means less opportunity to pull tasks. + */ + if (sgs->idle_cpus >= busiest->idle_cpus) + return false; + break; } - return false; + /* + * Candidate sg has no more than one task per CPU and has higher + * per-CPU capacity. Migrating tasks to less capable CPUs may harm + * throughput. Maximize throughput, power/energy consequences are not + * considered. + */ + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type <= group_fully_busy) && + (group_smaller_min_cpu_capacity(sds->local, sg))) + return false; + + return true; } #ifdef CONFIG_NUMA_BALANCING static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->nr_numa_running) + if (sgs->sum_h_nr_running > sgs->nr_numa_running) return regular; - if (sgs->sum_nr_running > sgs->nr_preferred_running) + if (sgs->sum_h_nr_running > sgs->nr_preferred_running) return remote; return all; } @@ -8173,18 +8134,264 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ + +struct sg_lb_stats; + +/* + * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. + * @denv: The ched_domain level to look for idlest group. + * @group: sched_group whose statistics are to be updated. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_wakeup_stats(struct sched_domain *sd, + struct sched_group *group, + struct sg_lb_stats *sgs, + struct task_struct *p) +{ + int i, nr_running; + + memset(sgs, 0, sizeof(*sgs)); + + for_each_cpu(i, sched_group_span(group)) { + struct rq *rq = cpu_rq(i); + + sgs->group_load += cpu_load(rq); + sgs->group_util += cpu_util_without(i, p); + sgs->sum_h_nr_running += rq->cfs.h_nr_running; + + nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) + sgs->idle_cpus++; + + + } + + /* Check if task fits in the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + !task_fits_capacity(p, group->sgc->max_capacity)) { + sgs->group_misfit_task_load = 1; + } + + sgs->group_capacity = group->sgc->capacity; + + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); + + /* + * Computing avg_load makes sense only when group is fully busy or + * overloaded + */ + if (sgs->group_type < group_fully_busy) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; +} + +static bool update_pick_idlest(struct sched_group *idlest, + struct sg_lb_stats *idlest_sgs, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_type < idlest_sgs->group_type) + return true; + + if (sgs->group_type > idlest_sgs->group_type) + return false; + + /* + * The candidate and the current idlest group are the same type of + * group. Let check which one is the idlest according to the type. + */ + + switch (sgs->group_type) { + case group_overloaded: + case group_fully_busy: + /* Select the group with lowest avg_load. */ + if (idlest_sgs->avg_load <= sgs->avg_load) + return false; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those types are not used in the slow wakeup path */ + return false; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (idlest->sgc->max_capacity >= group->sgc->max_capacity) + return false; + break; + + case group_has_spare: + /* Select group with most idle CPUs */ + if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + return false; + break; + } + + return true; +} + +/* + * find_idlest_group() finds and returns the least busy CPU group within the + * domain. + * + * Assumes p is allowed on at least one CPU in sd. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int sd_flag) +{ + struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; + struct sg_lb_stats local_sgs, tmp_sgs; + struct sg_lb_stats *sgs; + unsigned long imbalance; + struct sg_lb_stats idlest_sgs = { + .avg_load = UINT_MAX, + .group_type = group_overloaded, + }; + + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + int local_group; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_span(group), + p->cpus_ptr)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_span(group)); + + if (local_group) { + sgs = &local_sgs; + local = group; + } else { + sgs = &tmp_sgs; + } + + update_sg_wakeup_stats(sd, group, sgs, p); + + if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) { + idlest = group; + idlest_sgs = *sgs; + } + + } while (group = group->next, group != sd->groups); + + + /* There is no idlest group to push tasks to */ + if (!idlest) + return NULL; + + /* + * If the local group is idler than the selected idlest group + * don't try and push the task. + */ + if (local_sgs.group_type < idlest_sgs.group_type) + return NULL; + + /* + * If the local group is busier than the selected idlest group + * try and push the task. + */ + if (local_sgs.group_type > idlest_sgs.group_type) + return idlest; + + switch (local_sgs.group_type) { + case group_overloaded: + case group_fully_busy: + /* + * When comparing groups across NUMA domains, it's possible for + * the local domain to be very lightly loaded relative to the + * remote domains but "imbalance" skews the comparison making + * remote CPUs look much more favourable. When considering + * cross-domain, add imbalance to the load on the remote node + * and consider staying local. + */ + + if ((sd->flags & SD_NUMA) && + ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load)) + return NULL; + + /* + * If the local group is less loaded than the selected + * idlest group don't try and push any tasks. + */ + if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance)) + return NULL; + + if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load) + return NULL; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those type are not used in the slow wakeup path */ + return NULL; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (local->sgc->max_capacity >= idlest->sgc->max_capacity) + return NULL; + break; + + case group_has_spare: + if (sd->flags & SD_NUMA) { +#ifdef CONFIG_NUMA_BALANCING + int idlest_cpu; + /* + * If there is spare capacity at NUMA, try to select + * the preferred node + */ + if (cpu_to_node(this_cpu) == p->numa_preferred_nid) + return NULL; + + idlest_cpu = cpumask_first(sched_group_span(idlest)); + if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) + return idlest; +#endif + /* + * Otherwise, keep the task on this node to stay close + * its wakeup source and improve locality. If there is + * a real need of migration, periodic load balance will + * take care of it. + */ + if (local_sgs.idle_cpus) + return NULL; + } + + /* + * Select group with highest number of idle CPUs. We could also + * compare the utilization which is more stable but it can end + * up that the group has less spare capacity but finally more + * idle CPUs which means more opportunity to run task. + */ + if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus) + return NULL; + break; + } + + return idlest; +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain. */ + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON @@ -8211,22 +8418,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (local_group) goto next_group; - /* - * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity so that we'll try - * and move all the excess tasks away. We lower the capacity - * of a group only if the local group has the capacity to fit - * these excess tasks. The extra check prevents the case where - * you always pull from the heaviest group when it is already - * under-utilized (possible with a large weight task outweighs - * the tasks on the system). - */ - if (prefer_sibling && sds->local && - group_has_capacity(env, local) && - (sgs->sum_nr_running > local->sum_nr_running + 1)) { - sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); - } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -8235,13 +8426,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ - sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); + /* Tag domain that child domain prefers tasks go to siblings first */ + sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + #ifdef CONFIG_NO_HZ_COMMON if ((env->flags & LBF_NOHZ_AGAIN) && cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { @@ -8272,203 +8465,156 @@ next_group: } /** - * check_asym_packing - Check to see if the group is packed into the - * sched domain. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. Hence when we - * have idle threads, we want them to be the higher ones. - * - * This packing function is run on idle threads. It checks to see if - * the busiest CPU in this domain (core in the P7 case) has a higher - * CPU number than the packing function is being run on. Here we are - * assuming lower CPU number will be equivalent to lower a SMT thread - * number. - * - * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in env->imbalance. - * - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain which is to be packed - */ -static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) -{ - int busiest_cpu; - - if (!(env->sd->flags & SD_ASYM_PACKING)) - return 0; - - if (env->idle == CPU_NOT_IDLE) - return 0; - - if (!sds->busiest) - return 0; - - busiest_cpu = sds->busiest->asym_prefer_cpu; - if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) - return 0; - - env->imbalance = sds->busiest_stat.group_load; - - return 1; -} - -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @env: load balance environment + * @sds: statistics of the sched_domain whose imbalance is to be calculated. */ -static inline -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) +static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, capa_now = 0, capa_move = 0; - unsigned int imbn = 2; - unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; local = &sds->local_stat; busiest = &sds->busiest_stat; - if (!local->sum_nr_running) - local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); - else if (busiest->load_per_task > local->load_per_task) - imbn = 1; + if (busiest->group_type == group_misfit_task) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = 1; + return; + } - scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - busiest->group_capacity; + if (busiest->group_type == group_asym_packing) { + /* + * In case of asym capacity, we will try to migrate all load to + * the preferred CPU. + */ + env->migration_type = migrate_task; + env->imbalance = busiest->sum_h_nr_running; + return; + } - if (busiest->avg_load + scaled_busy_load_per_task >= - local->avg_load + (scaled_busy_load_per_task * imbn)) { - env->imbalance = busiest->load_per_task; + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure CPU-load equilibrium, try to move any task to fix + * the imbalance. The next load balance will take care of + * balancing back the system. + */ + env->migration_type = migrate_task; + env->imbalance = 1; return; } /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU capacity used by - * moving them. + * Try to use spare capacity of local group without overloading it or + * emptying busiest */ + if (local->group_type == group_has_spare) { + if (busiest->group_type > group_fully_busy) { + /* + * If busiest is overloaded, try to fill spare + * capacity. This might end up creating spare capacity + * in busiest or busiest still being overloaded but + * there is no simple way to directly compute the + * amount of load to migrate in order to balance the + * system. + */ + env->migration_type = migrate_util; + env->imbalance = max(local->group_capacity, local->group_util) - + local->group_util; - capa_now += busiest->group_capacity * - min(busiest->load_per_task, busiest->avg_load); - capa_now += local->group_capacity * - min(local->load_per_task, local->avg_load); - capa_now /= SCHED_CAPACITY_SCALE; - - /* Amount of load we'd subtract */ - if (busiest->avg_load > scaled_busy_load_per_task) { - capa_move += busiest->group_capacity * - min(busiest->load_per_task, - busiest->avg_load - scaled_busy_load_per_task); - } - - /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_CAPACITY_SCALE) { - tmp = (busiest->avg_load * busiest->group_capacity) / - local->group_capacity; - } else { - tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - local->group_capacity; - } - capa_move += local->group_capacity * - min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_CAPACITY_SCALE; - - /* Move if we gain throughput */ - if (capa_move > capa_now) - env->imbalance = busiest->load_per_task; -} + /* + * In some cases, the group's utilization is max or even + * higher than capacity because of migrations but the + * local CPU is (newly) idle. There is at least one + * waiting task in this overloaded busiest group. Let's + * try to pull it. + */ + if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + env->migration_type = migrate_task; + env->imbalance = 1; + } -/** - * calculate_imbalance - Calculate the amount of imbalance present within the - * groups of a given sched_domain during load balance. - * @env: load balance environment - * @sds: statistics of the sched_domain whose imbalance is to be calculated. - */ -static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) -{ - unsigned long max_pull, load_above_capacity = ~0UL; - struct sg_lb_stats *local, *busiest; + return; + } - local = &sds->local_stat; - busiest = &sds->busiest_stat; + if (busiest->group_weight == 1 || sds->prefer_sibling) { + unsigned int nr_diff = busiest->sum_nr_running; + /* + * When prefer sibling, evenly spread running tasks on + * groups. + */ + env->migration_type = migrate_task; + lsub_positive(&nr_diff, local->sum_nr_running); + env->imbalance = nr_diff >> 1; + return; + } - if (busiest->group_type == group_imbalanced) { /* - * In the group_imb case we cannot rely on group-wide averages - * to ensure CPU-load equilibrium, look at wider averages. XXX + * If there is no overload, we just want to even the number of + * idle cpus. */ - busiest->load_per_task = - min(busiest->load_per_task, sds->avg_load); + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - + busiest->idle_cpus) >> 1); + return; } /* - * Avg load of busiest sg can be less and avg load of local sg can - * be greater than avg load across all sgs of sd because avg load - * factors in sg capacity and sgs with smaller group_type are - * skipped when updating the busiest sg: + * Local is fully busy but has to take more load to relieve the + * busiest group */ - if (busiest->group_type != group_misfit_task && - (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load)) { - env->imbalance = 0; - return fix_small_imbalance(env, sds); - } + if (local->group_type < group_overloaded) { + /* + * Local will become overloaded so the avg_load metrics are + * finally needed. + */ - /* - * If there aren't any idle CPUs, avoid creating some. - */ - if (busiest->group_type == group_overloaded && - local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; - if (load_above_capacity > busiest->group_capacity) { - load_above_capacity -= busiest->group_capacity; - load_above_capacity *= scale_load_down(NICE_0_LOAD); - load_above_capacity /= busiest->group_capacity; - } else - load_above_capacity = ~0UL; + local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / + local->group_capacity; + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* - * We're trying to get all the CPUs to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded CPU below the average load. At the same time, - * we also don't want to reduce the group load below the group - * capacity. Thus we look for the minimum possible imbalance. + * Both group are or will become overloaded and we're trying to get all + * the CPUs to the average_load, so we don't want to push ourselves + * above the average load, nor do we wish to reduce the max loaded CPU + * below the average load. At the same time, we also don't want to + * reduce the group load below the group capacity. Thus we look for + * the minimum possible imbalance. */ - max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); - - /* How much load to actually move to equalise the imbalance */ + env->migration_type = migrate_load; env->imbalance = min( - max_pull * busiest->group_capacity, + (busiest->avg_load - sds->avg_load) * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; - - /* Boost imbalance to allow misfit task to be balanced. */ - if (busiest->group_type == group_misfit_task) { - env->imbalance = max_t(long, env->imbalance, - busiest->group_misfit_task_load); - } - - /* - * if *imbalance is less than the average load per runnable task - * there is no guarantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (env->imbalance < busiest->load_per_task) - return fix_small_imbalance(env, sds); } /******* find_busiest_group() helpers end here *********************/ +/* + * Decision matrix according to the local and busiest group type: + * + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded + * has_spare nr_idle balanced N/A N/A balanced balanced + * fully_busy nr_idle nr_idle N/A N/A balanced balanced + * misfit_task force N/A N/A N/A force force + * asym_packing force force N/A N/A force force + * imbalanced force force N/A N/A force force + * overloaded force force N/A N/A force avg_load + * + * N/A : Not Applicable because already filtered while updating + * statistics. + * balanced : The system is balanced for these 2 groups. + * force : Calculate the imbalance as load migration is probably needed. + * avg_load : Only if imbalance is significant enough. + * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite + * different in groups. + */ + /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. @@ -8488,7 +8634,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) init_sd_lb_stats(&sds); /* - * Compute the various statistics relavent for load balancing at + * Compute the various statistics relevant for load balancing at * this level. */ update_sd_lb_stats(env, &sds); @@ -8503,17 +8649,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; - /* ASYM feature bypasses nice load balance check */ - if (check_asym_packing(env, &sds)) - return sds.busiest; - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || busiest->sum_nr_running == 0) + if (!sds.busiest) goto out_balanced; - /* XXX broken for overlapping NUMA groups */ - sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) - / sds.total_capacity; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + + /* ASYM feature bypasses nice load balance check */ + if (busiest->group_type == group_asym_packing) + goto force_balance; /* * If the busiest group is imbalanced the below checks don't @@ -8524,55 +8670,80 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* - * When dst_cpu is idle, prevent SMP nice and/or asymmetric group - * capacities from resulting in underutilization due to avg_load. - */ - if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && - busiest->group_no_capacity) - goto force_balance; - - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) - goto force_balance; - - /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ - if (local->avg_load >= busiest->avg_load) + if (local->group_type > busiest->group_type) goto out_balanced; /* - * Don't pull any tasks if this group is already above the domain - * average load. + * When groups are overloaded, use the avg_load to ensure fairness + * between tasks. */ - if (local->avg_load >= sds.avg_load) - goto out_balanced; + if (local->group_type == group_overloaded) { + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) + goto out_balanced; + + /* XXX broken for overlapping NUMA groups */ + sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / + sds.total_capacity; - if (env->idle == CPU_IDLE) { /* - * This CPU is idle. If the busiest group is not overloaded - * and there is no imbalance between this and busiest group - * wrt idle CPUs, it is balanced. The imbalance becomes - * significant if the diff is greater than 1 otherwise we - * might end up to just move the imbalance on another group + * Don't pull any tasks if this group is already above the + * domain average load. */ - if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + if (local->avg_load >= sds.avg_load) goto out_balanced; - } else { + /* - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use - * imbalance_pct to be conservative. + * If the busiest group is more loaded, use imbalance_pct to be + * conservative. */ if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; } + /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_nr_running > local->sum_nr_running + 1) + goto force_balance; + + if (busiest->group_type != group_overloaded) { + if (env->idle == CPU_NOT_IDLE) + /* + * If the busiest group is not overloaded (and as a + * result the local one too) but this CPU is already + * busy, let another idle CPU try to pull task. + */ + goto out_balanced; + + if (busiest->group_weight > 1 && + local->idle_cpus <= (busiest->idle_cpus + 1)) + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest + * group wrt idle CPUs, it is balanced. The imbalance + * becomes significant if the diff is greater than 1 + * otherwise we might end up to just move the imbalance + * on another group. Of course this applies only if + * there is more than 1 CPU per group. + */ + goto out_balanced; + + if (busiest->sum_h_nr_running == 1) + /* + * busiest doesn't have any tasks waiting to run + */ + goto out_balanced; + } + force_balance: /* Looks like there is an imbalance. Compute it */ - env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8588,11 +8759,13 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_capacity = 1; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, load; + unsigned long capacity, load, util; + unsigned int nr_running; enum fbq_type rt; rq = cpu_rq(i); @@ -8620,20 +8793,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - /* - * For ASYM_CPUCAPACITY domains with misfit tasks we simply - * seek the "biggest" misfit task. - */ - if (env->src_grp_type == group_misfit_task) { - if (rq->misfit_task_load > busiest_load) { - busiest_load = rq->misfit_task_load; - busiest = rq; - } - - continue; - } - capacity = capacity_of(i); + nr_running = rq->cfs.h_nr_running; /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -8643,35 +8804,69 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && - rq->nr_running == 1) + nr_running == 1) continue; - load = cpu_runnable_load(rq); + switch (env->migration_type) { + case migrate_load: + /* + * When comparing with load imbalance, use cpu_load() + * which is not scaled with the CPU capacity. + */ + load = cpu_load(rq); - /* - * When comparing with imbalance, use cpu_runnable_load() - * which is not scaled with the CPU capacity. - */ + if (nr_running == 1 && load > env->imbalance && + !check_cpu_capacity(rq, env->sd)) + break; - if (rq->nr_running == 1 && load > env->imbalance && - !check_cpu_capacity(rq, env->sd)) - continue; + /* + * For the load comparisons with the other CPUs, + * consider the cpu_load() scaled with the CPU + * capacity, so that the load can be moved away + * from the CPU that is potentially running at a + * lower capacity. + * + * Thus we're looking for max(load_i / capacity_i), + * crosswise multiplication to rid ourselves of the + * division works out to: + * load_i * capacity_j > load_j * capacity_i; + * where j is our previous maximum. + */ + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; + busiest_capacity = capacity; + busiest = rq; + } + break; + + case migrate_util: + util = cpu_util(cpu_of(rq)); + + if (busiest_util < util) { + busiest_util = util; + busiest = rq; + } + break; + + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; + busiest = rq; + } + break; + + case migrate_misfit: + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we + * simply seek the "biggest" misfit task. + */ + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + break; - /* - * For the load comparisons with the other CPU's, consider - * the cpu_runnable_load() scaled with the CPU capacity, so - * that the load can be moved away from the CPU that is - * potentially running at a lower capacity. - * - * Thus we're looking for max(load_i / capacity_i), crosswise - * multiplication to rid ourselves of the division works out - * to: load_i * capacity_j > load_j * capacity_i; where j is - * our previous maximum. - */ - if (load * busiest_capacity > busiest_load * capacity) { - busiest_load = load; - busiest_capacity = capacity; - busiest = rq; } } @@ -8717,7 +8912,7 @@ voluntary_active_balance(struct lb_env *env) return 1; } - if (env->src_grp_type == group_misfit_task) + if (env->migration_type == migrate_misfit) return 1; return 0; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 2410db5e9a35..7481cd96f391 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -89,3 +89,4 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. */ SCHED_FEAT(UTIL_EST, true) +SCHED_FEAT(UTIL_EST_FASTUP, true) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 49b835f1305f..6ec1e595b1d4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1201,16 +1201,13 @@ static void set_domain_attribute(struct sched_domain *sd, if (!attr || attr->relax_domain_level < 0) { if (default_relax_domain_level < 0) return; - else - request = default_relax_domain_level; + request = default_relax_domain_level; } else request = attr->relax_domain_level; - if (request < sd->level) { + + if (sd->level > request) { /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* Turn on idle balance on this domain: */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 955851748dc3..c2748232f607 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1119,7 +1119,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; - if (vtime_accounting_cpu_enabled()) + if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. Update process times would miss the |