diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 115 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 6 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 40 | ||||
-rw-r--r-- | kernel/sched/fair.c | 119 | ||||
-rw-r--r-- | kernel/sched/idle.c | 10 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 240 | ||||
-rw-r--r-- | kernel/sched/rt.c | 37 | ||||
-rw-r--r-- | kernel/sched/sched.h | 64 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 18 | ||||
-rw-r--r-- | kernel/sched/topology.c | 11 |
10 files changed, 399 insertions, 261 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f9a1346a5fa9..80b60ca7767f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include <asm/tlb.h> #include "../workqueue_internal.h" +#include "../../fs/io-wq.h" #include "../smpboot.h" #include "pelt.h" @@ -1065,7 +1066,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - if (!p->uclamp[clamp_id].active) { + if (p->uclamp[clamp_id].active) { uclamp_rq_dec_id(rq, p, clamp_id); uclamp_rq_inc_id(rq, p, clamp_id); } @@ -1073,6 +1074,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) task_rq_unlock(rq, p, &rf); } +#ifdef CONFIG_UCLAMP_TASK_GROUP static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css, unsigned int clamps) @@ -1091,7 +1093,6 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css, css_task_iter_end(&it); } -#ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css); static void uclamp_update_root_tg(void) { @@ -1656,7 +1657,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1677,7 +1679,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ @@ -3254,7 +3255,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) /* Task is done with its stack. */ put_task_stack(prev); - put_task_struct(prev); + put_task_struct_rcu_user(prev); } tick_nohz_task_switch(); @@ -3358,15 +3359,15 @@ context_switch(struct rq *rq, struct task_struct *prev, else prev->active_mm = NULL; } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting - * rq->curr and returning to userspace. + * rq->curr / membarrier_switch_mm() and returning to userspace. * * The below provides this either through switch_mm(), or in * case 'prev->active_mm == next->mm' through * finish_task_switch()'s mmdrop(). */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel @@ -3929,13 +3930,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: +#ifdef CONFIG_SMP /* - * Ensure that we put DL/RT tasks before the pick loop, such that they - * can PULL higher prio tasks when we lower the RQ 'priority'. + * We must do the balancing pass before put_next_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. */ - prev->sched_class->put_prev_task(rq, prev, rf); - if (!rq->nr_running) - newidle_balance(rq, rf); + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); for_each_class(class) { p = class->pick_next_task(rq, NULL, NULL); @@ -4042,7 +4052,11 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; - rq->curr = next; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -4099,9 +4113,12 @@ static inline void sched_submit_work(struct task_struct *tsk) * we disable preemption to avoid it calling schedule() again * in the possible wakeup of a kworker. */ - if (tsk->flags & PF_WQ_WORKER) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - wq_worker_sleeping(tsk); + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else + io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); } @@ -4118,8 +4135,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & PF_WQ_WORKER) - wq_worker_running(tsk); + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); + else + io_wq_worker_running(tsk); + } } asmlinkage __visible void __sched schedule(void) @@ -4223,9 +4244,8 @@ static void __sched notrace preempt_schedule_common(void) #ifdef CONFIG_PREEMPTION /* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. */ asmlinkage __visible void __sched notrace preempt_schedule(void) { @@ -4296,7 +4316,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #endif /* CONFIG_PREEMPTION */ /* - * this is the entry point to schedule() from kernel preemption + * This is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. @@ -5103,9 +5123,6 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a u32 size; int ret; - if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); @@ -5113,45 +5130,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return ret; - /* Bail out on silly large: */ - if (size > PAGE_SIZE) - goto err_size; - /* ABI compatibility quirk: */ if (!size) size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -5351,7 +5342,7 @@ sched_attr_copy_to_user(struct sched_attr __user *uattr, * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. + * @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, @@ -6036,10 +6027,11 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + __sched_fork(0, idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); - __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; @@ -6069,7 +6061,8 @@ void init_idle(struct task_struct *idle, int cpu) __set_task_cpu(idle, cpu); rcu_read_unlock(); - rq->curr = rq->idle = idle; + rq->idle = idle; + rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP idle->on_cpu = 1; @@ -6226,7 +6219,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) for_each_class(class) { next = class->pick_next_task(rq, NULL, NULL); if (next) { - next->sched_class->put_prev_task(rq, next, NULL); + next->sched_class->put_prev_task(rq, next); return next; } } @@ -6430,8 +6423,6 @@ int sched_cpu_activate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); - update_max_interval(); - return 0; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2305ce89a26c..46ed4e1383e2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -740,7 +740,7 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); /* We might have scheduled out from guest path */ - if (current->flags & PF_VCPU) + if (tsk->flags & PF_VCPU) vtime_account_guest(tsk, vtime); else __vtime_account_system(tsk, vtime); @@ -783,7 +783,7 @@ void vtime_guest_enter(struct task_struct *tsk) */ write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk, vtime); - current->flags |= PF_VCPU; + tsk->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -794,7 +794,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); - current->flags &= ~PF_VCPU; + tsk->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2dc48720f189..a8a08030a8f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1691,6 +1691,22 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1758,45 +1774,28 @@ static struct task_struct * pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; - struct dl_rq *dl_rq; WARN_ON_ONCE(prev || rf); - dl_rq = &rq->dl; - - if (unlikely(!dl_rq->dl_nr_running)) + if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); - p = dl_task_of(dl_se); - set_next_task_dl(rq, p); - return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet started the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - } } /* @@ -2442,6 +2441,7 @@ const struct sched_class dl_sched_class = { .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP + .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bbf68c3161..69a81a5709ff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static void attach_entity_cfs_rq(struct sched_entity *se); /* @@ -1603,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env, return; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; @@ -4354,21 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4376,15 +4370,6 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - - return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} - /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4476,7 +4461,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; @@ -4942,20 +4926,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. + */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; @@ -4994,15 +4986,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - u64 overrun; - lockdep_assert_held(&cfs_b->lock); if (cfs_b->period_active) return; cfs_b->period_active = 1; - overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } @@ -5080,11 +5070,6 @@ static inline bool cfs_bandwidth_used(void) return false; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - return rq_clock_task(rq_of(cfs_rq)); -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -6412,7 +6397,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0) { + if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { cur_delta = compute_energy(p, max_spare_cap_cpu, pd); cur_delta -= base_energy_pd; if (cur_delta < best_delta) { @@ -6585,6 +6570,15 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } + +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (rq->nr_running) + return 1; + + return newidle_balance(rq, rf) != 0; +} #endif /* CONFIG_SMP */ static unsigned long wakeup_gran(struct sched_entity *se) @@ -6761,7 +6755,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - if (!cfs_rq->nr_running) + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6899,7 +6893,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -7554,6 +7548,19 @@ static void update_blocked_averages(int cpu) update_rq_clock(rq); /* + * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure + * that RT, DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); + update_irq_load_avg(rq, 0); + + /* Don't need periodic decay once load/util_avg are null */ + if (others_have_blocked(rq)) + done = false; + + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ @@ -7580,14 +7587,6 @@ static void update_blocked_averages(int cpu) done = false; } - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - /* Don't need periodic decay once load/util_avg are null */ - if (others_have_blocked(rq)) - done = false; - update_blocked_load_status(rq, !done); rq_unlock_irqrestore(rq, &rf); } @@ -7648,12 +7647,18 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + /* + * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure + * that RT, DL and IRQ signals have been updated before updating CFS. + */ curr_class = rq->curr->sched_class; update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); + + update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); rq_unlock_irqrestore(rq, &rf); } @@ -10429,11 +10434,11 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, - .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP + .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c892c6280c9f..f65ef1e2f204 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -238,7 +238,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - check_pgt_cache(); rmb(); local_irq_disable(); @@ -366,6 +365,12 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static int +balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return WARN_ON_ONCE(1); +} #endif /* @@ -376,7 +381,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } @@ -461,6 +466,7 @@ const struct sched_class idle_sched_class = { .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP + .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aa8d75804108..168479a7d61b 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,10 +30,42 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_rq_state(void *info) +{ + struct mm_struct *mm = (struct mm_struct *) info; + + if (current->mm != mm) + return; + this_cpu_write(runqueues.membarrier_state, + atomic_read(&mm->membarrier_state)); + /* + * Issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to + * guarantee that no memory access following registration is reordered + * before registration. + */ + smp_mb(); +} + +void membarrier_exec_mmap(struct mm_struct *mm) +{ + /* + * Issue a memory barrier before clearing membarrier_state to + * guarantee that no memory access prior to exec is reordered after + * clearing this state. + */ + smp_mb(); + atomic_set(&mm->membarrier_state, 0); + /* + * Keep the runqueue membarrier_state in sync with this mm + * membarrier_state. + */ + this_cpu_write(runqueues.membarrier_state, 0); +} + static int membarrier_global_expedited(void) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; if (num_online_cpus() == 1) @@ -45,17 +77,11 @@ static int membarrier_global_expedited(void) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -70,23 +96,28 @@ static int membarrier_global_expedited(void) if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); - if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & - MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); - } - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); + if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) + continue; + + /* + * Skip the CPU if it runs a kernel thread. The scheduler + * leaves the prior task mm in place as an optimization when + * scheduling a kthread. + */ + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p->flags & PF_KTHREAD) + continue; + + __cpumask_set_cpu(cpu, tmpmask); } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -101,22 +132,22 @@ static int membarrier_global_expedited(void) static int membarrier_private_expedited(int flags) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; + struct mm_struct *mm = current->mm; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; } else { - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; } - if (num_online_cpus() == 1) + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) return 0; /* @@ -125,17 +156,11 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -149,22 +174,17 @@ static int membarrier_private_expedited(int flags) */ if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); - if (p && p->mm == current->mm) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); - } - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -177,32 +197,78 @@ static int membarrier_private_expedited(int flags) return 0; } +static int sync_runqueues_membarrier_state(struct mm_struct *mm) +{ + int membarrier_state = atomic_read(&mm->membarrier_state); + cpumask_var_t tmpmask; + int cpu; + + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { + this_cpu_write(runqueues.membarrier_state, membarrier_state); + + /* + * For single mm user, we can simply issue a memory barrier + * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the + * mm and in the current runqueue to guarantee that no memory + * access following registration is reordered before + * registration. + */ + smp_mb(); + return 0; + } + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + /* + * For mm with multiple users, we need to ensure all future + * scheduler executions will observe @mm's new membarrier + * state. + */ + synchronize_rcu(); + + /* + * For each cpu runqueue, if the task's mm match @mm, ensure that all + * @mm's membarrier state set bits are also set in in the runqueue's + * membarrier state. This ensures that a runqueue scheduling + * between threads which are users of @mm has its membarrier state + * updated. + */ + cpus_read_lock(); + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + p = rcu_dereference(rq->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); + + return 0; +} + static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + int ret; if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { - /* - * For single mm user, single threaded process, we can - * simply issue a memory barrier after setting - * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that - * no memory access following registration is reordered - * before registration. - */ - smp_mb(); - } else { - /* - * For multi-mm user threads, we need to ensure all - * future scheduler executions will observe the new - * thread flag state for this mm. - */ - synchronize_rcu(); - } + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -213,12 +279,15 @@ static int membarrier_register_private_expedited(int flags) { struct task_struct *p = current; struct mm_struct *mm = p->mm; - int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, + ret; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; } /* @@ -226,20 +295,15 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) & state) + if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state) return 0; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, - &mm->membarrier_state); - if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { - /* - * Ensure all future scheduler executions will observe the - * new thread flag state for this process. - */ - synchronize_rcu(); - } - atomic_or(state, &mm->membarrier_state); + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + atomic_or(set_state, &mm->membarrier_state); + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; + atomic_or(ready_state, &mm->membarrier_state); return 0; } @@ -253,8 +317,10 @@ static int membarrier_register_private_expedited(int flags) * command specified does not exist, not available on the running * kernel, or if the command argument is invalid, this system call * returns -EINVAL. For a given command, with flags argument set to 0, - * this system call is guaranteed to always return the same value until - * reboot. + * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to + * always return the same value until reboot. In addition, it can return + * -ENOMEM if there is not enough memory available to perform the system + * call. * * All memory accesses performed in program order from each targeted thread * is guaranteed to be ordered with respect to sys_membarrier(). If we use diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ebaa4e619684..9b8adc01be3d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1469,6 +1469,22 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1552,21 +1568,18 @@ static struct task_struct * pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *p; - struct rt_rq *rt_rq = &rq->rt; WARN_ON_ONCE(prev || rf); - if (!rt_rq->rt_queued) + if (!sched_rt_runnable(rq)) return NULL; p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p); - return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); @@ -1578,18 +1591,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_fla */ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet started the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - } } #ifdef CONFIG_SMP @@ -2366,8 +2367,8 @@ const struct sched_class rt_sched_class = { .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP + .balance = balance_rt, .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3cb895d14a2..c8870c5bd7df 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -911,6 +911,10 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -1723,10 +1727,11 @@ struct sched_class { struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -1769,7 +1774,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev, NULL); + prev->sched_class->put_prev_task(rq, prev); } static inline void set_next_task(struct rq *rq, struct task_struct *next) @@ -1783,8 +1788,12 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) #else #define sched_class_highest (&dl_sched_class) #endif + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = class->next) + #define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) + for_class_range(class, sched_class_highest, NULL) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -1792,6 +1801,25 @@ extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} #ifdef CONFIG_SMP @@ -2438,3 +2466,33 @@ static inline bool sched_energy_enabled(void) static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. + */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7e1cee4e65b2..c0640739e05e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -15,6 +15,12 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } + +static int +balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return sched_stop_runnable(rq); +} #endif /* CONFIG_SMP */ static void @@ -31,16 +37,13 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop) static struct task_struct * pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *stop = rq->stop; - WARN_ON_ONCE(prev || rf); - if (!stop || !task_on_rq_queued(stop)) + if (!sched_stop_runnable(rq)) return NULL; - set_next_task_stop(rq, stop); - - return stop; + set_next_task_stop(rq, rq->stop); + return rq->stop; } static void @@ -60,7 +63,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. */ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *curr = rq->curr; u64 delta_exec; @@ -129,6 +132,7 @@ const struct sched_class stop_sched_class = { .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP + .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b5667a273bf6..49b835f1305f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1948,7 +1948,7 @@ next_level: static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - enum s_alloc alloc_state; + enum s_alloc alloc_state = sa_none; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; @@ -1956,6 +1956,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct sched_domain_topology_level *tl_asym; bool has_asym = false; + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; @@ -2026,7 +2029,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rcu_read_unlock(); if (has_asym) - static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + static_branch_inc_cpuslocked(&sched_asym_cpucapacity); if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", @@ -2121,8 +2124,12 @@ int sched_init_domains(const struct cpumask *cpu_map) */ static void detach_destroy_domains(const struct cpumask *cpu_map) { + unsigned int cpu = cpumask_any(cpu_map); int i; + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) + static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); |