diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 441 |
1 files changed, 414 insertions, 27 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 599ee3b11b44..c7c68e6b5c51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -296,6 +296,15 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; +/* + * Maximum bandwidth available for all -deadline tasks and groups + * (if group scheduling is configured) on each CPU. + * + * default: 5% + */ +unsigned int sysctl_sched_dl_period = 1000000; +int sysctl_sched_dl_runtime = 50000; + /* @@ -1856,6 +1865,111 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + /* + * Doing this here saves a lot of checks in all + * the calling paths, and returning zero seems + * safe for them anyway. + */ + if (period == 0) + return 0; + + return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->rd->dl_bw; +} + +static inline int __dl_span_weight(struct rq *rq) +{ + return cpumask_weight(rq->rd->span); +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int __dl_span_weight(struct rq *rq) +{ + return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +/* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. 
+ */ +static int dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus = __dl_span_weight(task_rq(p)); + int err = -1; + + if (new_bw == p->dl.dl_bw) + return 0; + + /* + * If a task enters, leaves, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + __dl_add(dl_b, new_bw); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + __dl_clear(dl_b, p->dl.dl_bw); + __dl_add(dl_b, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + __dl_clear(dl_b, p->dl.dl_bw); + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +extern void init_dl_bw(struct dl_bw *dl_b); + /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -3053,6 +3167,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_throttled = 0; dl_se->dl_new = 1; } @@ -3101,7 +3216,9 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or - * greater than deadline. + * greater than deadline.
Furthermore, we have to be sure that + * user parameters are above the internal resolution (1us); we + * check sched_runtime only since it is always the smaller one. */ static bool __checkparam_dl(const struct sched_attr *attr) @@ -3109,7 +3226,8 @@ __checkparam_dl(const struct sched_attr *attr) return attr && attr->sched_deadline != 0 && (attr->sched_period == 0 || (s64)(attr->sched_period - attr->sched_deadline) >= 0) && - (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0; + (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && + attr->sched_runtime >= (2 << (DL_SCALE - 1)); } /* @@ -3250,8 +3368,8 @@ recheck: } change: -#ifdef CONFIG_RT_GROUP_SCHED if (user) { +#ifdef CONFIG_RT_GROUP_SCHED /* * Do not allow realtime tasks into groups that have no runtime * assigned. @@ -3262,8 +3380,33 @@ change: task_rq_unlock(rq, p, &flags); return -EPERM; } - } #endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy)) { + cpumask_t *span = rq->rd->span; + cpumask_t act_affinity; + + /* + * cpus_allowed mask is statically initialized with + * CPU_MASK_ALL, span is instead dynamic. Here we + * compute the "dynamic" affinity of a task. + */ + cpumask_and(&act_affinity, &p->cpus_allowed, + cpu_active_mask); + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_equal(&act_affinity, span) || + rq->rd->dl_bw.bw == 0) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif + } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -3271,6 +3414,18 @@ change: task_rq_unlock(rq, p, &flags); goto recheck; } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. 
+ */ + if ((dl_policy(policy) || dl_task(p)) && + dl_overflow(p, policy, attr)) { + task_rq_unlock(rq, p, &flags); + return -EBUSY; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3705,6 +3860,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_unlock; + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ +#ifdef CONFIG_SMP + if (task_has_dl_policy(p)) { + const struct cpumask *span = task_rq(p)->rd->span; + + if (dl_bandwidth_enabled() && + !cpumask_equal(in_mask, span)) { + retval = -EBUSY; + goto out_unlock; + } + } +#endif + cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); again: @@ -4359,6 +4532,42 @@ out: EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* + * When dealing with a -deadline task, we have to check if moving it to + * a new CPU is possible or not. In fact, this is only true iff there + * is enough bandwidth available on such CPU, otherwise we want the + * whole migration procedure to fail over. + */ +static inline +bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu) +{ + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_bw *cpu_b = dl_bw_of(cpu); + int ret = 1; + u64 bw; + + if (dl_b == cpu_b) + return 1; + + raw_spin_lock(&dl_b->lock); + raw_spin_lock(&cpu_b->lock); + + bw = cpu_b->bw * cpumask_weight(cpu_rq(cpu)->rd->span); + if (dl_bandwidth_enabled() && + bw < cpu_b->total_bw + p->dl.dl_bw) { + ret = 0; + goto unlock; + } + dl_b->total_bw -= p->dl.dl_bw; + cpu_b->total_bw += p->dl.dl_bw; + +unlock: + raw_spin_unlock(&cpu_b->lock); + raw_spin_unlock(&dl_b->lock); + + return ret; +} + +/* * Move (not current) task off this cpu, onto dest cpu.
We're doing * this because either it can't run here any more (set_cpus_allowed() * away from this CPU, or CPU going down), or because we're @@ -4390,6 +4599,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) goto fail; /* + * If p is -deadline, proceed only if there is enough + * bandwidth available on dest_cpu + */ + if (unlikely(dl_task(p)) && !set_task_cpu_dl(p, dest_cpu)) + goto fail; + + /* * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ @@ -5128,6 +5344,8 @@ static int init_rootdomain(struct root_domain *rd) if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; + init_dl_bw(&rd->dl_bw); + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -6557,13 +6775,15 @@ void __init sched_init(void) #endif /* CONFIG_CPUMASK_OFFSTACK */ } + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + init_dl_bandwidth(&def_dl_bandwidth, + global_dl_period(), global_dl_runtime()); + #ifdef CONFIG_SMP init_defrootdomain(); #endif - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -6966,16 +7186,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} -#endif - #ifdef CONFIG_RT_GROUP_SCHED /* * Ensure that the real time constraints are schedulable. @@ -7149,10 +7359,48 @@ static long sched_group_rt_period(struct task_group *tg) do_div(rt_period_us, NSEC_PER_USEC); return rt_period_us; } +#endif /* CONFIG_RT_GROUP_SCHED */ +/* + * Coupling of -rt and -deadline bandwidth. 
+ * + * Here we check if the new -rt bandwidth value is consistent + * with the system settings for the bandwidth available + * to -deadline tasks. + * + * IOW, we want to enforce that + * + * rt_bandwidth + dl_bandwidth <= 100% + * + * is always true. + */ +static bool __sched_rt_dl_global_constraints(u64 rt_bw) +{ + unsigned long flags; + u64 dl_bw; + bool ret; + + raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags); + if (global_rt_runtime() == RUNTIME_INF || + global_dl_runtime() == RUNTIME_INF) { + ret = true; + goto unlock; + } + + dl_bw = to_ratio(def_dl_bandwidth.dl_period, + def_dl_bandwidth.dl_runtime); + + ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF); +unlock: + raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags); + + return ret; +} + +#ifdef CONFIG_RT_GROUP_SCHED static int sched_rt_global_constraints(void) { - u64 runtime, period; + u64 runtime, period, bw; int ret = 0; if (sysctl_sched_rt_period <= 0) @@ -7167,6 +7415,10 @@ static int sched_rt_global_constraints(void) if (runtime > period && runtime != RUNTIME_INF) return -EINVAL; + bw = to_ratio(period, runtime); + if (!__sched_rt_dl_global_constraints(bw)) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); @@ -7189,19 +7441,19 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i; + int i, ret = 0; + u64 bw; if (sysctl_sched_rt_period <= 0) return -EINVAL; - /* - * There's always some RT tasks in the root group - * -- migration, kstopmachine etc.. 
- */ - if (sysctl_sched_rt_runtime == 0) - return -EBUSY; - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + bw = to_ratio(global_rt_period(), global_rt_runtime()); + if (!__sched_rt_dl_global_constraints(bw)) { + ret = -EINVAL; + goto unlock; + } + for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -7209,12 +7461,93 @@ static int sched_rt_global_constraints(void) rt_rq->rt_runtime = global_rt_runtime(); raw_spin_unlock(&rt_rq->rt_runtime_lock); } +unlock: raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; + return ret; } #endif /* CONFIG_RT_GROUP_SCHED */ +/* + * Coupling of -dl and -rt bandwidth. + * + * Here we check, while setting the system wide bandwidth available + * for -dl tasks and groups, if the new values are consistent with + * the system settings for the bandwidth available to -rt entities. + * + * IOW, we want to enforce that + * + * rt_bandwidth + dl_bandwidth <= 100% + * + * is always true. + */ +static bool __sched_dl_rt_global_constraints(u64 dl_bw) +{ + u64 rt_bw; + bool ret; + + raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock); + if (global_dl_runtime() == RUNTIME_INF || + global_rt_runtime() == RUNTIME_INF) { + ret = true; + goto unlock; + } + + rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period), + def_rt_bandwidth.rt_runtime); + + ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF); +unlock: + raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock); + + return ret; +} + +static bool __sched_dl_global_constraints(u64 runtime, u64 period) +{ + if (!period || (runtime != RUNTIME_INF && runtime > period)) + return -EINVAL; + + return 0; +} + +static int sched_dl_global_constraints(void) +{ + u64 runtime = global_dl_runtime(); + u64 period = global_dl_period(); + u64 new_bw = to_ratio(period, runtime); + int ret, i; + + ret = __sched_dl_global_constraints(runtime, period); + if (ret) + return ret; + + if (!__sched_dl_rt_global_constraints(new_bw)) + return -EINVAL; + + /* + * 
Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(i) { + struct dl_bw *dl_b = dl_bw_of(i); + + raw_spin_lock(&dl_b->lock); + if (new_bw < dl_b->total_bw) { + raw_spin_unlock(&dl_b->lock); + return -EBUSY; + } + raw_spin_unlock(&dl_b->lock); + } + + return 0; +} + int sched_rr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -7264,6 +7597,60 @@ int sched_rt_handler(struct ctl_table *table, int write, return ret; } +int sched_dl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + unsigned long flags; + + mutex_lock(&mutex); + old_period = sysctl_sched_dl_period; + old_runtime = sysctl_sched_dl_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, + flags); + + ret = sched_dl_global_constraints(); + if (ret) { + sysctl_sched_dl_period = old_period; + sysctl_sched_dl_runtime = old_runtime; + } else { + u64 new_bw; + int i; + + def_dl_bandwidth.dl_period = global_dl_period(); + def_dl_bandwidth.dl_runtime = global_dl_runtime(); + if (global_dl_runtime() == RUNTIME_INF) + new_bw = -1; + else + new_bw = to_ratio(global_dl_period(), + global_dl_runtime()); + /* + * FIXME: As above... 
+ */ + for_each_possible_cpu(i) { + struct dl_bw *dl_b = dl_bw_of(i); + + raw_spin_lock(&dl_b->lock); + dl_b->bw = new_bw; + raw_spin_unlock(&dl_b->lock); + } + } + + raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, + flags); + } + mutex_unlock(&mutex); + + return ret; +} + #ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |