diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-11-14 09:39:03 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-11-14 09:39:03 -0800 |
commit | fc661f2dcb7e41dcda9ae862efb822bb2f461646 (patch) | |
tree | 2aae84cc82d8c0e2b390d313ea36c2550327da49 /kernel | |
parent | f7018be29253b89175d03284f8f49ac4ffed0472 (diff) | |
parent | a8b76910e465d718effce0cad306a21fa4f3526b (diff) | |
download | linux-fc661f2dcb7e41dcda9ae862efb822bb2f461646.tar.bz2 |
Merge tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Borislav Petkov:
- Avoid touching ~100 config files in order to be able to select the
preemption model
- clear cluster CPU masks too, on the CPU unplug path
- prevent use-after-free in cfs
- Prevent a race condition when updating CPU cache domains
- Factor out common shared part of smp_prepare_cpus() into a common
helper which can be called by both baremetal and Xen, in order to fix
a booting of Xen PV guests
* tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
preempt: Restore preemption model selection configs
arch_topology: Fix missing clear cluster_cpumask in remove_cpu_topology()
sched/fair: Prevent dead task groups from regaining cfs_rq's
sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain()
x86/smp: Factor out parts of native_smp_prepare_cpus()
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.preempt | 42 | ||||
-rw-r--r-- | kernel/sched/autogroup.c | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 53 | ||||
-rw-r--r-- | kernel/sched/fair.c | 4 | ||||
-rw-r--r-- | kernel/sched/rt.c | 12 | ||||
-rw-r--r-- | kernel/sched/sched.h | 3 |
6 files changed, 76 insertions, 40 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 60f1bfc3c7b2..ce77f0265660 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,12 +1,23 @@ # SPDX-License-Identifier: GPL-2.0-only +config PREEMPT_NONE_BUILD + bool + +config PREEMPT_VOLUNTARY_BUILD + bool + +config PREEMPT_BUILD + bool + select PREEMPTION + select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK + choice prompt "Preemption Model" - default PREEMPT_NONE_BEHAVIOUR + default PREEMPT_NONE -config PREEMPT_NONE_BEHAVIOUR +config PREEMPT_NONE bool "No Forced Preemption (Server)" - select PREEMPT_NONE if !PREEMPT_DYNAMIC + select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards throughput. It will still provide good latencies most of the @@ -18,10 +29,10 @@ config PREEMPT_NONE_BEHAVIOUR raw processing power of the kernel, irrespective of scheduling latencies. -config PREEMPT_VOLUNTARY_BEHAVIOUR +config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC + select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new @@ -37,10 +48,10 @@ config PREEMPT_VOLUNTARY_BEHAVIOUR Select this if you are building a kernel for a desktop system. -config PREEMPT_BEHAVIOUR +config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT + select PREEMPT_BUILD help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -58,7 +69,7 @@ config PREEMPT_BEHAVIOUR config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" - depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC + depends on EXPERT && ARCH_SUPPORTS_RT select PREEMPTION help This option turns the kernel into a real-time kernel by replacing @@ -75,17 +86,6 @@ config PREEMPT_RT endchoice -config PREEMPT_NONE - bool - -config PREEMPT_VOLUNTARY - bool - -config PREEMPT - bool - select PREEMPTION - select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK - config PREEMPT_COUNT bool @@ -95,8 +95,8 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" - depends on HAVE_PREEMPT_DYNAMIC - select PREEMPT + depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + select PREEMPT_BUILD default y help This option allows to define the preemption model on the kernel diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2067080bb235..8629b37d118e 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -31,7 +31,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif - sched_offline_group(ag->tg); + sched_release_group(ag->tg); sched_destroy_group(ag->tg); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 523fd602ea90..3c9b0fda64ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3726,6 +3726,9 @@ out: bool cpus_share_cache(int this_cpu, int that_cpu) { + if (this_cpu == that_cpu) + return true; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -6625,13 +6628,13 @@ __setup("preempt=", setup_preempt_mode); static void __init preempt_dynamic_init(void) { if (preempt_dynamic_mode == preempt_dynamic_undefined) { - if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { sched_dynamic_update(preempt_dynamic_none); - } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) { + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { sched_dynamic_update(preempt_dynamic_voluntary); } else { /* Default static call setting, nothing to do */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR)); + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); preempt_dynamic_mode = preempt_dynamic_full; pr_info("Dynamic Preempt: full\n"); } @@ -9716,6 +9719,22 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); } +static void sched_free_group_rcu(struct rcu_head *rcu) +{ + sched_free_group(container_of(rcu, struct task_group, rcu)); +} + +static void sched_unregister_group(struct task_group *tg) +{ + unregister_fair_sched_group(tg); + unregister_rt_sched_group(tg); + /* + * We have to wait for yet another RCU grace period to expire, as + * print_cfs_stats() might run concurrently. + */ + call_rcu(&tg->rcu, sched_free_group_rcu); +} + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -9759,25 +9778,35 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void sched_free_group_rcu(struct rcu_head *rhp) +static void sched_unregister_group_rcu(struct rcu_head *rhp) { /* Now it should be safe to free those cfs_rqs: */ - sched_free_group(container_of(rhp, struct task_group, rcu)); + sched_unregister_group(container_of(rhp, struct task_group, rcu)); } void sched_destroy_group(struct task_group *tg) { /* Wait for possible concurrent references to cfs_rqs complete: */ - call_rcu(&tg->rcu, sched_free_group_rcu); + call_rcu(&tg->rcu, sched_unregister_group_rcu); } -void sched_offline_group(struct task_group *tg) +void sched_release_group(struct task_group *tg) { unsigned long flags; - /* End participation in shares distribution: */ - unregister_fair_sched_group(tg); - + /* + * Unlink first, to avoid walk_tg_tree_from() from finding us (via + * sched_cfs_period_timer()). + * + * For this to be effective, we have to wait for all pending users of + * this task group to leave their RCU critical section to ensure no new + * user will see our dying task group any more. Specifically ensure + * that tg_unthrottle_up() won't add decayed cfs_rq's to it. + * + * We therefore defer calling unregister_fair_sched_group() to + * sched_unregister_group() which is guarantied to get called only after the + * current RCU grace period has expired. + */ spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); @@ -9896,7 +9925,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_offline_group(tg); + sched_release_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) @@ -9906,7 +9935,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) /* * Relies on the RCU grace period between css_released() and this. */ - sched_free_group(tg); + sched_unregister_group(tg); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13950beb01a2..6e476f6d9435 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11456,8 +11456,6 @@ void free_fair_sched_group(struct task_group *tg) { int i; - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -11534,6 +11532,8 @@ void unregister_fair_sched_group(struct task_group *tg) struct rq *rq; int cpu; + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(cpu) { if (tg->se[cpu]) remove_entity_load_avg(tg->se[cpu]); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bb945f8faeca..b48baaba2fc2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -137,13 +137,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) return rt_rq->rq; } -void free_rt_sched_group(struct task_group *tg) +void unregister_rt_sched_group(struct task_group *tg) { - int i; - if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); +} + +void free_rt_sched_group(struct task_group *tg) +{ + int i; + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -250,6 +254,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return &rq->rt; } +void unregister_rt_sched_group(struct task_group *tg) { } + void free_rt_sched_group(struct task_group *tg) { } int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7f1612d26c18..0e66749486e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -488,6 +488,7 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); +extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, @@ -503,7 +504,7 @@ extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); -extern void sched_offline_group(struct task_group *tg); +extern void sched_release_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); |