From a130e8fbc7de796eb6e680724d87f4737a26d0ac Mon Sep 17 00:00:00 2001 From: Josh Don Date: Fri, 27 Aug 2021 09:54:38 -0700 Subject: fs/proc/uptime.c: Fix idle time reporting in /proc/uptime /proc/uptime reports idle time by reading the CPUTIME_IDLE field from the per-cpu kcpustats. However, on NO_HZ systems, idle time is not continually updated on idle cpus, leading this value to appear incorrectly small. /proc/stat performs an accounting update when reading idle time; we can use the same approach for uptime. With this patch, /proc/stat and /proc/uptime now agree on idle time. Additionally, the following shows idle time ticking up consistently on an idle machine: (while true; do cat /proc/uptime; sleep 1; done) | awk '{print $2-prev; prev=$2}' Reported-by: Luigi Rizzo Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Dumazet Link: https://lkml.kernel.org/r/20210827165438.3280779-1-joshdon@google.com --- include/linux/kernel_stat.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44ae1a7eb9e3..69ae6b278464 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -102,6 +102,7 @@ extern void account_system_index_time(struct task_struct *, u64, enum cpu_usage_stat); extern void account_steal_time(u64); extern void account_idle_time(u64); +extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) -- cgit v1.2.3 From ceeadb83aea28372e54857bf88ab7e17af48ab7b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:41 +0000 Subject: sched: Make struct sched_statistics independent of fair sched class If we want to use the schedstats facility to trace other sched classes, we should make it independent of the fair sched class. The struct sched_statistics holds the scheduler statistics of a task_struct or a task_group, so we can move it into struct task_struct and struct task_group to achieve the goal. After this patch, schedstats are organized as follows: struct task_struct { ... struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; ... struct sched_statistics stats; ... }; Regarding the task group, schedstats is only supported for fair group sched, and a new struct sched_entity_stats is introduced, as suggested by Peter: struct sched_entity_stats { struct sched_entity se; struct sched_statistics stats; } __no_randomize_layout; Then, from the se of a task_group, we can easily get the stats. The sched_statistics members may be frequently modified when schedstats is enabled; to avoid disturbing unrelated data that may share a cacheline with them, struct sched_statistics is defined as cacheline aligned. As this patch changes core scheduler structs, I verified its performance impact with 'perf bench sched pipe', as suggested by Mel. Below is the result; all values are in usecs/op. Before After kernel.sched_schedstats=0 5.2~5.4 5.2~5.4 kernel.sched_schedstats=1 5.3~5.5 5.3~5.5 [These numbers differ slightly from the earlier version because my old test machine was destroyed and I had to use a different one.] Almost no impact on scheduler performance. No functional change. 
[lkp@intel.com: reported build failure in earlier version] Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com --- include/linux/sched.h | 6 ++-- kernel/sched/core.c | 25 +++++++------ kernel/sched/deadline.c | 4 +-- kernel/sched/debug.c | 92 +++++++++++++++++++++++++----------------------- kernel/sched/fair.c | 89 +++++++++++++++++++++++++++------------------- kernel/sched/rt.c | 4 +-- kernel/sched/stats.h | 19 ++++++++++ kernel/sched/stop_task.c | 4 +-- 8 files changed, 143 insertions(+), 100 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c1a927ddec64..2bc4c72fec2d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -522,7 +522,7 @@ struct sched_statistics { u64 nr_wakeups_passive; u64 nr_wakeups_idle; #endif -}; +} ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ @@ -538,8 +538,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -803,6 +801,8 @@ struct task_struct { struct uclamp_se uclamp[UCLAMP_CNT]; #endif + struct sched_statistics stats; + #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ccb604a1bd22..a1741e004eaa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3489,11 +3489,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -3505,14 +3505,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -4196,7 +4196,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + memset(&p->stats, 0, sizeof(p->stats)); #endif RB_CLEAR_NODE(&p->dl.rb_node); @@ -9553,9 +9553,9 @@ void normalize_rt_tasks(void) continue; p->se.exec_start = 0; - schedstat_set(p->se.statistics.wait_start, 0); - schedstat_set(p->se.statistics.sleep_start, 0); - schedstat_set(p->se.statistics.block_start, 0); + schedstat_set(p->stats.wait_start, 0); + schedstat_set(p->stats.sleep_start, 0); + schedstat_set(p->stats.block_start, 0); if (!dl_task(p) && !rt_task(p)) { /* @@ -10397,11 +10397,14 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); if (schedstat_enabled() && tg != &root_task_group) { + struct sched_statistics *stats; u64 ws = 0; int i; - for_each_possible_cpu(i) - ws += 
schedstat_val(tg->se[i]->statistics.wait_sum); + for_each_possible_cpu(i) { + stats = __schedstats_from_se(tg->se[i]); + ws += schedstat_val(stats->wait_sum); + } seq_printf(sf, "wait_sum %llu\n", ws); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e94314633b39..51dd30990042 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1265,8 +1265,8 @@ static void update_curr_dl(struct rq *rq) return; } - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 34913a7f775a..76fd38ea844c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -449,9 +449,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group struct sched_entity *se = tg->se[cpu]; #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) +#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \ + #F, (long long)schedstat_val(stats->F)) #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) +#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \ + #F, SPLIT_NS((long long)schedstat_val(stats->F))) if (!se) return; @@ -461,16 +463,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->sum_exec_runtime); if (schedstat_enabled()) { - PN_SCHEDSTAT(se->statistics.wait_start); - PN_SCHEDSTAT(se->statistics.sleep_start); - PN_SCHEDSTAT(se->statistics.block_start); - PN_SCHEDSTAT(se->statistics.sleep_max); - PN_SCHEDSTAT(se->statistics.block_max); - PN_SCHEDSTAT(se->statistics.exec_max); - PN_SCHEDSTAT(se->statistics.slice_max); - PN_SCHEDSTAT(se->statistics.wait_max); - PN_SCHEDSTAT(se->statistics.wait_sum); - P_SCHEDSTAT(se->statistics.wait_count); + struct sched_statistics *stats = __schedstats_from_se(se); + + PN_SCHEDSTAT(wait_start); + PN_SCHEDSTAT(sleep_start); + PN_SCHEDSTAT(block_start); + PN_SCHEDSTAT(sleep_max); + PN_SCHEDSTAT(block_max); + PN_SCHEDSTAT(exec_max); + PN_SCHEDSTAT(slice_max); + PN_SCHEDSTAT(wait_max); + PN_SCHEDSTAT(wait_sum); + P_SCHEDSTAT(wait_count); } P(se->load.weight); @@ -537,9 +541,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), + SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -958,8 +962,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "---------------------------------------------------------" "----------\n"); -#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F)) -#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F)) +#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F)) +#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F)) PN(se.exec_start); PN(se.vruntime); @@ -972,33 +976,33 @@ void proc_sched_show_task(struct task_struct *p, struct 
pid_namespace *ns, if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; - PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); - PN_SCHEDSTAT(se.statistics.wait_start); - PN_SCHEDSTAT(se.statistics.sleep_start); - PN_SCHEDSTAT(se.statistics.block_start); - PN_SCHEDSTAT(se.statistics.sleep_max); - PN_SCHEDSTAT(se.statistics.block_max); - PN_SCHEDSTAT(se.statistics.exec_max); - PN_SCHEDSTAT(se.statistics.slice_max); - PN_SCHEDSTAT(se.statistics.wait_max); - PN_SCHEDSTAT(se.statistics.wait_sum); - P_SCHEDSTAT(se.statistics.wait_count); - PN_SCHEDSTAT(se.statistics.iowait_sum); - P_SCHEDSTAT(se.statistics.iowait_count); - P_SCHEDSTAT(se.statistics.nr_migrations_cold); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); - P_SCHEDSTAT(se.statistics.nr_forced_migrations); - P_SCHEDSTAT(se.statistics.nr_wakeups); - P_SCHEDSTAT(se.statistics.nr_wakeups_sync); - P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); - P_SCHEDSTAT(se.statistics.nr_wakeups_local); - P_SCHEDSTAT(se.statistics.nr_wakeups_remote); - P_SCHEDSTAT(se.statistics.nr_wakeups_affine); - P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); - P_SCHEDSTAT(se.statistics.nr_wakeups_passive); - P_SCHEDSTAT(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(sum_sleep_runtime); + PN_SCHEDSTAT(wait_start); + PN_SCHEDSTAT(sleep_start); + PN_SCHEDSTAT(block_start); + PN_SCHEDSTAT(sleep_max); + PN_SCHEDSTAT(block_max); + PN_SCHEDSTAT(exec_max); + PN_SCHEDSTAT(slice_max); + PN_SCHEDSTAT(wait_max); + PN_SCHEDSTAT(wait_sum); + P_SCHEDSTAT(wait_count); + PN_SCHEDSTAT(iowait_sum); + P_SCHEDSTAT(iowait_count); + P_SCHEDSTAT(nr_migrations_cold); + P_SCHEDSTAT(nr_failed_migrations_affine); + P_SCHEDSTAT(nr_failed_migrations_running); + P_SCHEDSTAT(nr_failed_migrations_hot); + P_SCHEDSTAT(nr_forced_migrations); + P_SCHEDSTAT(nr_wakeups); + P_SCHEDSTAT(nr_wakeups_sync); + P_SCHEDSTAT(nr_wakeups_migrate); + P_SCHEDSTAT(nr_wakeups_local); + P_SCHEDSTAT(nr_wakeups_remote); + P_SCHEDSTAT(nr_wakeups_affine); + P_SCHEDSTAT(nr_wakeups_affine_attempts); + P_SCHEDSTAT(nr_wakeups_passive); + P_SCHEDSTAT(nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -1064,7 +1068,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + memset(&p->stats, 0, sizeof(p->stats)); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61d3e3b97fe5..97c6ecb03bb2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -856,8 +856,13 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->exec_start = now; - schedstat_set(curr->statistics.exec_max, - max(delta_exec, curr->statistics.exec_max)); + if (schedstat_enabled()) { + struct sched_statistics *stats; + + stats = __schedstats_from_se(curr); + __schedstat_set(stats->exec_max, + max(delta_exec, stats->exec_max)); + } curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); @@ -885,39 +890,45 @@ static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { u64 wait_start, prev_wait_start; + struct sched_statistics *stats; if (!schedstat_enabled()) return; + stats = __schedstats_from_se(se); + wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); + prev_wait_start = schedstat_val(stats->wait_start); if (entity_is_task(se) && 
task_on_rq_migrating(task_of(se)) && likely(wait_start > prev_wait_start)) wait_start -= prev_wait_start; - __schedstat_set(se->statistics.wait_start, wait_start); + __schedstat_set(stats->wait_start, wait_start); } static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *p; + struct sched_statistics *stats; + struct task_struct *p = NULL; u64 delta; if (!schedstat_enabled()) return; + stats = __schedstats_from_se(se); + /* * When the sched_schedstat changes from 0 to 1, some sched se * maybe already in the runqueue, the se->statistics.wait_start * will be 0.So it will let the delta wrong. We need to avoid this * scenario. */ - if (unlikely(!schedstat_val(se->statistics.wait_start))) + if (unlikely(!schedstat_val(stats->wait_start))) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); if (entity_is_task(se)) { p = task_of(se); @@ -927,30 +938,33 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. */ - __schedstat_set(se->statistics.wait_start, delta); + __schedstat_set(stats->wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); + __schedstat_set(stats->wait_max, + max(schedstat_val(stats->wait_max), delta)); + __schedstat_inc(stats->wait_count); + __schedstat_add(stats->wait_sum, delta); + __schedstat_set(stats->wait_start, 0); } static inline void update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { + struct sched_statistics *stats; struct task_struct *tsk = NULL; u64 sleep_start, block_start; if (!schedstat_enabled()) return; - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); + stats = __schedstats_from_se(se); + + sleep_start = schedstat_val(stats->sleep_start); + block_start = schedstat_val(stats->block_start); if (entity_is_task(se)) tsk = task_of(se); @@ -961,11 +975,11 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if ((s64)delta < 0) delta = 0; - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); + if (unlikely(delta > schedstat_val(stats->sleep_max))) + __schedstat_set(stats->sleep_max, delta); - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(stats->sleep_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); @@ -978,16 +992,16 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if ((s64)delta < 0) delta = 0; - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); + if (unlikely(delta > schedstat_val(stats->block_max))) + __schedstat_set(stats->block_max, delta); - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(stats->block_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); if (tsk) { if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - 
__schedstat_inc(se->statistics.iowait_count); + __schedstat_add(stats->iowait_sum, delta); + __schedstat_inc(stats->iowait_count); trace_sched_stat_iowait(tsk, delta); } @@ -1049,10 +1063,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* XXX racy against TTWU */ state = READ_ONCE(tsk->__state); if (state & TASK_INTERRUPTIBLE) - __schedstat_set(se->statistics.sleep_start, + __schedstat_set(tsk->stats.sleep_start, rq_clock(rq_of(cfs_rq))); if (state & TASK_UNINTERRUPTIBLE) - __schedstat_set(se->statistics.block_start, + __schedstat_set(tsk->stats.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -4530,8 +4544,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - __schedstat_set(se->statistics.slice_max, - max((u64)se->statistics.slice_max, + struct sched_statistics *stats; + + stats = __schedstats_from_se(se); + __schedstat_set(stats->slice_max, + max((u64)stats->slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime)); } @@ -6046,12 +6063,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); - schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->stats.nr_wakeups_affine_attempts); if (target == nr_cpumask_bits) return prev_cpu; schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); + schedstat_inc(p->stats.nr_wakeups_affine); return target; } @@ -7855,7 +7872,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; - schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->stats.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -7889,7 +7906,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { - schedstat_inc(p->se.statistics.nr_failed_migrations_running); + schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -7911,12 +7928,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->se.statistics.nr_forced_migrations); + schedstat_inc(p->stats.nr_forced_migrations); } return 1; } - schedstat_inc(p->se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->stats.nr_failed_migrations_hot); return 0; } @@ -11457,7 +11474,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!cfs_rq) goto err; - se = kzalloc_node(sizeof(struct sched_entity), + se = kzalloc_node(sizeof(struct sched_entity_stats), GFP_KERNEL, cpu_to_node(i)); if (!se) goto err_free_rq; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3daf42a0f462..95a7c3ad2dc3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1009,8 +1009,8 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index d8f8eb0c655b..fb6022e860af 100644 --- 
a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -41,6 +41,7 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) #else /* !CONFIG_SCHEDSTATS: */ + static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { } static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } @@ -53,8 +54,26 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_set(var, val) do { } while (0) # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 + #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_FAIR_GROUP_SCHED +struct sched_entity_stats { + struct sched_entity se; + struct sched_statistics stats; +} __no_randomize_layout; +#endif + +static inline struct sched_statistics * +__schedstats_from_se(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) + return &container_of(se, struct sched_entity_stats, se)->stats; +#endif + return &task_of(se)->stats; +} + #ifdef CONFIG_PSI /* * PSI tracks state that persists across sleeps, such as iowaits and diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index f988ebe3febb..0b165a25f22f 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -78,8 +78,8 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- cgit v1.2.3 From 847fc0cd0664fcb2a08ac66df6b85935361ec454 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:43 +0000 Subject: sched: Introduce task block time in schedstats Currently in schedstats we have sum_sleep_runtime and iowait_sum, but there's no metric to show how long the task is in D state. Once a task is in D state, it is blocked in the kernel; for example, it may be waiting for a mutex. The D state is more frequent than iowait, and it is more critical than S state, so it is worth adding a metric to measure it. 
Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210905143547.4668-5-laoar.shao@gmail.com --- include/linux/sched.h | 2 ++ kernel/sched/debug.c | 6 ++++-- kernel/sched/stats.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2bc4c72fec2d..193e16e2d0e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -503,6 +503,8 @@ struct sched_statistics { u64 block_start; u64 block_max; + s64 sum_block_runtime; + u64 exec_max; u64 slice_max; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 76fd38ea844c..26fac5e28bc0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -540,10 +540,11 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -977,6 +978,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, u64 avg_atom, avg_per_cpu; PN_SCHEDSTAT(sum_sleep_runtime); + PN_SCHEDSTAT(sum_block_runtime); PN_SCHEDSTAT(wait_start); PN_SCHEDSTAT(sleep_start); PN_SCHEDSTAT(block_start); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index fad781ca7791..07dde2928c79 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -82,6 +82,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, __schedstat_set(stats->block_start, 0); __schedstat_add(stats->sum_sleep_runtime, delta); + __schedstat_add(stats->sum_block_runtime, delta); if (p) { if (p->in_iowait) { -- cgit v1.2.3 From 8d491de6edc27138806cae6e8eca455beb325b62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2021 14:24:32 +0200 Subject: sched: Move mmdrop to RCU on RT mmdrop() is invoked from finish_task_switch() by the incoming task to drop the mm which was handed over by the previous task. mmdrop() can be quite expensive, which prevents an incoming real-time task from getting useful work done. Provide mmdrop_sched() which maps to mmdrop() on !RT kernels. On RT kernels it delegates the eventually required invocation of __mmdrop() to RCU. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928122411.648582026@linutronix.de --- include/linux/mm_types.h | 4 ++++ include/linux/sched/mm.h | 29 +++++++++++++++++++++++++++++ kernel/sched/core.c | 2 +- 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..e9672de22cf2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -572,6 +573,9 @@ struct mm_struct { bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head delayed_drop; +#endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5561486fddef..aca874d33fe6 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,35 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT +/* + * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is + * by far the least expensive way to do that. + */ +static inline void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} + +/* + * Invoked from finish_task_switch(). Delegates the heavy lifting on RT + * kernels via RCU. + */ +static inline void mmdrop_sched(struct mm_struct *mm) +{ + /* Provides a full memory barrier. See mmdrop() */ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +static inline void mmdrop_sched(struct mm_struct *mm) +{ + mmdrop(mm); +} +#endif + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95f4e16f98a2..9eaeba671001 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop(mm); + mmdrop_sched(mm); } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) -- cgit v1.2.3 From 9b3c4ab3045e953670c7de9c1165fae5358a7237 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 21:54:32 +0200 Subject: sched,rcu: Rework try_invoke_on_locked_down_task() Give try_invoke_on_locked_down_task() a saner name and have it return an int so that the caller might distinguish between different reasons of failure. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Acked-by: Vasily Gorbik Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929152428.649944917@infradead.org --- include/linux/wait.h | 3 ++- kernel/rcu/tasks.h | 12 ++++++------ kernel/rcu/tree_stall.h | 8 ++++---- kernel/sched/core.c | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 93dab0e9580f..2d0df57c9902 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1160,6 +1160,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i (wait)->flags = 0; \ } while (0) -bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg); +typedef int (*task_call_f)(struct task_struct *p, void *arg); +extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 806160c44b17..171bc848e8e3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -928,7 +928,7 @@ reset_ipi: } /* Callback function for scheduler to check locked-down task. */ -static bool trc_inspect_reader(struct task_struct *t, void *arg) +static int trc_inspect_reader(struct task_struct *t, void *arg) { int cpu = task_cpu(t); bool in_qs = false; @@ -939,7 +939,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // If no chance of heavyweight readers, do it the hard way. if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - return false; + return -EINVAL; // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. @@ -947,7 +947,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) n_heavy_reader_attempts++; if (!ofl && // Check for "running" idle tasks on offline CPUs. !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) - return false; // No quiescent state, do it the hard way. + return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; if (ofl) n_heavy_reader_ofl_updates++; @@ -962,7 +962,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) t->trc_reader_checked = true; if (in_qs) - return true; // Already in quiescent state, done!!! + return 0; // Already in quiescent state, done!!! // The task is in a read-side critical section, so set up its // state so that it will awaken the grace-period kthread upon exit @@ -970,7 +970,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) atomic_inc(&trc_n_readers_need_end); // One more to wait on. WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); - return true; + return 0; } /* Attempt to extract the state for the specified task. */ @@ -992,7 +992,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, // Attempt to nail down the task for inspection. get_task_struct(t); - if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + if (!task_call_func(t, trc_inspect_reader, NULL)) { put_task_struct(t); return; } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 677ee3d8671b..5e2fa6fd97f1 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -240,16 +240,16 @@ struct rcu_stall_chk_rdr { * Report out the state of a not-running task that is stalling the * current RCU grace period. 
*/ -static bool check_slow_task(struct task_struct *t, void *arg) +static int check_slow_task(struct task_struct *t, void *arg) { struct rcu_stall_chk_rdr *rscrp = arg; if (task_curr(t)) - return false; // It is running, so decline to inspect it. + return -EBUSY; // It is running, so decline to inspect it. rscrp->nesting = t->rcu_read_lock_nesting; rscrp->rs = t->rcu_read_unlock_special; rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); - return true; + return 0; } /* @@ -283,7 +283,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); while (i) { t = ts[--i]; - if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) + if (task_call_func(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else pr_cont(" P%d/%d:%c%c%c%c", diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a9aeac3469f..74db3c3afa6d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4110,7 +4110,7 @@ out: } /** - * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * task_call_func - Invoke a function on task in fixed state * @p: Process for which the function is to be invoked, can be @current. * @func: Function to invoke. * @arg: Argument to function. @@ -4123,12 +4123,12 @@ out: * Returns: * Whatever @func returns */ -bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +int task_call_func(struct task_struct *p, task_call_f func, void *arg) { struct rq *rq = NULL; unsigned int state; struct rq_flags rf; - bool ret = false; + int ret; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); -- cgit v1.2.3 From 5de62ea84abd732ded7c5569426fd71c0420f83e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 22:16:02 +0200 Subject: sched,livepatch: Use wake_up_if_idle() Make sure to prod idle CPUs so they call klp_update_patch_state(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Acked-by: Vasily Gorbik Tested-by: Petr Mladek Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929151723.162004989@infradead.org --- include/linux/sched/idle.h | 4 ++++ kernel/livepatch/transition.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 22873d276be6..d73d314d59c6 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -11,7 +11,11 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; +#ifdef CONFIG_SMP extern void wake_up_if_idle(int cpu); +#else +static inline void wake_up_if_idle(int cpu) { } +#endif /* * Idle thread specific functions to determine the need_resched diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 75251e9dbc3c..5683ac0d2566 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -413,8 +413,11 @@ void klp_try_complete_transition(void) for_each_possible_cpu(cpu) { task = idle_task(cpu); if (cpu_online(cpu)) { - if (!klp_try_switch_task(task)) + if (!klp_try_switch_task(task)) { complete = false; + /* Make idle task go through the main loop. 
*/ wake_up_if_idle(cpu); + } } else if (task->patch_state != klp_target_state) { /* offline idle tasks can be switched immediately */ clear_tsk_thread_flag(task, TIF_PATCH_PENDING); -- cgit v1.2.3 From 4ef0c5c6b5ba1f38f0ea1cedad0cad722f00c14a Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 15 Sep 2021 14:40:30 +0800 Subject: kernel/sched: Fix sched_fork() access an invalid sched_task_group There is a small race between copy_process() and sched_fork() where child->sched_task_group points to an already freed pointer. parent doing fork() | someone moving the parent | to another cgroup -------------------------------+------------------------------- copy_process() + dup_task_struct()<1> parent move to another cgroup, and free the old cgroup. <2> + sched_fork() + __set_task_cpu()<3> + task_fork_fair() + sched_slice()<4> In the worst case, this bug can lead to a "use-after-free" and cause a panic, as shown above: (1) the parent copies its sched_task_group to the child at <1>; (2) someone moves the parent to another cgroup and frees the old cgroup at <2>; (3) the sched_task_group and cfs_rq that belong to the old cgroup will be accessed at <3> and <4>, which causes a panic: [] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [] PGD 8000001fa0a86067 P4D 8000001fa0a86067 PUD 2029955067 PMD 0 [] Oops: 0000 [#1] SMP PTI [] CPU: 7 PID: 648398 Comm: ebizzy Kdump: loaded Tainted: G OE --------- - - 4.18.0.x86_64+ #1 [] RIP: 0010:sched_slice+0x84/0xc0 [] Call Trace: [] task_fork_fair+0x81/0x120 [] sched_fork+0x132/0x240 [] copy_process.part.5+0x675/0x20e0 [] ? __handle_mm_fault+0x63f/0x690 [] _do_fork+0xcd/0x3b0 [] do_syscall_64+0x5d/0x1d0 [] entry_SYSCALL_64_after_hwframe+0x65/0xca [] RIP: 0033:0x7f04418cd7e1 Between cgroup_can_fork() and cgroup_post_fork(), the cgroup membership and thus sched_task_group can't change. So update the child's sched_task_group at sched_post_fork() and move task_fork() and __set_task_cpu() (which access the sched_task_group) from sched_fork() to sched_post_fork(). 
Fixes: 8323f26ce342 ("sched: Fix race in task_group") Signed-off-by: Zhang Qiao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Link: https://lkml.kernel.org/r/20210915064030.2231-1-zhangqiao22@huawei.com --- include/linux/sched/task.h | 3 ++- kernel/fork.c | 2 +- kernel/sched/core.c | 43 ++++++++++++++++++++++--------------------- 3 files changed, 25 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ef02be869cf2..ba88a6987400 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,7 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p); +extern void sched_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index 38681ad44c76..0e4251dc5436 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2405,7 +2405,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - sched_post_fork(p); + sched_post_fork(p, args); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b55ef99c03f..935c2da00339 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4343,8 +4343,6 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { - unsigned long flags; - __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -4390,24 +4388,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); - /* - * The child is not yet in the pid-hash so no cgroup attach races, - * and the cgroup is pinned to this child due to cgroup_fork() - * is ran before sched_fork(). - * - * Silence PROVE_RCU. - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - rseq_migrate(p); - /* - * We're setting the CPU for the first time, we don't migrate, - * so use __set_task_cpu(). - */ - __set_task_cpu(p, smp_processor_id()); - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4423,8 +4403,29 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p) +void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) { + unsigned long flags; +#ifdef CONFIG_CGROUP_SCHED + struct task_group *tg; +#endif + + raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_CGROUP_SCHED + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + p->sched_task_group = autogroup_task_group(p, tg); +#endif + rseq_migrate(p); + /* + * We're setting the CPU for the first time, we don't migrate, + * so use __set_task_cpu(). 
+ */ + __set_task_cpu(p, smp_processor_id()); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + uclamp_post_fork(p); } -- cgit v1.2.3 From 804bccba71a57e7e5deb507a4c8ebbab730909c0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Sep 2021 19:54:50 -0700 Subject: sched: Fill unconditional hole induced by sched_entity With struct sched_entity before the other sched entities, its alignment won't induce a struct hole. This saves 64 bytes in defconfig task_struct: Before: ... unsigned int rt_priority; /* 120 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ const struct sched_class * sched_class; /* 128 8 */ /* XXX 56 bytes hole, try to pack */ /* --- cacheline 3 boundary (192 bytes) --- */ struct sched_entity se __attribute__((__aligned__(64))); /* 192 448 */ /* --- cacheline 10 boundary (640 bytes) --- */ struct sched_rt_entity rt; /* 640 48 */ struct sched_dl_entity dl __attribute__((__aligned__(8))); /* 688 224 */ /* --- cacheline 14 boundary (896 bytes) was 16 bytes ago --- */ After: ... unsigned int rt_priority; /* 120 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ struct sched_entity se __attribute__((__aligned__(64))); /* 128 448 */ /* --- cacheline 9 boundary (576 bytes) --- */ struct sched_rt_entity rt; /* 576 48 */ struct sched_dl_entity dl __attribute__((__aligned__(8))); /* 624 224 */ /* --- cacheline 13 boundary (832 bytes) was 16 bytes ago --- */ Summary diff: - /* size: 7040, cachelines: 110, members: 188 */ + /* size: 6976, cachelines: 109, members: 188 */ Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210924025450.4138503-1-keescook@chromium.org --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 193e16e2d0e4..343603f77f8b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -775,10 +775,10 @@ struct task_struct { int normal_prio; unsigned int rt_priority; - const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; -- cgit v1.2.3 From 42a20f86dc19f9282d974df0ba4d226c865ab9dd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 29 Sep 2021 15:02:14 -0700 Subject: sched: Add wrapper for get_wchan() to keep task blocked Having a stable wchan means the process must be blocked and for it to stay that way while performing stack unwinding. 
Suggested-by: Peter Zijlstra Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Acked-by: Geert Uytterhoeven Acked-by: Russell King (Oracle) [arm] Tested-by: Mark Rutland [arm64] Link: https://lkml.kernel.org/r/20211008111626.332092234@infradead.org --- arch/alpha/include/asm/processor.h | 2 +- arch/alpha/kernel/process.c | 5 ++--- arch/arc/include/asm/processor.h | 2 +- arch/arc/kernel/stacktrace.c | 4 ++-- arch/arm/include/asm/processor.h | 2 +- arch/arm/kernel/process.c | 4 +--- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/kernel/process.c | 4 +--- arch/csky/include/asm/processor.h | 2 +- arch/csky/kernel/stacktrace.c | 5 ++--- arch/h8300/include/asm/processor.h | 2 +- arch/h8300/kernel/process.c | 5 +---- arch/hexagon/include/asm/processor.h | 2 +- arch/hexagon/kernel/process.c | 4 +--- arch/ia64/include/asm/processor.h | 2 +- arch/ia64/kernel/process.c | 5 +---- arch/m68k/include/asm/processor.h | 2 +- arch/m68k/kernel/process.c | 4 +--- arch/microblaze/include/asm/processor.h | 2 +- arch/microblaze/kernel/process.c | 2 +- arch/mips/include/asm/processor.h | 2 +- arch/mips/kernel/process.c | 8 +++----- arch/nds32/include/asm/processor.h | 2 +- arch/nds32/kernel/process.c | 7 +------ arch/nios2/include/asm/processor.h | 2 +- arch/nios2/kernel/process.c | 5 +---- arch/openrisc/include/asm/processor.h | 2 +- arch/openrisc/kernel/process.c | 2 +- arch/parisc/include/asm/processor.h | 2 +- arch/parisc/kernel/process.c | 5 +---- arch/powerpc/include/asm/processor.h | 2 +- arch/powerpc/kernel/process.c | 9 +++------ arch/riscv/include/asm/processor.h | 2 +- arch/riscv/kernel/stacktrace.c | 12 +++++------- arch/s390/include/asm/processor.h | 2 +- arch/s390/kernel/process.c | 4 ++-- arch/sh/include/asm/processor_32.h | 2 +- arch/sh/kernel/process_32.c | 5 +---- arch/sparc/include/asm/processor_32.h | 2 +- arch/sparc/include/asm/processor_64.h | 2 +- arch/sparc/kernel/process_32.c | 5 +---- arch/sparc/kernel/process_64.c | 5 +---- arch/um/include/asm/processor-generic.h | 2 +- arch/um/kernel/process.c | 5 +---- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/process.c | 5 +---- arch/xtensa/include/asm/processor.h | 2 +- arch/xtensa/kernel/process.c | 5 +---- include/linux/sched.h | 1 + kernel/sched/core.c | 19 +++++++++++++++++++ 50 files changed, 80 insertions(+), 112 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 6100431da07a..090499c99c1c 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -42,7 +42,7 @@ extern void start_thread(struct pt_regs *, unsigned long, unsigned long); struct task_struct; extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index a5123ea426ce..5f8527081da9 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -376,12 +376,11 @@ thread_saved_pc(struct task_struct *t) } unsigned long -get_wchan(struct task_struct *p) +__get_wchan(struct task_struct *p) { unsigned long schedule_frame; unsigned long pc; - if (!p || p == current || task_is_running(p)) - return 0; + /* * This one depends on the frame size of schedule(). Do a * "disass schedule" in gdb to find the frame size. 
Also, the diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index f28afcf5c6d1..54db9d7bb562 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -70,7 +70,7 @@ struct task_struct; extern void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long usp); -extern unsigned int get_wchan(struct task_struct *p); +extern unsigned int __get_wchan(struct task_struct *p); #endif /* !__ASSEMBLY__ */ diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index c376ff3147e7..5372dc04e784 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -15,7 +15,7 @@ * = specifics of data structs where trace is saved(CONFIG_STACKTRACE etc) * * vineetg: March 2009 - * -Implemented correct versions of thread_saved_pc() and get_wchan() + * -Implemented correct versions of thread_saved_pc() and __get_wchan() * * rajeshwarr: 2008 * -Initial implementation @@ -248,7 +248,7 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) * Of course just returning schedule( ) would be pointless so unwind until * the function is not in schedular code */ -unsigned int get_wchan(struct task_struct *tsk) +unsigned int __get_wchan(struct task_struct *tsk) { return arc_unwind_core(tsk, NULL, __get_first_nonsched, NULL); } diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 9e6b97286307..6af68edfa53a 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -84,7 +84,7 @@ struct task_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 0e2d3051741e..96f577e59595 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -276,13 +276,11 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct stackframe frame; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ee2bdc1b9f5b..55ca034238ea 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -257,7 +257,7 @@ struct task_struct; /* Free all resources held by a thread. 
*/ extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); void update_sctlr_el1(u64 sctlr); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 40adb8cdbf5a..aacf2f5559a8 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -528,13 +528,11 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, return last; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct stackframe frame; unsigned long stack_page, ret = 0; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; stack_page = (unsigned long)try_get_task_stack(p); if (!stack_page) diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index 9e933021fe8e..817dd60ff152 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -81,7 +81,7 @@ static inline void release_thread(struct task_struct *dead_task) extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->usp) diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c index 1b280ef08004..9f78f5d21511 100644 --- a/arch/csky/kernel/stacktrace.c +++ b/arch/csky/kernel/stacktrace.c @@ -111,12 +111,11 @@ static bool save_wchan(unsigned long pc, void *arg) return false; } -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc = 0; - if (likely(task && task != current && !task_is_running(task))) - walk_stackframe(task, NULL, save_wchan, &pc); + walk_stackframe(task, NULL, save_wchan, &pc); return pc; } diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index a060b41b2d31..141a23eb62b7 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -105,7 +105,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) \ ({ \ diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 2ac27e4248a4..8833fa4f5d51 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -128,15 +128,12 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - stack_page = (unsigned long)p; fp = ((struct pt_regs *)p->thread.ksp)->er6; do { diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index 9f0cc99420be..615f7e49968e 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -64,7 +64,7 @@ struct thread_struct { extern void release_thread(struct task_struct *dead_task); /* Get wait channel for task P. */ -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); /* The following stuff is pretty HEXAGON specific. 
*/ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 6a6835fb4242..232dfd8956aa 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -130,13 +130,11 @@ void flush_thread(void) * is an identification of the point at which the scheduler * was invoked by a blocked thread. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; stack_page = (unsigned long)task_stack_page(p); fp = ((struct hexagon_switch_stack *)p->thread.switch_sp)->fp; diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 2d8bcdc27d7f..45365c2ef598 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -330,7 +330,7 @@ struct task_struct; #define release_thread(dead_task) /* Get wait channel for task P. */ -extern unsigned long get_wchan (struct task_struct *p); +extern unsigned long __get_wchan (struct task_struct *p); /* Return instruction pointer of blocked task TSK. */ #define KSTK_EIP(tsk) \ diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index e56d63f4abf9..834df24a88f1 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -523,15 +523,12 @@ exit_thread (struct task_struct *tsk) } unsigned long -get_wchan (struct task_struct *p) +__get_wchan (struct task_struct *p) { struct unw_frame_info info; unsigned long ip; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - /* * Note: p may not be a blocked task (it could be current or * another process running on some other CPU. Rather than diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index f4d82c619a5c..ffeda9aa526a 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -150,7 +150,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) \ ({ \ diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 1ab692b952cd..a6030dbaa089 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -263,13 +263,11 @@ int dump_fpu (struct pt_regs *regs, struct user_m68kfp_struct *fpu) } EXPORT_SYMBOL(dump_fpu); -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; stack_page = (unsigned long)task_stack_page(p); fp = ((struct switch_stack *)p->thread.ksp)->a6; diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 06c6e493590a..7e9e92670df3 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -68,7 +68,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); /* The size allocated for kernel stacks. This _must_ be a power of two! 
*/ # define KERNEL_STACK_SIZE 0x2000 diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 62aa237180b6..5e2b91c1e8ce 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -112,7 +112,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { /* TBD (used by procfs) */ return 0; diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 0c3550c82b72..252ed38ce8c5 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -369,7 +369,7 @@ static inline void flush_thread(void) { } -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define __KSTK_TOS(tsk) ((unsigned long)task_stack_page(tsk) + \ THREAD_SIZE - 32 - sizeof(struct pt_regs)) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 95aa86fa6077..cbff1b974f88 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -511,7 +511,7 @@ static int __init frame_info_init(void) /* * Without schedule() frame info, result given by - * thread_saved_pc() and get_wchan() are not reliable. + * thread_saved_pc() and __get_wchan() are not reliable. */ if (schedule_mfi.pc_offset < 0) printk("Can't analyze schedule() prologue at %p\n", schedule); @@ -652,9 +652,9 @@ unsigned long unwind_stack(struct task_struct *task, unsigned long *sp, #endif /* - * get_wchan - a maintenance nightmare^W^Wpain in the ass ... + * __get_wchan - a maintenance nightmare^W^Wpain in the ass ... */ -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc = 0; #ifdef CONFIG_KALLSYMS @@ -662,8 +662,6 @@ unsigned long get_wchan(struct task_struct *task) unsigned long ra = 0; #endif - if (!task || task == current || task_is_running(task)) - goto out; if (!task_stack_page(task)) goto out; diff --git a/arch/nds32/include/asm/processor.h b/arch/nds32/include/asm/processor.h index b82369c7659d..e6bfc74972bb 100644 --- a/arch/nds32/include/asm/processor.h +++ b/arch/nds32/include/asm/processor.h @@ -83,7 +83,7 @@ extern struct task_struct *last_task_used_math; /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c index 391895b54d13..49fab9e39cbf 100644 --- a/arch/nds32/kernel/process.c +++ b/arch/nds32/kernel/process.c @@ -233,15 +233,12 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t * fpu) EXPORT_SYMBOL(dump_fpu); -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, lr; unsigned long stack_start, stack_end; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - if (IS_ENABLED(CONFIG_FRAME_POINTER)) { stack_start = (unsigned long)end_of_stack(p); stack_end = (unsigned long)task_stack_page(p) + THREAD_SIZE; @@ -258,5 +255,3 @@ unsigned long get_wchan(struct task_struct *p) } return 0; } - -EXPORT_SYMBOL(get_wchan); diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index 94bcb86f679f..b8125dfbcad2 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -69,7 
+69,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 9ff37ba2bb60..f8ea522a1588 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -217,15 +217,12 @@ void dump(struct pt_regs *fp) pr_emerg("\n\n"); } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - stack_page = (unsigned long)p; fp = ((struct switch_stack *)p->thread.ksp)->fp; /* ;dgt2 */ do { diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index ad53b3184885..aa1699c18add 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -73,7 +73,7 @@ struct thread_struct { void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index b0698d9ce14f..3c0c91bcdcba 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -263,7 +263,7 @@ void dump_elf_thread(elf_greg_t *dest, struct pt_regs* regs) dest[35] = 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { /* TODO */ diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index eeb7da064289..85a2dbfe5278 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -273,7 +273,7 @@ struct mm_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.regs.iaoq[0]) #define KSTK_ESP(tsk) ((tsk)->thread.regs.gr[30]) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 38ec4ae81239..4f562dee65a2 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -240,15 +240,12 @@ copy_thread(unsigned long clone_flags, unsigned long usp, } unsigned long -get_wchan(struct task_struct *p) +__get_wchan(struct task_struct *p) { struct unwind_frame_info info; unsigned long ip; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - /* * These bracket the sleeping functions.. */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index f348e564f7dd..e39bd0ff69f3 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -300,7 +300,7 @@ struct thread_struct { #define task_pt_regs(tsk) ((tsk)->thread.regs) -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.regs? (tsk)->thread.regs->nip: 0) #define KSTK_ESP(tsk) ((tsk)->thread.regs? 
(tsk)->thread.regs->gpr[1]: 0) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 50436b52c213..406d7ee9e322 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2111,14 +2111,11 @@ int validate_sp(unsigned long sp, struct task_struct *p, EXPORT_SYMBOL(validate_sp); -static unsigned long __get_wchan(struct task_struct *p) +static unsigned long ___get_wchan(struct task_struct *p) { unsigned long ip, sp; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - sp = p->thread.ksp; if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) return 0; @@ -2137,14 +2134,14 @@ static unsigned long __get_wchan(struct task_struct *p) return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long ret; if (!try_get_task_stack(p)) return 0; - ret = __get_wchan(p); + ret = ___get_wchan(p); put_task_stack(p); diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 46b492c78cbb..0749924d9e55 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -66,7 +66,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); static inline void wait_for_interrupt(void) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 315db3d0229b..0fcdc0233fac 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -128,16 +128,14 @@ static bool save_wchan(void *arg, unsigned long pc) return true; } -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc = 0; - if (likely(task && task != current && !task_is_running(task))) { - if (!try_get_task_stack(task)) - return 0; - walk_stackframe(task, NULL, save_wchan, &pc); - put_task_stack(task); - } + if (!try_get_task_stack(task)) + return 0; + walk_stackframe(task, NULL, save_wchan, &pc); + put_task_stack(task); return pc; } diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 879b8e3f609c..f54c152bf2bf 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -192,7 +192,7 @@ static inline void release_thread(struct task_struct *tsk) { } void guarded_storage_release(struct task_struct *tsk); void gs_load_bc_cb(struct pt_regs *regs); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) \ (task_stack_page(tsk) + THREAD_SIZE) - 1) #define KSTK_EIP(tsk) (task_pt_regs(tsk)->psw.addr) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 350e94d0cac2..e5dd46b1bff8 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -181,12 +181,12 @@ void execve_tail(void) asm volatile("sfpc %0" : : "d" (0)); } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct unwind_state state; unsigned long ip = 0; - if (!p || p == current || task_is_running(p) || !task_stack_page(p)) + if (!task_stack_page(p)) return 0; if (!try_get_task_stack(p)) diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index aa92cc933889..45240ec6b85a 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -180,7 +180,7 @@ static inline void show_code(struct pt_regs 
*regs) } #endif -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->regs[15]) diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 717de05c81f4..1c28e3cddb60 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -182,13 +182,10 @@ __switch_to(struct task_struct *prev, struct task_struct *next) return prev; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long pc; - if (!p || p == current || task_is_running(p)) - return 0; - /* * The same comment as on the Alpha applies here, too ... */ diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index b6242f7771e9..647bf0ac7beb 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -89,7 +89,7 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, /* Free all resources held by a thread. */ #define release_thread(tsk) do { } while(0) -unsigned long get_wchan(struct task_struct *); +unsigned long __get_wchan(struct task_struct *); #define task_pt_regs(tsk) ((tsk)->thread.kregs) #define KSTK_EIP(tsk) ((tsk)->thread.kregs->pc) diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index 5cf145f18f36..ae851e8fce4c 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -183,7 +183,7 @@ do { \ /* Free all resources held by a thread. */ #define release_thread(tsk) do { } while (0) -unsigned long get_wchan(struct task_struct *task); +unsigned long __get_wchan(struct task_struct *task); #define task_pt_regs(tsk) (task_thread_info(tsk)->kregs) #define KSTK_EIP(tsk) (task_pt_regs(tsk)->tpc) diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index bbbe0cfef746..2dc0bf9fe62e 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -365,7 +365,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, return 0; } -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; unsigned long task_base = (unsigned long) task; @@ -373,9 +373,6 @@ unsigned long get_wchan(struct task_struct *task) struct reg_window32 *rw; int count = 0; - if (!task || task == current || task_is_running(task)) - goto out; - fp = task_thread_info(task)->ksp + bias; do { /* Bogus frame pointer? 
*/ diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index d1cc410d2f64..f5b2cac8669f 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -663,7 +663,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; struct thread_info *tp; @@ -671,9 +671,6 @@ unsigned long get_wchan(struct task_struct *task) unsigned long ret = 0; int count = 0; - if (!task || task == current || task_is_running(task)) - goto out; - tp = task_thread_info(task); bias = STACK_BIAS; fp = task_thread_info(task)->ksp + bias; diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index b5cf0ed116d9..579692a40a55 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -106,6 +106,6 @@ extern struct cpuinfo_um boot_cpu_data; #define cache_line_size() (boot_cpu_data.cache_alignment) #define KSTK_REG(tsk, reg) get_thread_reg(reg, &tsk->thread.switch_buf) -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #endif diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 457a38db368b..82107373ac7e 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -364,14 +364,11 @@ unsigned long arch_align_stack(unsigned long sp) } #endif -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long stack_page, sp, ip; bool seen_sched = 0; - if ((p == NULL) || (p == current) || task_is_running(p)) - return 0; - stack_page = (unsigned long) task_stack_page(p); /* Bail if the process has no kernel stack for some reason */ if (stack_page == 0) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9ad2acaaae9b..e81177edc681 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -589,7 +589,7 @@ static inline void load_sp0(unsigned long sp0) /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); /* * Generic CPUID function diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e645925f9f02..a69885a9d711 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -942,13 +942,10 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * because the task might wake up and we might look at a stack * changing under us. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long entry = 0; - if (p == current || task_is_running(p)) - return 0; - stack_trace_save_tsk(p, &entry, 1, 0); return entry; } diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 7f63aca6a0d3..ad15fbc57283 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -215,7 +215,7 @@ struct mm_struct; /* Free all resources held by a thread. 
*/ #define release_thread(thread) do { } while(0) -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->areg[1]) diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 060165340612..47f933fed870 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -298,15 +298,12 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, * These bracket the sleeping functions.. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long sp, pc; unsigned long stack_page = (unsigned long) task_stack_page(p); int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - sp = p->thread.sp; pc = MAKE_PC_FROM_RA(p->thread.ra, p->thread.sp); diff --git a/include/linux/sched.h b/include/linux/sched.h index 343603f77f8b..5f8db54226af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2139,6 +2139,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ extern bool sched_task_on_rq(struct task_struct *p); +extern unsigned long get_wchan(struct task_struct *p); /* * In order to reduce various lock holder preemption latencies provide an diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 935c2da00339..f2611b9cf503 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1966,6 +1966,25 @@ bool sched_task_on_rq(struct task_struct *p) return task_on_rq_queued(p); } +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ip = 0; + unsigned int state; + + if (!p || p == current) + return 0; + + /* Only get wchan if task is blocked and we can keep it that way. */ + raw_spin_lock_irq(&p->pi_lock); + state = READ_ONCE(p->__state); + smp_rmb(); /* see try_to_wake_up() */ + if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) + ip = __get_wchan(p); + raw_spin_unlock_irq(&p->pi_lock); + + return ip; +} + static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) -- cgit v1.2.3 From c5e22feffdd736cb02b98b0f5b375c8ebc858dd4 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 24 Sep 2021 20:51:02 +1200 Subject: topology: Represent clusters of CPUs within a die Both ACPI and DT provide the ability to describe additional layers of topology between that of individual cores and higher level constructs such as the level at which the last level cache is shared. In ACPI this can be represented in PPTT as a Processor Hierarchy Node Structure [1] that is the parent of the CPU cores and in turn has a parent Processor Hierarchy Nodes Structure representing a higher level of topology. For example Kunpeng 920 has 6 or 8 clusters in each NUMA node, and each cluster has 4 cpus. All clusters share L3 cache data, but each cluster has local L3 tag. On the other hand, each clusters will share some internal system bus. 
[Diagram from the commit message: Kunpeng 920 NUMA node made of 4-CPU clusters; each cluster has a private L3 tag partition, while the L3 data array is shared by all clusters in the node.]

That means spreading tasks among clusters will bring more bandwidth while packing tasks within one cluster will lead to smaller cache synchronization latency. So both kernel and userspace will have a chance to leverage this topology to deploy tasks accordingly, achieving either smaller cache latency within one cluster or an even distribution of load among clusters for higher throughput. This patch exposes cluster topology to both kernel and userspace. Libraries like hwloc will learn about clusters through cluster_cpus and the related sysfs attributes. PoC of HWLOC support at [2].

Note this patch only handles the ACPI case. Special consideration is needed for SMT processors, where it is necessary to move 2 levels up the hierarchy from the leaf nodes (thus skipping the processor core level). Note that arm64 / ACPI does not provide any means of identifying a die level in the topology, but that may be unrelated to the cluster level.
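As a userspace illustration of the new interface (a minimal sketch: the sysfs attribute names are the ones added by this series, while the program itself, its error handling and output format are purely illustrative), the cluster placement of a CPU can be read straight from sysfs:

/* Illustrative sketch, not part of this patch: print the cluster id and
 * the cluster siblings of CPU 0 using the sysfs attributes added above.
 */
#include <stdio.h>

static void show(const char *attr)
{
        char path[256], buf[256];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu0/topology/%s", attr);
        f = fopen(path, "r");
        if (!f) {
                /* pre-cluster kernel, or no cluster described by firmware */
                printf("%s: <not available>\n", attr);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s: %s", attr, buf);
        fclose(f);
}

int main(void)
{
        show("cluster_id");
        show("cluster_cpus_list");
        return 0;
}

On a system whose PPTT describes clusters, cluster_cpus_list would be expected to print something like "0-3" for the first cluster in the diagram.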
[1] ACPI Specification 6.3 - section 5.2.29.1 processor hierarchy node structure (Type 0) [2] https://github.com/hisilicon/hwloc/tree/linux-cluster Signed-off-by: Jonathan Cameron Signed-off-by: Tian Tao Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210924085104.44806-2-21cnbao@gmail.com --- Documentation/ABI/stable/sysfs-devices-system-cpu | 15 +++++ Documentation/admin-guide/cputopology.rst | 12 ++-- arch/arm64/kernel/topology.c | 2 + drivers/acpi/pptt.c | 67 +++++++++++++++++++++++ drivers/base/arch_topology.c | 15 +++++ drivers/base/topology.c | 10 ++++ include/linux/acpi.h | 5 ++ include/linux/arch_topology.h | 5 ++ include/linux/topology.h | 6 ++ 9 files changed, 133 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/stable/sysfs-devices-system-cpu b/Documentation/ABI/stable/sysfs-devices-system-cpu index 516dafea03eb..3965ce504484 100644 --- a/Documentation/ABI/stable/sysfs-devices-system-cpu +++ b/Documentation/ABI/stable/sysfs-devices-system-cpu @@ -42,6 +42,12 @@ Description: the CPU core ID of cpuX. Typically it is the hardware platform's architecture and platform dependent. Values: integer +What: /sys/devices/system/cpu/cpuX/topology/cluster_id +Description: the cluster ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. +Values: integer + What: /sys/devices/system/cpu/cpuX/topology/book_id Description: the book ID of cpuX. Typically it is the hardware platform's identifier (rather than the kernel's). The actual value is @@ -85,6 +91,15 @@ Description: human-readable list of CPUs within the same die. The format is like 0-3, 8-11, 14,17. Values: decimal list. +What: /sys/devices/system/cpu/cpuX/topology/cluster_cpus +Description: internal kernel map of CPUs within the same cluster. +Values: hexadecimal bitmask. + +What: /sys/devices/system/cpu/cpuX/topology/cluster_cpus_list +Description: human-readable list of CPUs within the same cluster. + The format is like 0-3, 8-11, 14,17. +Values: decimal list. + What: /sys/devices/system/cpu/cpuX/topology/book_siblings Description: internal kernel map of cpuX's hardware threads within the same book_id. it's only used on s390. 
diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst index b085dbac60a5..6b62e182baf4 100644 --- a/Documentation/admin-guide/cputopology.rst +++ b/Documentation/admin-guide/cputopology.rst @@ -19,11 +19,13 @@ these macros in include/asm-XXX/topology.h:: #define topology_physical_package_id(cpu) #define topology_die_id(cpu) + #define topology_cluster_id(cpu) #define topology_core_id(cpu) #define topology_book_id(cpu) #define topology_drawer_id(cpu) #define topology_sibling_cpumask(cpu) #define topology_core_cpumask(cpu) + #define topology_cluster_cpumask(cpu) #define topology_die_cpumask(cpu) #define topology_book_cpumask(cpu) #define topology_drawer_cpumask(cpu) @@ -39,10 +41,12 @@ not defined by include/asm-XXX/topology.h: 1) topology_physical_package_id: -1 2) topology_die_id: -1 -3) topology_core_id: 0 -4) topology_sibling_cpumask: just the given CPU -5) topology_core_cpumask: just the given CPU -6) topology_die_cpumask: just the given CPU +3) topology_cluster_id: -1 +4) topology_core_id: 0 +5) topology_sibling_cpumask: just the given CPU +6) topology_core_cpumask: just the given CPU +7) topology_cluster_cpumask: just the given CPU +8) topology_die_cpumask: just the given CPU For architectures that don't support books (CONFIG_SCHED_BOOK) there are no default definitions for topology_book_id() and topology_book_cpumask(). diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 4dd14a6620c1..9ab78ad826e2 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -103,6 +103,8 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].thread_id = -1; cpu_topology[cpu].core_id = topology_id; } + topology_id = find_acpi_cpu_topology_cluster(cpu); + cpu_topology[cpu].cluster_id = topology_id; topology_id = find_acpi_cpu_topology_package(cpu); cpu_topology[cpu].package_id = topology_id; diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index fe69dc518f31..701f61c01359 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -746,6 +746,73 @@ int find_acpi_cpu_topology_package(unsigned int cpu) ACPI_PPTT_PHYSICAL_PACKAGE); } +/** + * find_acpi_cpu_topology_cluster() - Determine a unique CPU cluster value + * @cpu: Kernel logical CPU number + * + * Determine a topology unique cluster ID for the given CPU/thread. + * This ID can then be used to group peers, which will have matching ids. + * + * The cluster, if present is the level of topology above CPUs. In a + * multi-thread CPU, it will be the level above the CPU, not the thread. + * It may not exist in single CPU systems. In simple multi-CPU systems, + * it may be equal to the package topology level. + * + * Return: -ENOENT if the PPTT doesn't exist, the CPU cannot be found + * or there is no toplogy level above the CPU.. + * Otherwise returns a value which represents the package for this CPU. 
+ */ + +int find_acpi_cpu_topology_cluster(unsigned int cpu) +{ + struct acpi_table_header *table; + acpi_status status; + struct acpi_pptt_processor *cpu_node, *cluster_node; + u32 acpi_cpu_id; + int retval; + int is_thread; + + status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); + if (ACPI_FAILURE(status)) { + acpi_pptt_warn_missing(); + return -ENOENT; + } + + acpi_cpu_id = get_acpi_id_for_cpu(cpu); + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (cpu_node == NULL || !cpu_node->parent) { + retval = -ENOENT; + goto put_table; + } + + is_thread = cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_IS_THREAD; + cluster_node = fetch_pptt_node(table, cpu_node->parent); + if (cluster_node == NULL) { + retval = -ENOENT; + goto put_table; + } + if (is_thread) { + if (!cluster_node->parent) { + retval = -ENOENT; + goto put_table; + } + cluster_node = fetch_pptt_node(table, cluster_node->parent); + if (cluster_node == NULL) { + retval = -ENOENT; + goto put_table; + } + } + if (cluster_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID) + retval = cluster_node->acpi_processor_id; + else + retval = ACPI_PTR_DIFF(cluster_node, table); + +put_table: + acpi_put_table(table); + + return retval; +} + /** * find_acpi_cpu_topology_hetero_id() - Get a core architecture tag * @cpu: Kernel logical CPU number diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 43407665918f..fc0836f460fb 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -600,6 +600,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return core_mask; } +const struct cpumask *cpu_clustergroup_mask(int cpu) +{ + return &cpu_topology[cpu].cluster_sibling; +} + void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -617,6 +622,12 @@ void update_siblings_masks(unsigned int cpuid) if (cpuid_topo->package_id != cpu_topo->package_id) continue; + if (cpuid_topo->cluster_id == cpu_topo->cluster_id && + cpuid_topo->cluster_id != -1) { + cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling); + cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling); + } + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); @@ -635,6 +646,9 @@ static void clear_cpu_topology(int cpu) cpumask_clear(&cpu_topo->llc_sibling); cpumask_set_cpu(cpu, &cpu_topo->llc_sibling); + cpumask_clear(&cpu_topo->cluster_sibling); + cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling); + cpumask_clear(&cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); @@ -650,6 +664,7 @@ void __init reset_cpu_topology(void) cpu_topo->thread_id = -1; cpu_topo->core_id = -1; + cpu_topo->cluster_id = -1; cpu_topo->package_id = -1; cpu_topo->llc_id = -1; diff --git a/drivers/base/topology.c b/drivers/base/topology.c index 43c0940643f5..8f2b641d0b8c 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -48,6 +48,9 @@ static DEVICE_ATTR_RO(physical_package_id); define_id_show_func(die_id); static DEVICE_ATTR_RO(die_id); +define_id_show_func(cluster_id); +static DEVICE_ATTR_RO(cluster_id); + define_id_show_func(core_id); static DEVICE_ATTR_RO(core_id); @@ -63,6 +66,10 @@ define_siblings_read_func(core_siblings, core_cpumask); static BIN_ATTR_RO(core_siblings, 0); static BIN_ATTR_RO(core_siblings_list, 0); +define_siblings_read_func(cluster_cpus, cluster_cpumask); +static BIN_ATTR_RO(cluster_cpus, 0); +static BIN_ATTR_RO(cluster_cpus_list, 0); + define_siblings_read_func(die_cpus, 
die_cpumask); static BIN_ATTR_RO(die_cpus, 0); static BIN_ATTR_RO(die_cpus_list, 0); @@ -94,6 +101,8 @@ static struct bin_attribute *bin_attrs[] = { &bin_attr_thread_siblings_list, &bin_attr_core_siblings, &bin_attr_core_siblings_list, + &bin_attr_cluster_cpus, + &bin_attr_cluster_cpus_list, &bin_attr_die_cpus, &bin_attr_die_cpus_list, &bin_attr_package_cpus, @@ -112,6 +121,7 @@ static struct bin_attribute *bin_attrs[] = { static struct attribute *default_attrs[] = { &dev_attr_physical_package_id.attr, &dev_attr_die_id.attr, + &dev_attr_cluster_id.attr, &dev_attr_core_id.attr, #ifdef CONFIG_SCHED_BOOK &dev_attr_book_id.attr, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 974d497a897d..fbc2146050a4 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1353,6 +1353,7 @@ static inline int lpit_read_residency_count_address(u64 *address) #ifdef CONFIG_ACPI_PPTT int acpi_pptt_cpu_is_thread(unsigned int cpu); int find_acpi_cpu_topology(unsigned int cpu, int level); +int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); int find_acpi_cpu_cache_topology(unsigned int cpu, int level); @@ -1365,6 +1366,10 @@ static inline int find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; } +static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) +{ + return -EINVAL; +} static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index f180240dc95f..b97cea83b25e 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -62,10 +62,12 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, struct cpu_topology { int thread_id; int core_id; + int cluster_id; int package_id; int llc_id; cpumask_t thread_sibling; cpumask_t core_sibling; + cpumask_t cluster_sibling; cpumask_t llc_sibling; }; @@ -73,13 +75,16 @@ struct cpu_topology { extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) +#define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id) #define topology_core_id(cpu) (cpu_topology[cpu].core_id) #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) +#define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +const struct cpumask *cpu_clustergroup_mask(int cpu); void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); diff --git a/include/linux/topology.h b/include/linux/topology.h index 7634cd737061..80d27d717631 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -186,6 +186,9 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif +#ifndef topology_cluster_id +#define topology_cluster_id(cpu) ((void)(cpu), -1) +#endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif @@ -195,6 +198,9 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +#ifndef topology_cluster_cpumask +#define 
topology_cluster_cpumask(cpu) cpumask_of(cpu) +#endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif -- cgit v1.2.3 From 778c558f49a2cb3dc7b18a80ff515e82aa813627 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 24 Sep 2021 20:51:03 +1200 Subject: sched: Add cluster scheduler level in core and related Kconfig for ARM64 This patch adds scheduler level for clusters and automatically enables the load balance among clusters. It will directly benefit a lot of workload which loves more resources such as memory bandwidth, caches. Testing has widely been done in two different hardware configurations of Kunpeng920: 24 cores in one NUMA(6 clusters in each NUMA node); 32 cores in one NUMA(8 clusters in each NUMA node) Workload is running on either one NUMA node or four NUMA nodes, thus, this can estimate the effect of cluster spreading w/ and w/o NUMA load balance. * Stream benchmark: 4threads stream (on 1NUMA * 24cores = 24cores) stream stream w/o patch w/ patch MB/sec copy 29929.64 ( 0.00%) 32932.68 ( 10.03%) MB/sec scale 29861.10 ( 0.00%) 32710.58 ( 9.54%) MB/sec add 27034.42 ( 0.00%) 32400.68 ( 19.85%) MB/sec triad 27225.26 ( 0.00%) 31965.36 ( 17.41%) 6threads stream (on 1NUMA * 24cores = 24cores) stream stream w/o patch w/ patch MB/sec copy 40330.24 ( 0.00%) 42377.68 ( 5.08%) MB/sec scale 40196.42 ( 0.00%) 42197.90 ( 4.98%) MB/sec add 37427.00 ( 0.00%) 41960.78 ( 12.11%) MB/sec triad 37841.36 ( 0.00%) 42513.64 ( 12.35%) 12threads stream (on 1NUMA * 24cores = 24cores) stream stream w/o patch w/ patch MB/sec copy 52639.82 ( 0.00%) 53818.04 ( 2.24%) MB/sec scale 52350.30 ( 0.00%) 53253.38 ( 1.73%) MB/sec add 53607.68 ( 0.00%) 55198.82 ( 2.97%) MB/sec triad 54776.66 ( 0.00%) 56360.40 ( 2.89%) Thus, it could help memory-bound workload especially under medium load. Similar improvement is also seen in lkp-pbzip2: * lkp-pbzip2 benchmark 2-96 threads (on 4NUMA * 24cores = 96cores) lkp-pbzip2 lkp-pbzip2 w/o patch w/ patch Hmean tput-2 11062841.57 ( 0.00%) 11341817.51 * 2.52%* Hmean tput-5 26815503.70 ( 0.00%) 27412872.65 * 2.23%* Hmean tput-8 41873782.21 ( 0.00%) 43326212.92 * 3.47%* Hmean tput-12 61875980.48 ( 0.00%) 64578337.51 * 4.37%* Hmean tput-21 105814963.07 ( 0.00%) 111381851.01 * 5.26%* Hmean tput-30 150349470.98 ( 0.00%) 156507070.73 * 4.10%* Hmean tput-48 237195937.69 ( 0.00%) 242353597.17 * 2.17%* Hmean tput-79 360252509.37 ( 0.00%) 362635169.23 * 0.66%* Hmean tput-96 394571737.90 ( 0.00%) 400952978.48 * 1.62%* 2-24 threads (on 1NUMA * 24cores = 24cores) lkp-pbzip2 lkp-pbzip2 w/o patch w/ patch Hmean tput-2 11071705.49 ( 0.00%) 11296869.10 * 2.03%* Hmean tput-4 20782165.19 ( 0.00%) 21949232.15 * 5.62%* Hmean tput-6 30489565.14 ( 0.00%) 33023026.96 * 8.31%* Hmean tput-8 40376495.80 ( 0.00%) 42779286.27 * 5.95%* Hmean tput-12 61264033.85 ( 0.00%) 62995632.78 * 2.83%* Hmean tput-18 86697139.39 ( 0.00%) 86461545.74 ( -0.27%) Hmean tput-24 104854637.04 ( 0.00%) 104522649.46 * -0.32%* In the case of 6 threads and 8 threads, we see the greatest performance improvement. 
Similar improvement can be seen on lkp-pixz though the improvement is smaller: * lkp-pixz benchmark 2-24 threads lkp-pixz (on 1NUMA * 24cores = 24cores) lkp-pixz lkp-pixz w/o patch w/ patch Hmean tput-2 6486981.16 ( 0.00%) 6561515.98 * 1.15%* Hmean tput-4 11645766.38 ( 0.00%) 11614628.43 ( -0.27%) Hmean tput-6 15429943.96 ( 0.00%) 15957350.76 * 3.42%* Hmean tput-8 19974087.63 ( 0.00%) 20413746.98 * 2.20%* Hmean tput-12 28172068.18 ( 0.00%) 28751997.06 * 2.06%* Hmean tput-18 39413409.54 ( 0.00%) 39896830.55 * 1.23%* Hmean tput-24 49101815.85 ( 0.00%) 49418141.47 * 0.64%* * SPECrate benchmark 4,8,16 copies mcf_r(on 1NUMA * 32cores = 32cores) Base Base Run Time Rate ------- --------- 4 Copies w/o 580 (w/ 570) w/o 11.1 (w/ 11.3) 8 Copies w/o 647 (w/ 605) w/o 20.0 (w/ 21.4, +7%) 16 Copies w/o 844 (w/ 844) w/o 30.6 (w/ 30.6) 32 Copies(on 4NUMA * 32 cores = 128cores) [w/o patch] Base Base Base Benchmarks Copies Run Time Rate --------------- ------- --------- --------- 500.perlbench_r 32 584 87.2 * 502.gcc_r 32 503 90.2 * 505.mcf_r 32 745 69.4 * 520.omnetpp_r 32 1031 40.7 * 523.xalancbmk_r 32 597 56.6 * 525.x264_r 1 -- CE 531.deepsjeng_r 32 336 109 * 541.leela_r 32 556 95.4 * 548.exchange2_r 32 513 163 * 557.xz_r 32 530 65.2 * Est. SPECrate2017_int_base 80.3 [w/ patch] Base Base Base Benchmarks Copies Run Time Rate --------------- ------- --------- --------- 500.perlbench_r 32 580 87.8 (+0.688%) * 502.gcc_r 32 477 95.1 (+5.432%) * 505.mcf_r 32 644 80.3 (+13.574%) * 520.omnetpp_r 32 942 44.6 (+9.58%) * 523.xalancbmk_r 32 560 60.4 (+6.714%%) * 525.x264_r 1 -- CE 531.deepsjeng_r 32 337 109 (+0.000%) * 541.leela_r 32 554 95.6 (+0.210%) * 548.exchange2_r 32 515 163 (+0.000%) * 557.xz_r 32 524 66.0 (+1.227%) * Est. SPECrate2017_int_base 83.7 (+4.062%) On the other hand, it is slightly helpful to CPU-bound tasks like kernbench: * 24-96 threads kernbench (on 4NUMA * 24cores = 96cores) kernbench kernbench w/o cluster w/ cluster Min user-24 12054.67 ( 0.00%) 12024.19 ( 0.25%) Min syst-24 1751.51 ( 0.00%) 1731.68 ( 1.13%) Min elsp-24 600.46 ( 0.00%) 598.64 ( 0.30%) Min user-48 12361.93 ( 0.00%) 12315.32 ( 0.38%) Min syst-48 1917.66 ( 0.00%) 1892.73 ( 1.30%) Min elsp-48 333.96 ( 0.00%) 332.57 ( 0.42%) Min user-96 12922.40 ( 0.00%) 12921.17 ( 0.01%) Min syst-96 2143.94 ( 0.00%) 2110.39 ( 1.56%) Min elsp-96 211.22 ( 0.00%) 210.47 ( 0.36%) Amean user-24 12063.99 ( 0.00%) 12030.78 * 0.28%* Amean syst-24 1755.20 ( 0.00%) 1735.53 * 1.12%* Amean elsp-24 601.60 ( 0.00%) 600.19 ( 0.23%) Amean user-48 12362.62 ( 0.00%) 12315.56 * 0.38%* Amean syst-48 1921.59 ( 0.00%) 1894.95 * 1.39%* Amean elsp-48 334.10 ( 0.00%) 332.82 * 0.38%* Amean user-96 12925.27 ( 0.00%) 12922.63 ( 0.02%) Amean syst-96 2146.66 ( 0.00%) 2122.20 * 1.14%* Amean elsp-96 211.96 ( 0.00%) 211.79 ( 0.08%) Note this patch isn't an universal win, it might hurt those workload which can benefit from packing. Though tasks which want to take advantages of lower communication latency of one cluster won't necessarily been packed in one cluster while kernel is not aware of clusters, they have some chance to be randomly packed. But this patch will make them more likely spread. 
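To make that closing point about packing concrete, here is a hedged userspace sketch (illustrative only, not part of the patch; the sysfs path comes from the previous patch in this series, and the function name and simple list parser are invented for the example). Cooperating tasks that want the lower intra-cluster communication latency can pin themselves to one cluster explicitly, while unpinned tasks will now tend to be spread across clusters by the new CLS domain added below:

/* Illustrative sketch, not part of this patch: confine the calling thread
 * to the cluster that contains `cpu`, using cluster_cpus_list from sysfs.
 * The parser only handles the "a-b,c" style lists the kernel emits.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>

static int pin_to_cluster(int cpu)
{
        char path[128], buf[256];
        char *tok, *save;
        cpu_set_t set;
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/topology/cluster_cpus_list", cpu);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (!fgets(buf, sizeof(buf), f)) {
                fclose(f);
                return -1;
        }
        fclose(f);

        CPU_ZERO(&set);
        for (tok = strtok_r(buf, ",\n", &save); tok;
             tok = strtok_r(NULL, ",\n", &save)) {
                int lo, hi, n;

                n = sscanf(tok, "%d-%d", &lo, &hi);
                if (n < 1)
                        continue;
                if (n == 1)
                        hi = lo;
                for (; lo <= hi; lo++)
                        CPU_SET(lo, &set);
        }
        return sched_setaffinity(0, sizeof(set), &set);
}

int main(void)
{
        if (pin_to_cluster(0))
                perror("pin_to_cluster");
        return 0;
}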
Signed-off-by: Barry Song Tested-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) --- arch/arm64/Kconfig | 9 +++++++++ include/linux/sched/topology.h | 7 +++++++ include/linux/topology.h | 7 +++++++ kernel/sched/topology.c | 5 +++++ 4 files changed, 28 insertions(+) (limited to 'include') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5c7ae4c3954b..d13677f4731d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -989,6 +989,15 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_CLUSTER + bool "Cluster scheduler support" + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + config SCHED_SMT bool "SMT scheduler support" help diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 8f0f778b7c91..2f9166f6dec8 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -42,6 +42,13 @@ static inline int cpu_smt_flags(void) } #endif +#ifdef CONFIG_SCHED_CLUSTER +static inline int cpu_cluster_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + #ifdef CONFIG_SCHED_MC static inline int cpu_core_flags(void) { diff --git a/include/linux/topology.h b/include/linux/topology.h index 80d27d717631..0b3704ad13c8 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -212,6 +212,13 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif +#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) +static inline const struct cpumask *cpu_cluster_mask(int cpu) +{ + return topology_cluster_cpumask(cpu); +} +#endif + static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5af3edd34d6d..c1729f9a715f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1638,6 +1638,11 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif + +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, +#endif + #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif -- cgit v1.2.3 From 810979682ccc98dbd83f341c18a2e556c30a7164 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Oct 2021 13:18:50 +0200 Subject: irq_work: Allow irq_work_sync() to sleep if irq_work() no IRQ support. irq_work() triggers instantly an interrupt if supported by the architecture. Otherwise the work will be processed on the next timer tick. In worst case irq_work_sync() could spin up to a jiffy. irq_work_sync() is usually used in tear down context which is fully preemptible. Based on review irq_work_sync() is invoked from preemptible context and there is one waiter at a time. This qualifies it to use rcuwait for synchronisation. Let irq_work_sync() synchronize with rcuwait if the architecture processes irqwork via the timer tick. 
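For readers who have not used rcuwait before, the single-waiter pattern the patch adopts looks roughly like the kernel-style sketch below (the struct and function names are invented for illustration; only rcuwait_init(), rcuwait_wait_event() and rcuwait_wake_up() are existing APIs):

/* Illustrative sketch, not code from this patch. */
#include <linux/kernel.h>
#include <linux/rcuwait.h>
#include <linux/sched.h>

struct my_async_op {
        struct rcuwait  wait;   /* plays the role of irq_work::irqwait */
        bool            busy;
};

static void my_async_op_init(struct my_async_op *op)
{
        rcuwait_init(&op->wait);
        op->busy = true;
}

/* completion side: publish !busy, then wake the one possible waiter */
static void my_async_op_complete(struct my_async_op *op)
{
        WRITE_ONCE(op->busy, false);
        rcuwait_wake_up(&op->wait);
}

/* tear-down side: fully preemptible, at most one waiter at a time */
static void my_async_op_sync(struct my_async_op *op)
{
        might_sleep();
        rcuwait_wait_event(&op->wait, !READ_ONCE(op->busy),
                           TASK_UNINTERRUPTIBLE);
}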
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211006111852.1514359-3-bigeasy@linutronix.de --- include/linux/irq_work.h | 3 +++ kernel/irq_work.c | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index ec2a47a81e42..b48955e9c920 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -3,6 +3,7 @@ #define _LINUX_IRQ_WORK_H #include +#include /* * An entry can be in one of four states: @@ -16,11 +17,13 @@ struct irq_work { struct __call_single_node node; void (*func)(struct irq_work *); + struct rcuwait irqwait; }; #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ + .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ } #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index db8c248ebc8c..e789beda8297 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -160,6 +160,9 @@ void irq_work_single(void *arg) * else claimed it meanwhile. */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); + + if (!arch_irq_work_has_interrupt()) + rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) @@ -204,6 +207,13 @@ void irq_work_tick(void) void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); + might_sleep(); + + if (!arch_irq_work_has_interrupt()) { + rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), + TASK_UNINTERRUPTIBLE); + return; + } while (irq_work_is_busy(work)) cpu_relax(); -- cgit v1.2.3 From 09089db79859cbccccd8df95b034f36f7027efa6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Oct 2021 13:18:52 +0200 Subject: irq_work: Also rcuwait for !IRQ_WORK_HARD_IRQ on PREEMPT_RT On PREEMPT_RT most items are processed as LAZY via softirq context. Avoid to spin-wait for them because irq_work_sync() could have higher priority and not allow the irq-work to be completed. Wait additionally for !IRQ_WORK_HARD_IRQ irq_work items on PREEMPT_RT. 
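As a companion illustration (again not from the patch; the handler names are invented, while IRQ_WORK_INIT()/IRQ_WORK_INIT_HARD(), irq_work_queue() and irq_work_sync() are existing helpers), the distinction this change keys off is established at initialisation time:

/* Illustrative sketch, not code from this patch. */
#include <linux/irq_work.h>

static void my_hard_cb(struct irq_work *work)
{
        /* IRQ_WORK_HARD_IRQ: runs from hard interrupt context even on PREEMPT_RT */
}

static void my_normal_cb(struct irq_work *work)
{
        /* without the flag, PREEMPT_RT defers this out of hard interrupt context */
}

static struct irq_work my_hard_work   = IRQ_WORK_INIT_HARD(my_hard_cb);
static struct irq_work my_normal_work = IRQ_WORK_INIT(my_normal_cb);

static void my_queue_and_sync(void)
{
        irq_work_queue(&my_hard_work);
        irq_work_queue(&my_normal_work);

        /*
         * With this patch, waiting on the !HARD item uses rcuwait on
         * PREEMPT_RT, so the caller may sleep; the HARD item is still
         * spin-waited on architectures with an irq_work interrupt.
         */
        irq_work_sync(&my_hard_work);
        irq_work_sync(&my_normal_work);
}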
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211006111852.1514359-5-bigeasy@linutronix.de --- include/linux/irq_work.h | 5 +++++ kernel/irq_work.c | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index b48955e9c920..8cd11a223260 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -49,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; } +static inline bool irq_work_is_hard(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; +} + bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 90b6b56f92e9..f7df715ec28e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -217,7 +217,8 @@ void irq_work_single(void *arg) */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); - if (!arch_irq_work_has_interrupt()) + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) rcuwait_wake_up(&work->irqwait); } @@ -277,7 +278,8 @@ void irq_work_sync(struct irq_work *work) lockdep_assert_irqs_enabled(); might_sleep(); - if (!arch_irq_work_has_interrupt()) { + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); return; -- cgit v1.2.3 From e60b56e46b384cee1ad34e6adc164d883049c6c3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 19 Oct 2021 14:35:35 +0200 Subject: sched/fair: Wait before decaying max_newidle_lb_cost Decay max_newidle_lb_cost only when it has not been updated for a while and ensure to not decay a recently changed value. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20211019123537.17146-4-vincent.guittot@linaro.org --- include/linux/sched/topology.h | 2 +- kernel/sched/fair.c | 36 +++++++++++++++++++++++++++--------- kernel/sched/topology.c | 2 +- 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 2f9166f6dec8..c07bfa2d80f2 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -105,7 +105,7 @@ struct sched_domain { /* idle_balance() stats */ u64 max_newidle_lb_cost; - unsigned long next_decay_max_lb_cost; + unsigned long last_decay_max_lb_cost; u64 avg_scan_cost; /* select_idle_sibling */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c4c36865321b..e50fd751e1df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10239,6 +10239,30 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +{ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = jiffies; + } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + /* + * Decay the newidle max times by ~1% per second to ensure that + * it is not outdated and the current max cost is actually + * shorter. 
+ */ + sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; + sd->last_decay_max_lb_cost = jiffies; + + return true; + } + + return false; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -10262,14 +10286,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) for_each_domain(cpu, sd) { /* * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. + * visit to all the domains. */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } + need_decay = update_newidle_cost(sd, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -10911,8 +10930,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; + update_newidle_cost(sd, domain_cost); curr_cost += domain_cost; t0 = t1; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e81246787560..30169c7685b6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1568,7 +1568,7 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, + .last_decay_max_lb_cost = jiffies, .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, -- cgit v1.2.3
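For a rough sense of the decay rate adopted above (illustrative arithmetic, not part of the patch): a single decay step multiplies max_newidle_lb_cost by 253/256 ≈ 0.988, i.e. about 1.2% is shaved off each time a full second passes without the value being refreshed, and a stale cost therefore takes roughly ln(2) / ln(256/253) ≈ 59 seconds to halve. A cost that is still being updated is, with this change, not decayed at all.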