summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-01 13:48:52 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-01 13:48:52 -0700
commit9a7e0a90a454a7826ecbca055a6ec9271b70c686 (patch)
tree4e432bee81d5a7a480241db62edec2edc40069d5 /include
parent57a315cd7198907326e691cc909df2beebc2420d (diff)
parent8ea9183db4ad8afbcb7089a77c23eaf965b0cacd (diff)
downloadlinux-9a7e0a90a454a7826ecbca055a6ec9271b70c686.tar.bz2
Merge tag 'sched-core-2021-11-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner: - Revert the printk format based wchan() symbol resolution as it can leak the raw value in case that the symbol is not resolvable. - Make wchan() more robust and work with all kind of unwinders by enforcing that the task stays blocked while unwinding is in progress. - Prevent sched_fork() from accessing an invalid sched_task_group - Improve asymmetric packing logic - Extend scheduler statistics to RT and DL scheduling classes and add statistics for bandwith burst to the SCHED_FAIR class. - Properly account SCHED_IDLE entities - Prevent a potential deadlock when initial priority is assigned to a newly created kthread. A recent change to plug a race between cpuset and __sched_setscheduler() introduced a new lock dependency which is now triggered. Break the lock dependency chain by moving the priority assignment to the thread function. - Fix the idle time reporting in /proc/uptime for NOHZ enabled systems. - Improve idle balancing in general and especially for NOHZ enabled systems. - Provide proper interfaces for live patching so it does not have to fiddle with scheduler internals. - Add cluster aware scheduling support. - A small set of tweaks for RT (irqwork, wait_task_inactive(), various scheduler options and delaying mmdrop) - The usual small tweaks and improvements all over the place * tag 'sched-core-2021-11-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (69 commits) sched/fair: Cleanup newidle_balance sched/fair: Remove sysctl_sched_migration_cost condition sched/fair: Wait before decaying max_newidle_lb_cost sched/fair: Skip update_blocked_averages if we are defering load balance sched/fair: Account update_blocked_averages in newidle_balance cost x86: Fix __get_wchan() for !STACKTRACE sched,x86: Fix L2 cache mask sched/core: Remove rq_relock() sched: Improve wake_up_all_idle_cpus() take #2 irq_work: Also rcuwait for !IRQ_WORK_HARD_IRQ on PREEMPT_RT irq_work: Handle some irq_work in a per-CPU thread on PREEMPT_RT irq_work: Allow irq_work_sync() to sleep if irq_work() no IRQ support. sched/rt: Annotate the RT balancing logic irqwork as IRQ_WORK_HARD_IRQ sched: Add cluster scheduler level for x86 sched: Add cluster scheduler level in core and related Kconfig for ARM64 topology: Represent clusters of CPUs within a die sched: Disable -Wunused-but-set-variable sched: Add wrapper for get_wchan() to keep task blocked x86: Fix get_wchan() to support the ORC unwinder proc: Use task_is_running() for wchan in /proc/$pid/stat ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/acpi.h5
-rw-r--r--include/linux/arch_topology.h5
-rw-r--r--include/linux/irq_work.h8
-rw-r--r--include/linux/kernel_stat.h1
-rw-r--r--include/linux/mm_types.h4
-rw-r--r--include/linux/sched.h11
-rw-r--r--include/linux/sched/idle.h4
-rw-r--r--include/linux/sched/mm.h29
-rw-r--r--include/linux/sched/task.h3
-rw-r--r--include/linux/sched/topology.h9
-rw-r--r--include/linux/topology.h13
-rw-r--r--include/linux/wait.h3
12 files changed, 88 insertions, 7 deletions
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 974d497a897d..fbc2146050a4 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1353,6 +1353,7 @@ static inline int lpit_read_residency_count_address(u64 *address)
#ifdef CONFIG_ACPI_PPTT
int acpi_pptt_cpu_is_thread(unsigned int cpu);
int find_acpi_cpu_topology(unsigned int cpu, int level);
+int find_acpi_cpu_topology_cluster(unsigned int cpu);
int find_acpi_cpu_topology_package(unsigned int cpu);
int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
int find_acpi_cpu_cache_topology(unsigned int cpu, int level);
@@ -1365,6 +1366,10 @@ static inline int find_acpi_cpu_topology(unsigned int cpu, int level)
{
return -EINVAL;
}
+static inline int find_acpi_cpu_topology_cluster(unsigned int cpu)
+{
+ return -EINVAL;
+}
static inline int find_acpi_cpu_topology_package(unsigned int cpu)
{
return -EINVAL;
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index f180240dc95f..b97cea83b25e 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -62,10 +62,12 @@ void topology_set_thermal_pressure(const struct cpumask *cpus,
struct cpu_topology {
int thread_id;
int core_id;
+ int cluster_id;
int package_id;
int llc_id;
cpumask_t thread_sibling;
cpumask_t core_sibling;
+ cpumask_t cluster_sibling;
cpumask_t llc_sibling;
};
@@ -73,13 +75,16 @@ struct cpu_topology {
extern struct cpu_topology cpu_topology[NR_CPUS];
#define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id)
+#define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id)
#define topology_core_id(cpu) (cpu_topology[cpu].core_id)
#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
+#define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling)
#define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling)
void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
+const struct cpumask *cpu_clustergroup_mask(int cpu);
void update_siblings_masks(unsigned int cpu);
void remove_cpu_topology(unsigned int cpuid);
void reset_cpu_topology(void);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index ec2a47a81e42..8cd11a223260 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -3,6 +3,7 @@
#define _LINUX_IRQ_WORK_H
#include <linux/smp_types.h>
+#include <linux/rcuwait.h>
/*
* An entry can be in one of four states:
@@ -16,11 +17,13 @@
struct irq_work {
struct __call_single_node node;
void (*func)(struct irq_work *);
+ struct rcuwait irqwait;
};
#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \
.node = { .u_flags = (_flags), }, \
.func = (_func), \
+ .irqwait = __RCUWAIT_INITIALIZER(irqwait), \
}
#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0)
@@ -46,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work)
return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
}
+static inline bool irq_work_is_hard(struct irq_work *work)
+{
+ return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ;
+}
+
bool irq_work_queue(struct irq_work *work);
bool irq_work_queue_on(struct irq_work *work, int cpu);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44ae1a7eb9e3..69ae6b278464 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -102,6 +102,7 @@ extern void account_system_index_time(struct task_struct *, u64,
enum cpu_usage_stat);
extern void account_steal_time(u64);
extern void account_idle_time(u64);
+extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline void account_process_tick(struct task_struct *tsk, int user)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 82dab23205c3..f92686cf2cdb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
+#include <linux/rcupdate.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
@@ -649,6 +650,9 @@ struct mm_struct {
bool tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcu_head delayed_drop;
+#endif
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index caf7071fda87..f44e14c43b74 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -503,6 +503,8 @@ struct sched_statistics {
u64 block_start;
u64 block_max;
+ s64 sum_block_runtime;
+
u64 exec_max;
u64 slice_max;
@@ -522,7 +524,7 @@ struct sched_statistics {
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
#endif
-};
+} ____cacheline_aligned;
struct sched_entity {
/* For load-balancing: */
@@ -538,8 +540,6 @@ struct sched_entity {
u64 nr_migrations;
- struct sched_statistics statistics;
-
#ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
struct sched_entity *parent;
@@ -775,10 +775,10 @@ struct task_struct {
int normal_prio;
unsigned int rt_priority;
- const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
+ const struct sched_class *sched_class;
#ifdef CONFIG_SCHED_CORE
struct rb_node core_node;
@@ -803,6 +803,8 @@ struct task_struct {
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
+ struct sched_statistics stats;
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
@@ -2154,6 +2156,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
#endif /* CONFIG_SMP */
extern bool sched_task_on_rq(struct task_struct *p);
+extern unsigned long get_wchan(struct task_struct *p);
/*
* In order to reduce various lock holder preemption latencies provide an
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 22873d276be6..d73d314d59c6 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -11,7 +11,11 @@ enum cpu_idle_type {
CPU_MAX_IDLE_TYPES
};
+#ifdef CONFIG_SMP
extern void wake_up_if_idle(int cpu);
+#else
+static inline void wake_up_if_idle(int cpu) { }
+#endif
/*
* Idle thread specific functions to determine the need_resched
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 5561486fddef..aca874d33fe6 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,35 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is
+ * by far the least expensive way to do that.
+ */
+static inline void __mmdrop_delayed(struct rcu_head *rhp)
+{
+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
+
+ __mmdrop(mm);
+}
+
+/*
+ * Invoked from finish_task_switch(). Delegates the heavy lifting on RT
+ * kernels via RCU.
+ */
+static inline void mmdrop_sched(struct mm_struct *mm)
+{
+ /* Provides a full memory barrier. See mmdrop() */
+ if (atomic_dec_and_test(&mm->mm_count))
+ call_rcu(&mm->delayed_drop, __mmdrop_delayed);
+}
+#else
+static inline void mmdrop_sched(struct mm_struct *mm)
+{
+ mmdrop(mm);
+}
+#endif
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ef02be869cf2..ba88a6987400 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -54,7 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p);
+extern void sched_post_fork(struct task_struct *p,
+ struct kernel_clone_args *kargs);
extern void sched_dead(struct task_struct *p);
void __noreturn do_task_dead(void);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8f0f778b7c91..c07bfa2d80f2 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -42,6 +42,13 @@ static inline int cpu_smt_flags(void)
}
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+static inline int cpu_cluster_flags(void)
+{
+ return SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
#ifdef CONFIG_SCHED_MC
static inline int cpu_core_flags(void)
{
@@ -98,7 +105,7 @@ struct sched_domain {
/* idle_balance() stats */
u64 max_newidle_lb_cost;
- unsigned long next_decay_max_lb_cost;
+ unsigned long last_decay_max_lb_cost;
u64 avg_scan_cost; /* select_idle_sibling */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7634cd737061..0b3704ad13c8 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -186,6 +186,9 @@ static inline int cpu_to_mem(int cpu)
#ifndef topology_die_id
#define topology_die_id(cpu) ((void)(cpu), -1)
#endif
+#ifndef topology_cluster_id
+#define topology_cluster_id(cpu) ((void)(cpu), -1)
+#endif
#ifndef topology_core_id
#define topology_core_id(cpu) ((void)(cpu), 0)
#endif
@@ -195,6 +198,9 @@ static inline int cpu_to_mem(int cpu)
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu) cpumask_of(cpu)
#endif
+#ifndef topology_cluster_cpumask
+#define topology_cluster_cpumask(cpu) cpumask_of(cpu)
+#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu) cpumask_of(cpu)
#endif
@@ -206,6 +212,13 @@ static inline const struct cpumask *cpu_smt_mask(int cpu)
}
#endif
+#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask)
+static inline const struct cpumask *cpu_cluster_mask(int cpu)
+{
+ return topology_cluster_cpumask(cpu);
+}
+#endif
+
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 93dab0e9580f..2d0df57c9902 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -1160,6 +1160,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
(wait)->flags = 0; \
} while (0)
-bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg);
+typedef int (*task_call_f)(struct task_struct *p, void *arg);
+extern int task_call_func(struct task_struct *p, task_call_f func, void *arg);
#endif /* _LINUX_WAIT_H */