summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2020-12-14 18:29:11 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2020-12-14 18:29:11 -0800
commit adb35e8dc98ba9bda99ff79ac6a05b8fcde2a762 (patch)
tree ceb0334110d80b5a756764c3d089257c83faaec9 /include
parent 533369b145d8d1bc44b8ed7f0dd0ecffb16384cc (diff)
parent 5b78f2dc315354c05300795064f587366a02c6ff (diff)
download linux-adb35e8dc98ba9bda99ff79ac6a05b8fcde2a762.tar.bz2
Merge tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner:

 - migrate_disable/enable() support which originates from the RT tree
   and is now a prerequisite for the new preemptible kmap_local() API
   which aims to replace kmap_atomic().

 - A fair amount of topology and NUMA related improvements

 - Improvements for the frequency invariant calculations

 - Enhanced robustness for the global CPU priority tracking and decision
   making

 - The usual small fixes and enhancements all over the place

* tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
  sched/fair: Trivial correction of the newidle_balance() comment
  sched/fair: Clear SMT siblings after determining the core is not idle
  sched: Fix kernel-doc markup
  x86: Print ratio freq_max/freq_base used in frequency invariance calculations
  x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC
  x86, sched: Calculate frequency invariance for AMD systems
  irq_work: Optimize irq_work_single()
  smp: Cleanup smp_call_function*()
  irq_work: Cleanup
  sched: Limit the amount of NUMA imbalance that can exist at fork time
  sched/numa: Allow a floating imbalance between NUMA nodes
  sched: Avoid unnecessary calculation of load imbalance at clone time
  sched/numa: Rename nr_running and break out the magic number
  sched: Make migrate_disable/enable() independent of RT
  sched/topology: Condition EAS enablement on FIE support
  arm64: Rebuild sched domains on invariance status changes
  sched/topology,schedutil: Wrap sched domains rebuild
  sched/uclamp: Allow to reset a task uclamp constraint value
  sched/core: Fix typos in comments
  Documentation: scheduler: fix information on arch SD flags, sched_domain and sched_debug
  ...
Diffstat (limited to 'include')
-rw-r--r-- include/linux/cpuhotplug.h 1
-rw-r--r-- include/linux/cpumask.h 6
-rw-r--r-- include/linux/irq_work.h 33
-rw-r--r-- include/linux/irqflags.h 8
-rw-r--r-- include/linux/kernel.h 21
-rw-r--r-- include/linux/preempt.h 83
-rw-r--r-- include/linux/sched.h 5
-rw-r--r-- include/linux/sched/hotplug.h 2
-rw-r--r-- include/linux/sched/mm.h 5
-rw-r--r-- include/linux/sched/topology.h 8
-rw-r--r-- include/linux/smp.h 19
-rw-r--r-- include/linux/stop_machine.h 5
-rw-r--r-- include/uapi/linux/sched/types.h 2
13 files changed, 142 insertions, 56 deletions
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index bc56287a1ed1..0042ef362511 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -152,6 +152,7 @@ enum cpuhp_state {
CPUHP_AP_ONLINE,
CPUHP_TEARDOWN_CPU,
CPUHP_AP_ONLINE_IDLE,
+ CPUHP_AP_SCHED_WAIT_EMPTY,
CPUHP_AP_SMPBOOT_THREADS,
CPUHP_AP_X86_VDSO_VMA_ONLINE,
CPUHP_AP_IRQ_AFFINITY_ONLINE,
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index f0d895d6ac39..383684e30f12 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
return cpumask_next_and(-1, src1p, src2p);
}
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+ return cpumask_first(srcp);
+}
+
#define for_each_cpu(cpu, mask) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_not(cpu, mask) \
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);
int cpumask_any_and_distribute(const struct cpumask *src1p,
const struct cpumask *src2p);
+int cpumask_any_distribute(const struct cpumask *srcp);
/**
* for_each_cpu - iterate over every cpu in a mask
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 30823780c192..ec2a47a81e42 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -14,28 +14,37 @@
*/
struct irq_work {
- union {
- struct __call_single_node node;
- struct {
- struct llist_node llnode;
- atomic_t flags;
- };
- };
+ struct __call_single_node node;
void (*func)(struct irq_work *);
};
+#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \
+ .node = { .u_flags = (_flags), }, \
+ .func = (_func), \
+}
+
+#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0)
+#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY)
+#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ)
+
+#define DEFINE_IRQ_WORK(name, _f) \
+ struct irq_work name = IRQ_WORK_INIT(_f)
+
static inline
void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
{
- atomic_set(&work->flags, 0);
- work->func = func;
+ *work = IRQ_WORK_INIT(func);
}
-#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \
- .flags = ATOMIC_INIT(0), \
- .func = (_f) \
+static inline bool irq_work_is_pending(struct irq_work *work)
+{
+ return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING;
}
+static inline bool irq_work_is_busy(struct irq_work *work)
+{
+ return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
+}
bool irq_work_queue(struct irq_work *work);
bool irq_work_queue_on(struct irq_work *work, int cpu);
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 3ed4e8771b64..8de0e1373de7 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -107,14 +107,14 @@ do { \
current->irq_config = 0; \
} while (0)
-# define lockdep_irq_work_enter(__work) \
+# define lockdep_irq_work_enter(_flags) \
do { \
- if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
+ if (!((_flags) & IRQ_WORK_HARD_IRQ)) \
current->irq_config = 1; \
} while (0)
-# define lockdep_irq_work_exit(__work) \
+# define lockdep_irq_work_exit(_flags) \
do { \
- if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
+ if (!((_flags) & IRQ_WORK_HARD_IRQ)) \
current->irq_config = 0; \
} while (0)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4b5fd3da5fe8..dbf6018fc312 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -204,6 +204,7 @@ extern int _cond_resched(void);
extern void ___might_sleep(const char *file, int line, int preempt_offset);
extern void __might_sleep(const char *file, int line, int preempt_offset);
extern void __cant_sleep(const char *file, int line, int preempt_offset);
+extern void __cant_migrate(const char *file, int line);
/**
* might_sleep - annotation for functions that can sleep
@@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
# define cant_sleep() \
do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
# define sched_annotate_sleep() (current->task_state_change = 0)
+
+/**
+ * cant_migrate - annotation for functions that cannot migrate
+ *
+ * Will print a stack trace if executed in code which is migratable
+ */
+# define cant_migrate() \
+ do { \
+ if (IS_ENABLED(CONFIG_SMP)) \
+ __cant_migrate(__FILE__, __LINE__); \
+ } while (0)
+
/**
* non_block_start - annotate the start of section where sleeping is prohibited
*
@@ -251,6 +264,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
# define cant_sleep() do { } while (0)
+# define cant_migrate() do { } while (0)
# define sched_annotate_sleep() do { } while (0)
# define non_block_start() do { } while (0)
# define non_block_end() do { } while (0)
@@ -258,13 +272,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
-#ifndef CONFIG_PREEMPT_RT
-# define cant_migrate() cant_sleep()
-#else
- /* Placeholder for now */
-# define cant_migrate() do { } while (0)
-#endif
-
/**
* abs - return absolute value of an argument
* @x: the value. If it is unsigned type, it is converted to signed type first.
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7d9c1c0e149c..6df63cbe8bb0 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -322,34 +322,71 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
#endif
-/**
- * migrate_disable - Prevent migration of the current task
+#ifdef CONFIG_SMP
+
+/*
+ * Migrate-Disable and why it is undesired.
*
- * Maps to preempt_disable() which also disables preemption. Use
- * migrate_disable() to annotate that the intent is to prevent migration,
- * but not necessarily preemption.
+ * When a preempted task becomes eligible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
*
- * Can be invoked nested like preempt_disable() and needs the corresponding
- * number of migrate_enable() invocations.
- */
-static __always_inline void migrate_disable(void)
-{
- preempt_disable();
-}
-
-/**
- * migrate_enable - Allow migration of the current task
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority task gains reduced wake-up latency; with preempt_disable()
+ * it would have had to wait for the lower priority task.
+ *
+ * - a lower priority task, which under preempt_disable() could've instantly
+ * migrated away when another CPU becomes available, is now constrained
+ * by the ability to push the higher priority task away, which might itself be
+ * in a migrate_disable() section, reducing its available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
+ *
+ *
+ * The reason we have it anyway.
*
- * Counterpart to migrate_disable().
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migrate_disable() to restore this implicit
+ * assumption.
*
- * As migrate_disable() can be invoked nested, only the outermost invocation
- * reenables migration.
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on arbitrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depend on
+ * migration, so migrate_disable() is not only a problem for
+ * work-conserving schedulers.
*
- * Currently mapped to preempt_enable().
*/
-static __always_inline void migrate_enable(void)
-{
- preempt_enable();
-}
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#else
+
+static inline void migrate_disable(void) { }
+static inline void migrate_enable(void) { }
+
+#endif /* CONFIG_SMP */
#endif /* __LINUX_PREEMPT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd4225b73a1f..7b6fc4a1a963 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -723,6 +723,11 @@ struct task_struct {
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t cpus_mask;
+ void *migration_pending;
+#ifdef CONFIG_SMP
+ unsigned short migration_disabled;
+#endif
+ unsigned short migration_flags;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
index 9a62ffdd296f..412cdaba33eb 100644
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
extern int sched_cpu_deactivate(unsigned int cpu);
#ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
extern int sched_cpu_dying(unsigned int cpu);
#else
+# define sched_cpu_wait_empty NULL
# define sched_cpu_dying NULL
#endif
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a9a403..a91fb3ad9ec7 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
extern void membarrier_exec_mmap(struct mm_struct *mm);
+extern void membarrier_update_current_mm(struct mm_struct *next_mm);
+
#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
+static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+}
#endif
#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 9ef7bf686a9f..8f0f778b7c91 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
#endif /* !CONFIG_SMP */
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+extern void rebuild_sched_domains_energy(void);
+#else
+static inline void rebuild_sched_domains_energy(void)
+{
+}
+#endif
+
#ifndef arch_scale_cpu_capacity
/**
* arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9f13966d3d92..70c6f6284dcf 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -21,24 +21,23 @@ typedef bool (*smp_cond_func_t)(int cpu, void *info);
* structure shares (partial) layout with struct irq_work
*/
struct __call_single_data {
- union {
- struct __call_single_node node;
- struct {
- struct llist_node llist;
- unsigned int flags;
-#ifdef CONFIG_64BIT
- u16 src, dst;
-#endif
- };
- };
+ struct __call_single_node node;
smp_call_func_t func;
void *info;
};
+#define CSD_INIT(_func, _info) \
+ (struct __call_single_data){ .func = (_func), .info = (_info), }
+
/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
typedef struct __call_single_data call_single_data_t
__aligned(sizeof(struct __call_single_data));
+#define INIT_CSD(_csd, _func, _info) \
+do { \
+ *(_csd) = CSD_INIT((_func), (_info)); \
+} while (0)
+
/*
* Enqueue a llist_node on the call_single_queue; be very careful, read
* flush_smp_call_function_queue() in detail.
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 76d8b09384a7..30577c3aecf8 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
struct cpu_stop_work {
struct list_head list; /* cpu_stopper->works */
cpu_stop_fn_t fn;
+ unsigned long caller;
void *arg;
struct cpu_stop_done *done;
};
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
void stop_machine_unpark(int cpu);
void stop_machine_yield(const struct cpumask *cpumask);
+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
#else /* CONFIG_SMP */
#include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
return false;
}
+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
#endif /* CONFIG_SMP */
/*
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index c852153ddb0d..f2c4589d4dbf 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -96,6 +96,8 @@ struct sched_param {
* on a CPU with a capacity big enough to fit the specified value.
* A task with a max utilization value smaller than 1024 is more likely
* scheduled on a CPU with no more capacity than the specified value.
+ *
+ * A task utilization boundary can be reset by setting the attribute to -1.
*/
struct sched_attr {
__u32 size;