From 01b1d88b09824bea1a75b0bac04dcf50d9893875 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:38 +0200 Subject: rcu: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the conditionals in RCU to use CONFIG_PREEMPTION. That's the first step towards RCU on RT. The further tweaks are work in progress. This neither touches the selftest bits which need a closer look by Paul. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.210156346@linutronix.de Signed-off-by: Ingo Molnar --- arch/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/Kconfig b/arch/Kconfig index a7b57dd42c26..c7efbc018f4f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -103,7 +103,7 @@ config STATIC_KEYS_SELFTEST config OPTPROBES def_bool y depends on KPROBES && HAVE_OPTPROBES - select TASKS_RCU if PREEMPT + select TASKS_RCU if PREEMPTION config KPROBES_ON_FTRACE def_bool y -- cgit v1.2.3 From 48593975aeee548f25e256c515fd1d1e3fb2cc20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:42 +0200 Subject: x86: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the entry code, preempt and kprobes conditionals over to CONFIG_PREEMPTION. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.608488448@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 6 +++--- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/entry/thunk_32.S | 2 +- arch/x86/entry/thunk_64.S | 4 ++-- arch/x86/include/asm/preempt.h | 2 +- arch/x86/kernel/kprobes/core.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4f86928246e7..f83ca5aa8b77 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -63,7 +63,7 @@ * enough to patch inline, increasing performance. */ -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) @@ -1084,7 +1084,7 @@ restore_all: INTERRUPT_RETURN restore_all_kernel: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0, PER_CPU_VAR(__preempt_count) jnz .Lno_preempt @@ -1364,7 +1364,7 @@ ENTRY(xen_hypervisor_callback) ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION call xen_maybe_preempt_hcall #endif jmp ret_from_intr diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3f5a978a02a7..9701464341e4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -662,7 +662,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) /* Returning to kernel space */ retint_kernel: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* Interrupts are off */ /* Check if we need preemption */ btl $9, EFLAGS(%rsp) /* were interrupts off? */ @@ -1113,7 +1113,7 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ call xen_evtchn_do_upcall LEAVE_IRQ_STACK -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION call xen_maybe_preempt_hcall #endif jmp error_exit diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index cb3464525b37..2713490611a3 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -34,7 +34,7 @@ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION THUNK ___preempt_schedule, preempt_schedule THUNK ___preempt_schedule_notrace, preempt_schedule_notrace EXPORT_SYMBOL(___preempt_schedule) diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index cc20465b2867..ea5c4167086c 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -46,7 +46,7 @@ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION THUNK ___preempt_schedule, preempt_schedule THUNK ___preempt_schedule_notrace, preempt_schedule_notrace EXPORT_SYMBOL(___preempt_schedule) @@ -55,7 +55,7 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ - || defined(CONFIG_PREEMPT) + || defined(CONFIG_PREEMPTION) .L_restore: popq %r11 popq %r10 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 99a7fa9ab0a3..3d4cb83a8828 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -102,7 +102,7 @@ static __always_inline bool should_resched(int preempt_offset) return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION extern asmlinkage void ___preempt_schedule(void); # define __preempt_schedule() \ asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0e0b08008b5a..43fc13c831af 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -580,7 +580,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, if (setup_detour_execution(p, regs, reenter)) return; -#if !defined(CONFIG_PREEMPT) +#if !defined(CONFIG_PREEMPTION) if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) -- cgit v1.2.3 From cb376c26971ff54f25980ec1f0ae2f06d6a69df0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:43 +0200 Subject: x86/dumpstack: Indicate PREEMPT_RT in dumps Stack dumps print whether the kernel has preemption enabled or not. Extend it so a PREEMPT_RT enabled kernel can be identified. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.699136351@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2b5886401e5f..e07424e19274 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -367,13 +367,18 @@ NOKPROBE_SYMBOL(oops_end); int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + /* Save the regs of the first oops for the executive summary later. */ if (!die_counter) exec_summary_regs = *regs; + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk(KERN_DEFAULT "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", -- cgit v1.2.3 From 09c7e8b21d67c3c78ab9701dbc0fb1e9f14a0ba5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:44 +0200 Subject: x86/kvm: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the conditional for async pagefaults to use CONFIG_PREEMPTION. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.789755413@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b7f34fe2171e..3d07f84c4846 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -314,7 +314,7 @@ static void kvm_guest_cpu_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif pa |= KVM_ASYNC_PF_ENABLED; -- cgit v1.2.3 From a2cbfd46559e809c8165773b7fe8afa058b35414 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 8 Aug 2019 20:53:00 +0100 Subject: arch, ia64: Make NUMA select SMP While it does make sense to allow CONFIG_NUMA and !CONFIG_SMP in theory, it doesn't make much sense in practice. Follow other architectures and make CONFIG_NUMA select CONFIG_SMP. The motivation for this patch is to allow a new NUMA variable to be initialised in kernel/sched/topology.c. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: Suravee.Suthikulpanit@amd.com Cc: Thomas Gleixner Cc: Thomas.Lendacky@amd.com Cc: Tony Luck Link: https://lkml.kernel.org/r/20190808195301.13222-2-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/ia64/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 7468d8e50467..997baba02b70 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -389,6 +389,7 @@ config NUMA depends on !IA64_HP_SIM && !FLATMEM default y if IA64_SGI_SN2 select ACPI_NUMA if ACPI + select SMP help Say Y to compile the kernel to support NUMA (Non-Uniform Memory Access). This option is for configuring high-end multiprocessor -- cgit v1.2.3 From a55c7454a8c887b226a01d7eed088ccb5374d81e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 8 Aug 2019 20:53:01 +0100 Subject: sched/topology: Improve load balancing on AMD EPYC systems SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init() for any sched domains with a NUMA distance greater than 2 hops (RECLAIM_DISTANCE). The idea being that it's expensive to balance across domains that far apart. However, as is rather unfortunately explained in: commit 32e45ff43eaf ("mm: increase RECLAIM_DISTANCE to 30") the value for RECLAIM_DISTANCE is based on node distance tables from 2011-era hardware. Current AMD EPYC machines have the following NUMA node distances: node distances: node 0 1 2 3 4 5 6 7 0: 10 16 16 16 32 32 32 32 1: 16 10 16 16 32 32 32 32 2: 16 16 10 16 32 32 32 32 3: 16 16 16 10 32 32 32 32 4: 32 32 32 32 10 16 16 16 5: 32 32 32 32 16 10 16 16 6: 32 32 32 32 16 16 10 16 7: 32 32 32 32 16 16 16 10 where 2 hops is 32. The result is that the scheduler fails to load balance properly across NUMA nodes on different sockets -- 2 hops apart. For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4 (CPUs 32-39) like so, $ numactl -C 0-7,32-39 ./spinner 16 causes all threads to fork and remain on node 0 until the active balancer kicks in after a few seconds and forcibly moves some threads to node 4. Override node_reclaim_distance for AMD Zen. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Suravee.Suthikulpanit@amd.com Cc: Thomas Gleixner Cc: Thomas.Lendacky@amd.com Cc: Tony Luck Link: https://lkml.kernel.org/r/20190808195301.13222-3-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 5 +++++ include/linux/topology.h | 14 ++++++++++++++ kernel/sched/topology.c | 3 ++- mm/khugepaged.c | 2 +- mm/page_alloc.c | 2 +- 5 files changed, 23 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8d4e50428b68..ceeb8afc7cf3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); +#ifdef CONFIG_NUMA + node_reclaim_distance = 32; +#endif + /* * Fix erratum 1076: CPB feature bit not being set in CPUID. * Always set it, except when running under a hypervisor. diff --git a/include/linux/topology.h b/include/linux/topology.h index 47a3e3c08036..579522ec446c 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -59,6 +59,20 @@ int arch_update_cpu_topology(void); */ #define RECLAIM_DISTANCE 30 #endif + +/* + * The following tunable allows platforms to override the default node + * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are + * sufficiently fast that the default value actually hurts + * performance. + * + * AMD EPYC machines use this because even though the 2-hop distance + * is 32 (3.2x slower than a local memory access) performance actually + * *improves* if allowed to reclaim memory and load balance tasks + * between NUMA nodes 2-hops apart. + */ +extern int __read_mostly node_reclaim_distance; + #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8f83e8e3ea9a..b5667a273bf6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1284,6 +1284,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; +int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; #endif /* @@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { sd->flags &= ~(SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index eaaa21b23215..ccede2425c3f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid) for (i = 0; i < MAX_NUMNODES; i++) { if (!khugepaged_node_load[i]) continue; - if (node_distance(nid, i) > RECLAIM_DISTANCE) + if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 272c6de1bf4e..0d54cd2c43a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3522,7 +3522,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= - RECLAIM_DISTANCE; + node_reclaim_distance; } #else /* CONFIG_NUMA */ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) -- cgit v1.2.3