From b5636d45aae42aa345b4c7918bdef245ed63da68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:41 +0200 Subject: x86/cpu: Remove segment load from switch_to_new_gdt() On 32bit FS and on 64bit GS segments are already set up correctly, but load_percpu_segment() still sets [FG]S after switching from the early GDT to the direct GDT. For 32bit the segment load has no side effects, but on 64bit it causes GSBASE to become 0, which means that any per CPU access before GSBASE is set to the new value is going to fault. That's the reason why the whole file containing this code has stackprotector removed. But that's a pointless exercise for both 32 and 64 bit as the relevant segment selector is already correct. Loading the new GDT does not change that. Remove the segment loads and add comments. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.097052006@infradead.org --- arch/x86/kernel/cpu/common.c | 47 +++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3e508f239098..c09abee6f4d5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -701,16 +701,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); -void load_percpu_segment(int cpu) -{ -#ifdef CONFIG_X86_32 - loadsegment(fs, __KERNEL_PERCPU); -#else - __loadsegment_simple(gs, 0); - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#endif -} - #ifdef CONFIG_X86_32 /* The 32-bit entry code needs to find cpu_entry_area. */ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); @@ -738,16 +728,41 @@ void load_fixmap_gdt(int cpu) } EXPORT_SYMBOL_GPL(load_fixmap_gdt); -/* - * Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. +/** + * switch_to_new_gdt - Switch form early GDT to the direct one + * @cpu: The CPU number for which this is invoked + * + * Invoked during early boot to switch from early GDT and early per CPU + * (%fs on 32bit, GS_BASE on 64bit) to the direct GDT and the runtime per + * CPU area. */ void switch_to_new_gdt(int cpu) { - /* Load the original GDT */ load_direct_gdt(cpu); - /* Reload the per-cpu base */ - load_percpu_segment(cpu); + +#ifdef CONFIG_X86_64 + /* + * No need to load %gs. It is already correct. + * + * Writing %gs on 64bit would zero GSBASE which would make any per + * CPU operation up to the point of the wrmsrl() fault. + * + * Set GSBASE to the new offset. Until the wrmsrl() happens the + * early mapping is still valid. That means the GSBASE update will + * lose any prior per CPU data which was not copied over in + * setup_per_cpu_areas(). + */ + wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); +#else + /* + * %fs is already set to __KERNEL_PERCPU, but after switching GDT + * it is required to load FS again so that the 'hidden' part is + * updated from the new GDT. Up to this point the early per CPU + * translation is active. Any content of the early per CPU data + * which was not copied over in setup_per_cpu_areas() is lost. + */ + loadsegment(fs, __KERNEL_PERCPU); +#endif } static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; -- cgit v1.2.3 From 1f19e2d50baf6515991844eaa8a84a0b0037da70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:42 +0200 Subject: x86/cpu: Get rid of redundant switch_to_new_gdt() invocations The only place where switch_to_new_gdt() is required is early boot to switch from the early GDT to the direct GDT. Any other invocation is completely redundant because it does not change anything. Secondary CPUs come out of the ASM code with GDT and GSBASE correctly set up. The same is true for XEN_PV. Remove all the voodoo invocations which are left overs from the ancient past, rename the function to switch_gdt_and_percpu_base() and mark it init. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.198076128@infradead.org --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 17 ++++++----------- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 6 +++++- arch/x86/xen/enlighten_pv.c | 2 +- 5 files changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e21ec970d41a..c660700ecfc6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -667,7 +667,7 @@ extern int sysenter_setup(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; -extern void switch_to_new_gdt(int); +extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void cpu_init(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c09abee6f4d5..f51928dd275a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -729,14 +729,15 @@ void load_fixmap_gdt(int cpu) EXPORT_SYMBOL_GPL(load_fixmap_gdt); /** - * switch_to_new_gdt - Switch form early GDT to the direct one + * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base * @cpu: The CPU number for which this is invoked * - * Invoked during early boot to switch from early GDT and early per CPU - * (%fs on 32bit, GS_BASE on 64bit) to the direct GDT and the runtime per - * CPU area. + * Invoked during early boot to switch from early GDT and early per CPU to + * the direct GDT and the runtime per CPU area. On 32-bit the percpu base + * switch is implicit by loading the direct GDT. On 64bit this requires + * to update GSBASE. */ -void switch_to_new_gdt(int cpu) +void __init switch_gdt_and_percpu_base(int cpu) { load_direct_gdt(cpu); @@ -2263,12 +2264,6 @@ void cpu_init(void) boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - switch_to_new_gdt(cpu); - if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 49325caa7307..555089a5b446 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -211,7 +211,7 @@ void __init setup_per_cpu_areas(void) * area. Reload any changed state for the boot CPU. */ if (!cpu) - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); } /* indicate the early static arrays will soon be gone */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3f3ea0287f69..ce8728d2e5ef 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1453,7 +1453,11 @@ void arch_thaw_secondary_cpus_end(void) void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - switch_to_new_gdt(me); + + /* SMP handles this from setup_per_cpu_areas() */ + if (!IS_ENABLED(CONFIG_SMP)) + switch_gdt_and_percpu_base(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); cpu_set_state_online(me); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f82857e48815..9b892079581b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1209,7 +1209,7 @@ static void __init xen_setup_gdt(int cpu) pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; pv_ops.cpu.load_gdt = xen_load_gdt_boot; - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; pv_ops.cpu.load_gdt = xen_load_gdt; -- cgit v1.2.3 From 2cb15faaedeb67f52f2ddc32b5ca152acfc422c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:43 +0200 Subject: x86/cpu: Re-enable stackprotector Commit 5416c2663517 ("x86: make sure load_percpu_segment has no stackprotector") disabled the stackprotector for cpu/common.c because of load_percpu_segment(). Back then the boot stack canary was initialized very early in start_kernel(). Switching the per CPU area by loading the GDT caused the stackprotector to fail with paravirt enabled kernels as the GSBASE was not updated yet. In hindsight a wrong change because it would have been sufficient to ensure that the canary is the same in both per CPU areas. Commit d55535232c3d ("random: move rand_initialize() earlier") moved the stack canary initialization to a later point in the init sequence. As a consequence the per CPU stack canary is 0 when switching the per CPU areas, so there is no requirement anymore to exclude this file. Add a comment to load_percpu_segment(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.303010511@infradead.org --- arch/x86/kernel/cpu/Makefile | 3 --- arch/x86/kernel/cpu/common.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index f10a921ee756..d7e3ceaf75c1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,9 +17,6 @@ KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n -# Make sure load_percpu_segment has no stackprotector -CFLAGS_common.o := -fno-stack-protector - obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f51928dd275a..8e873181759a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -752,6 +752,9 @@ void __init switch_gdt_and_percpu_base(int cpu) * early mapping is still valid. That means the GSBASE update will * lose any prior per CPU data which was not copied over in * setup_per_cpu_areas(). + * + * This works even with stackprotector enabled because the + * per CPU stack canary is 0 in both per CPU areas. */ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #else -- cgit v1.2.3 From e57ef2ed97c1d078973298658a8096644a1e9e09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:01 +0200 Subject: x86: Put hot per CPU variables into a struct The layout of per-cpu variables is at the mercy of the compiler. This can lead to random performance fluctuations from build to build. Create a structure to hold some of the hottest per-cpu variables, starting with current_task. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.179707194@infradead.org --- arch/x86/include/asm/current.h | 19 ++++++++++++++++--- arch/x86/kernel/cpu/common.c | 14 +++++--------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 5 files changed, 24 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 3e204e6140b5..63c42ac3cd86 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -3,16 +3,29 @@ #define _ASM_X86_CURRENT_H #include -#include #ifndef __ASSEMBLY__ + +#include +#include + struct task_struct; -DECLARE_PER_CPU(struct task_struct *, current_task); +struct pcpu_hot { + union { + struct { + struct task_struct *current_task; + }; + u8 pad[64]; + }; +}; +static_assert(sizeof(struct pcpu_hot) == 64); + +DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); static __always_inline struct task_struct *get_current(void) { - return this_cpu_read_stable(current_task); + return this_cpu_read_stable(pcpu_hot.current_task); } #define current get_current() diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8e873181759a..52071539a14c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2012,18 +2012,16 @@ static __init int setup_clearcpuid(char *arg) } __setup("clearcpuid=", setup_clearcpuid); +DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { + .current_task = &init_task, +}; +EXPORT_PER_CPU_SYMBOL(pcpu_hot); + #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); -/* - * The following percpu variables are hot. Align current_task to - * cacheline size such that they fall in the same cacheline. - */ -DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = - &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); @@ -2083,8 +2081,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2f314b170c9f..807da45d84c7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - this_cpu_write(current_task, next_p); + raw_cpu_write(pcpu_hot.current_task, next_p); switch_fpu_finish(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6b3418bff326..c4f6cacf6599 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -617,7 +617,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - this_cpu_write(current_task, next_p); + raw_cpu_write(pcpu_hot.current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce8728d2e5ef..05f315777691 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1046,7 +1046,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); - per_cpu(current_task, cpu) = idle; + per_cpu(pcpu_hot.current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ -- cgit v1.2.3 From 64701838bf0575ef8acb1ad2db5934e864f3e6c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:02 +0200 Subject: x86/percpu: Move preempt_count next to current_task Add preempt_count to pcpu_hot, since it is once of the most used per-cpu variables. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.284170644@infradead.org --- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/preempt.h | 27 ++++++++++++++------------- arch/x86/kernel/cpu/common.c | 8 +------- 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 63c42ac3cd86..0f4b46293c6c 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -15,6 +15,7 @@ struct pcpu_hot { union { struct { struct task_struct *current_task; + int preempt_count; }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 5f6daea1ee24..2d13f25b1bd8 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -4,11 +4,11 @@ #include #include +#include + #include #include -DECLARE_PER_CPU(int, __preempt_count); - /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -24,7 +24,7 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) @@ -32,10 +32,10 @@ static __always_inline void preempt_count_set(int pc) int old, new; do { - old = raw_cpu_read_4(__preempt_count); + old = raw_cpu_read_4(pcpu_hot.preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); + } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old); } /* @@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ + per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* @@ -58,17 +58,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -77,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - raw_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(pcpu_hot.preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - raw_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(pcpu_hot.preempt_count, -val); } /* @@ -92,7 +92,8 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + __percpu_arg([var])); } /* @@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(int preempt_offset) { - return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); + return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52071539a14c..cafb6bd90d10 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2014,6 +2014,7 @@ __setup("clearcpuid=", setup_clearcpuid); DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .current_task = &init_task, + .preempt_count = INIT_PREEMPT_COUNT, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); @@ -2022,13 +2023,9 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); - DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; static void wrmsrl_cstar(unsigned long val) @@ -2081,9 +2078,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - /* * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find * the top of the kernel stack. Use an extra percpu variable to track the -- cgit v1.2.3 From c063a217bc0726c2560138229de5673dbb253a02 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:04 +0200 Subject: x86/percpu: Move current_top_of_stack next to current_task Extend the struct pcpu_hot cacheline with current_top_of_stack; another very frequently used value. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.493038635@infradead.org --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 6 +++--- arch/x86/entry/entry_64_compat.S | 6 +++--- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/processor.h | 4 +--- arch/x86/kernel/asm-offsets.c | 2 ++ arch/x86/kernel/cpu/common.c | 12 +----------- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- 11 files changed, 19 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e309e7156038..91397f58ac30 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1181,7 +1181,7 @@ SYM_CODE_START(asm_exc_nmi) * is using the thread stack right now, so it's safe for us to use it. */ movl %esp, %ebx - movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp call exc_nmi movl %ebx, %esp @@ -1243,7 +1243,7 @@ SYM_CODE_START(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movl PER_CPU_VAR(cpu_current_top_of_stack), %esi + movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp call make_task_dead diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e635f962afb8..9249a45cf53f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64) /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -1209,7 +1209,7 @@ SYM_CODE_START(asm_exc_nmi) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ @@ -1525,7 +1525,7 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movq PER_CPU_VAR(cpu_current_top_of_stack), %rax + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4dd19819053a..1dfee868d4a1 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -58,7 +58,7 @@ SYM_CODE_START(entry_SYSENTER_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax popq %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ @@ -191,7 +191,7 @@ SYM_CODE_START(entry_SYSCALL_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* Switch to the kernel stack */ - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -332,7 +332,7 @@ SYM_CODE_START(entry_INT80_compat) ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV movq %rsp, %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp pushq 5*8(%rax) /* regs->ss */ pushq 4*8(%rax) /* regs->rsp */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 8ac6589e9a1b..2dd013128f1e 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -17,6 +17,7 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; + unsigned long top_of_stack; }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c660700ecfc6..c345f3096c80 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -426,8 +426,6 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); - #ifdef CONFIG_X86_64 struct fixed_percpu_data { /* @@ -566,7 +564,7 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. */ - return this_cpu_read_stable(cpu_current_top_of_stack); + return this_cpu_read_stable(pcpu_hot.top_of_stack); } static __always_inline bool on_thread_stack(void) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cb50589a7102..a9824318e1c5 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -109,6 +109,8 @@ static void __used common(void) OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); + OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); + if (IS_ENABLED(CONFIG_KVM_INTEL)) { BLANK(); OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cafb6bd90d10..408245c2eead 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2015,6 +2015,7 @@ __setup("clearcpuid=", setup_clearcpuid); DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .current_task = &init_task, .preempt_count = INIT_PREEMPT_COUNT, + .top_of_stack = TOP_OF_INIT_STACK, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); @@ -2026,8 +2027,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; - static void wrmsrl_cstar(unsigned long val) { /* @@ -2078,15 +2077,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -/* - * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find - * the top of the kernel stack. Use an extra percpu variable to track the - * top of the kernel stack directly. - */ -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); - #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 807da45d84c7..470c128759ea 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -191,13 +191,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0 and cpu_current_top_of_stack. This changes + * Reload esp0 and pcpu_hot.top_of_stack. This changes * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ update_task_stack(next_p); refresh_sysenter_cs(next); - this_cpu_write(cpu_current_top_of_stack, + this_cpu_write(pcpu_hot.top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c4f6cacf6599..7f807e8bc923 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -618,7 +618,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. */ raw_cpu_write(pcpu_hot.current_task, next_p); - this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 05f315777691..87863a93e918 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1056,7 +1056,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 178015a820f0..7ac19aba8983 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -851,7 +851,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; @@ -869,7 +869,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { - sp = this_cpu_read(cpu_current_top_of_stack); + sp = this_cpu_read(pcpu_hot.top_of_stack); goto sync; } -- cgit v1.2.3 From d7b6d709a76a4f4ef3108ac41e1b39eb80f5c084 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:05 +0200 Subject: x86/percpu: Move irq_stack variables next to current_task Further extend struct pcpu_hot with the hard and soft irq stack pointers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.599170752@infradead.org --- arch/x86/include/asm/current.h | 6 ++++++ arch/x86/include/asm/irq_stack.h | 12 ++++++------ arch/x86/include/asm/processor.h | 4 ---- arch/x86/kernel/cpu/common.c | 3 --- arch/x86/kernel/dumpstack_32.c | 4 ++-- arch/x86/kernel/dumpstack_64.c | 2 +- arch/x86/kernel/irq_32.c | 13 +++++-------- arch/x86/kernel/irq_64.c | 6 +++--- arch/x86/kernel/process_64.c | 2 +- 9 files changed, 24 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 2dd013128f1e..ac3090ddf34e 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -18,6 +18,12 @@ struct pcpu_hot { int preempt_count; int cpu_number; unsigned long top_of_stack; + void *hardirq_stack_ptr; +#ifdef CONFIG_X86_64 + bool hardirq_stack_inuse; +#else + void *softirq_stack_ptr; +#endif }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 147cb8fdda92..798183867d78 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -116,7 +116,7 @@ ASM_CALL_ARG2 #define call_on_irqstack(func, asm_call, argconstr...) \ - call_on_stack(__this_cpu_read(hardirq_stack_ptr), \ + call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \ func, asm_call, argconstr) /* Macros to assert type correctness for run_*_on_irqstack macros */ @@ -135,7 +135,7 @@ * User mode entry and interrupt on the irq stack do not \ * switch stacks. If from user mode the task stack is empty. \ */ \ - if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ + if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \ irq_enter_rcu(); \ func(c_args); \ irq_exit_rcu(); \ @@ -146,9 +146,9 @@ * places. Invoke the stack switch macro with the call \ * sequence which matches the above direct invocation. \ */ \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(func, asm_call, constr); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } \ } @@ -212,9 +212,9 @@ */ #define do_softirq_own_stack() \ { \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c345f3096c80..bdde68744eb3 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -448,8 +448,6 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(void *, hardirq_stack_ptr); -DECLARE_PER_CPU(bool, hardirq_stack_inuse); extern asmlinkage void ignore_sysret(void); /* Save actual FS/GS selectors and bases to current->thread */ @@ -458,8 +456,6 @@ void current_save_fsgs(void); #ifdef CONFIG_STACKPROTECTOR DECLARE_PER_CPU(unsigned long, __stack_chk_guard); #endif -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* !X86_64 */ struct perf_event; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 408245c2eead..2bec4b4b2c50 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2024,9 +2024,6 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); -DEFINE_PER_CPU(void *, hardirq_stack_ptr); -DEFINE_PER_CPU(bool, hardirq_stack_inuse); - static void wrmsrl_cstar(unsigned long val) { /* diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 722fd712e1cf..b4905d5173fd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6c5defd6569a..f05339fee778 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 01833ebf5e8e..dc1049c01f9b 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -52,9 +52,6 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -77,7 +74,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -115,7 +112,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) int node = cpu_to_node(cpu); struct page *ph, *ps; - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); @@ -127,8 +124,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return -ENOMEM; } - per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); - per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); return 0; } @@ -138,7 +135,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1c0fb96b9e39..fe0c859873d1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -50,7 +50,7 @@ static int map_irq_stack(unsigned int cpu) return -ENOMEM; /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -63,14 +63,14 @@ static int map_irq_stack(unsigned int cpu) void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif int irq_init_percpu_irqstack(unsigned int cpu) { - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7f807e8bc923..1312de5b76aa 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -563,7 +563,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(hardirq_stack_inuse)); + this_cpu_read(pcpu_hot.hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); -- cgit v1.2.3 From 931ab63664f02b17d2213ef36b83e1e50190a0aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:14 +0200 Subject: x86/ibt: Implement FineIBT Implement an alternative CFI scheme that merges both the fine-grained nature of kCFI but also takes full advantage of the coarse grained hardware CFI as provided by IBT. To contrast: kCFI is a pure software CFI scheme and relies on being able to read text -- specifically the instruction *before* the target symbol, and does the hash validation *before* doing the call (otherwise control flow is compromised already). FineIBT is a software and hardware hybrid scheme; by ensuring every branch target starts with a hash validation it is possible to place the hash validation after the branch. This has several advantages: o the (hash) load is avoided; no memop; no RX requirement. o IBT WAIT-FOR-ENDBR state is a speculation stop; by placing the hash validation in the immediate instruction after the branch target there is a minimal speculation window and the whole is a viable defence against SpectreBHB. o Kees feels obliged to mention it is slightly more vulnerable when the attacker can write code. Obviously this patch relies on kCFI, but additionally it also relies on the padding from the call-depth-tracking patches. It uses this padding to place the hash-validation while the call-sites are re-written to modify the indirect target to be 16 bytes in front of the original target, thus hitting this new preamble. Notably, there is no hardware that needs call-depth-tracking (Skylake) and supports IBT (Tigerlake and onwards). Suggested-by: Joao Moreira (Intel) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20221027092842.634714496@infradead.org --- arch/um/kernel/um_arch.c | 5 + arch/x86/Kconfig | 14 +- arch/x86/Makefile | 2 +- arch/x86/include/asm/alternative.h | 2 + arch/x86/include/asm/linkage.h | 6 +- arch/x86/kernel/alternative.c | 253 +++++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/module.c | 20 ++- arch/x86/kernel/vmlinux.lds.S | 9 ++ include/linux/bpf.h | 2 +- scripts/Makefile.lib | 1 + 11 files changed, 294 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 8adf8e89b255..786b44dc20c9 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -444,6 +444,11 @@ void apply_returns(s32 *start, s32 *end) { } +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ +} + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 32818aa1dca4..479ee63898f5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2463,17 +2463,27 @@ config FUNCTION_PADDING_BYTES default FUNCTION_PADDING_CFI if CFI_CLANG default FUNCTION_ALIGNMENT +config CALL_PADDING + def_bool n + depends on CC_HAS_ENTRY_PADDING && OBJTOOL + select FUNCTION_ALIGNMENT_16B + +config FINEIBT + def_bool y + depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE + select CALL_PADDING + config HAVE_CALL_THUNKS def_bool y depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL config CALL_THUNKS def_bool n - select FUNCTION_ALIGNMENT_16B + select CALL_PADDING config PREFIX_SYMBOLS def_bool y - depends on CALL_THUNKS && !CFI_CLANG + depends on CALL_PADDING && !CFI_CLANG menuconfig SPECULATION_MITIGATIONS bool "Mitigations for speculative execution vulnerabilities" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1640e005092b..a3a07df8a609 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,7 @@ ifdef CONFIG_SLS KBUILD_CFLAGS += -mharden-sls=all endif -ifdef CONFIG_CALL_THUNKS +ifdef CONFIG_CALL_PADDING PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES) KBUILD_CFLAGS += $(PADDING_CFLAGS) export PADDING_CFLAGS diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 664c0779375c..7659217f4d49 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -78,6 +78,8 @@ extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); extern void apply_returns(s32 *start, s32 *end); extern void apply_ibt_endbr(s32 *start, s32 *end); +extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, + s32 *start_cfi, s32 *end_cfi); struct module; struct paravirt_patch_site; diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 45e0df850645..dd9b8118f784 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -15,7 +15,7 @@ #define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) -#if defined(CONFIG_CALL_THUNKS) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90; #else #define FUNCTION_PADDING @@ -57,7 +57,7 @@ #endif /* __ASSEMBLY__ */ /* - * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_THUNKS) the + * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the * CFI symbol layout changes. * * Without CALL_THUNKS: @@ -81,7 +81,7 @@ * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized. */ -#ifdef CONFIG_CALL_THUNKS +#ifdef CONFIG_CALL_PADDING #define CFI_PRE_PADDING #define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; #else diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b4ac4e58c010..91b0e63a6238 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -116,6 +116,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; @@ -656,6 +657,28 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT +static void poison_endbr(void *addr, bool warn) +{ + u32 endbr, poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + return; + + if (!is_endbr(endbr)) { + WARN_ON_ONCE(warn); + return; + } + + DPRINTK("ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); +} + /* * Generated by: objtool --ibt */ @@ -664,31 +687,232 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) s32 *s; for (s = start; s < end; s++) { - u32 endbr, poison = gen_endbr_poison(); void *addr = (void *)s + *s; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) - continue; + poison_endbr(addr, true); + if (IS_ENABLED(CONFIG_FINEIBT)) + poison_endbr(addr - 16, false); + } +} + +#else + +void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } + +#endif /* CONFIG_X86_KERNEL_IBT */ + +#ifdef CONFIG_FINEIBT +/* + * kCFI FineIBT + * + * __cfi_\func: __cfi_\func: + * movl $0x12345678,%eax // 5 endbr64 // 4 + * nop subl $0x12345678,%r10d // 7 + * nop jz 1f // 2 + * nop ud2 // 2 + * nop 1: nop // 1 + * nop + * nop + * nop + * nop + * nop + * nop + * nop + * + * + * caller: caller: + * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 + * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * je 1f // 2 nop4 // 4 + * ud2 // 2 + * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * + */ + +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + " je fineibt_preamble_end \n" + " ud2 \n" + " nop \n" + "fineibt_preamble_end: \n" + ".popsection\n" +); + +extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_end[]; + +#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_hash 7 + +asm( ".pushsection .rodata \n" + "fineibt_caller_start: \n" + " movl $0x12345678, %r10d \n" + " sub $16, %r11 \n" + ASM_NOP4 + "fineibt_caller_end: \n" + ".popsection \n" +); + +extern u8 fineibt_caller_start[]; +extern u8 fineibt_caller_end[]; + +#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) +#define fineibt_caller_hash 2 + +#define fineibt_caller_jmp (fineibt_caller_size - 2) + +static u32 decode_preamble_hash(void *addr) +{ + u8 *p = addr; + + /* b8 78 56 34 12 mov $0x12345678,%eax */ + if (p[0] == 0xb8) + return *(u32 *)(addr + 1); + + return 0; /* invalid hash value */ +} + +static u32 decode_caller_hash(void *addr) +{ + u8 *p = addr; + + /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + if (p[0] == 0x41 && p[1] == 0xba) + return -*(u32 *)(addr + 2); + + /* e8 0c 78 56 34 12 jmp.d8 +12 */ + if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) + return -*(u32 *)(addr + 2); + + return 0; /* invalid hash value */ +} + +/* .retpoline_sites */ +static int cfi_disable_callers(s32 *start, s32 *end) +{ + /* + * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate + * in tact for later usage. Also see decode_caller_hash() and + * cfi_rewrite_callers(). + */ + const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; + s32 *s; - if (WARN_ON_ONCE(!is_endbr(endbr))) + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + text_poke_early(addr, jmp, 2); + } - /* - * When we have IBT, the lack of ENDBR will trigger #CP - */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + return 0; +} + +/* .cfi_sites */ +static int cfi_rewrite_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); } + + return 0; +} + +/* .retpoline_sites */ +static int cfi_rewrite_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + } + /* rely on apply_retpolines() */ + } + + return 0; +} + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ + int ret; + + if (WARN_ONCE(fineibt_preamble_size != 16, + "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) + return; + + if (!HAS_KERNEL_IBT || !cpu_feature_enabled(X86_FEATURE_IBT)) + return; + + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. + */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Using FineIBT CFI\n"); + + return; + +err: + pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } #else -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ +} -#endif /* CONFIG_X86_KERNEL_IBT */ +#endif + +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ + return __apply_fineibt(start_retpoline, end_retpoline, + start_cfi, end_cfi, + /* .builtin = */ false); +} #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, @@ -996,6 +1220,9 @@ void __init alternative_instructions(void) */ apply_paravirt(__parainstructions, __parainstructions_end); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, + __cfi_sites, __cfi_sites_end, true); + /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2bec4b4b2c50..423a760fa9de 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -609,6 +609,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) if (!ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); + wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); return; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 2fb9de2cef40..0142982e94c5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -255,7 +255,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, - *calls = NULL; + *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -277,6 +277,8 @@ int module_finalize(const Elf_Ehdr *hdr, returns = s; if (!strcmp(".call_sites", secstrings + s->sh_name)) calls = s; + if (!strcmp(".cfi_sites", secstrings + s->sh_name)) + cfi = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) ibt_endbr = s; } @@ -289,6 +291,22 @@ int module_finalize(const Elf_Ehdr *hdr, void *pseg = (void *)para->sh_addr; apply_paravirt(pseg, pseg + para->sh_size); } + if (retpolines || cfi) { + void *rseg = NULL, *cseg = NULL; + unsigned int rsize = 0, csize = 0; + + if (retpolines) { + rseg = (void *)retpolines->sh_addr; + rsize = retpolines->sh_size; + } + + if (cfi) { + cseg = (void *)cfi->sh_addr; + csize = cfi->sh_size; + } + + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 49f3f86433c7..2e0ee14229bf 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -309,6 +309,15 @@ SECTIONS } #endif +#ifdef CONFIG_FINEIBT + . = ALIGN(8); + .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) { + __cfi_sites = .; + *(.cfi_sites) + __cfi_sites_end = .; + } +#endif + /* * struct alt_inst entries. From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5296aea9b5b4..923a3d508047 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -984,7 +984,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } #ifdef CONFIG_X86_64 -#ifdef CONFIG_CALL_THUNKS +#ifdef CONFIG_CALL_PADDING #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5+CONFIG_FUNCTION_PADDING_BYTES,CONFIG_FUNCTION_PADDING_BYTES))) #else #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 2e03bcbf2b9b..2b2fab705a63 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -256,6 +256,7 @@ objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr objtool-args-$(CONFIG_CALL_DEPTH_TRACKING) += --hacks=skylake objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt +objtool-args-$(CONFIG_FINEIBT) += --cfi objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount objtool-args-$(CONFIG_UNWINDER_ORC) += --orc objtool-args-$(CONFIG_RETPOLINE) += --retpoline -- cgit v1.2.3