From b5636d45aae42aa345b4c7918bdef245ed63da68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:41 +0200 Subject: x86/cpu: Remove segment load from switch_to_new_gdt() On 32bit FS and on 64bit GS segments are already set up correctly, but load_percpu_segment() still sets [FG]S after switching from the early GDT to the direct GDT. For 32bit the segment load has no side effects, but on 64bit it causes GSBASE to become 0, which means that any per CPU access before GSBASE is set to the new value is going to fault. That's the reason why the whole file containing this code has stackprotector removed. But that's a pointless exercise for both 32 and 64 bit as the relevant segment selector is already correct. Loading the new GDT does not change that. Remove the segment loads and add comments. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.097052006@infradead.org --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/cpu/common.c | 47 ++++++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 67c9d73b31fa..e21ec970d41a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -670,7 +670,6 @@ extern struct desc_ptr early_gdt_descr; extern void switch_to_new_gdt(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); -extern void load_percpu_segment(int); extern void cpu_init(void); extern void cpu_init_secondary(void); extern void cpu_init_exception_handling(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3e508f239098..c09abee6f4d5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -701,16 +701,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); -void load_percpu_segment(int cpu) -{ -#ifdef CONFIG_X86_32 - loadsegment(fs, __KERNEL_PERCPU); -#else - __loadsegment_simple(gs, 0); - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#endif -} - #ifdef CONFIG_X86_32 /* The 32-bit entry code needs to find cpu_entry_area. */ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); @@ -738,16 +728,41 @@ void load_fixmap_gdt(int cpu) } EXPORT_SYMBOL_GPL(load_fixmap_gdt); -/* - * Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. +/** + * switch_to_new_gdt - Switch form early GDT to the direct one + * @cpu: The CPU number for which this is invoked + * + * Invoked during early boot to switch from early GDT and early per CPU + * (%fs on 32bit, GS_BASE on 64bit) to the direct GDT and the runtime per + * CPU area. */ void switch_to_new_gdt(int cpu) { - /* Load the original GDT */ load_direct_gdt(cpu); - /* Reload the per-cpu base */ - load_percpu_segment(cpu); + +#ifdef CONFIG_X86_64 + /* + * No need to load %gs. It is already correct. + * + * Writing %gs on 64bit would zero GSBASE which would make any per + * CPU operation up to the point of the wrmsrl() fault. + * + * Set GSBASE to the new offset. Until the wrmsrl() happens the + * early mapping is still valid. That means the GSBASE update will + * lose any prior per CPU data which was not copied over in + * setup_per_cpu_areas(). 
+	 */
+	wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#else
+	/*
+	 * %fs is already set to __KERNEL_PERCPU, but after switching GDT
+	 * it is required to load FS again so that the 'hidden' part is
+	 * updated from the new GDT. Up to this point the early per CPU
+	 * translation is active. Any content of the early per CPU data
+	 * which was not copied over in setup_per_cpu_areas() is lost.
+	 */
+	loadsegment(fs, __KERNEL_PERCPU);
+#endif
 }
 
 static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
--
cgit v1.2.3


From 1f19e2d50baf6515991844eaa8a84a0b0037da70 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 15 Sep 2022 13:10:42 +0200
Subject: x86/cpu: Get rid of redundant switch_to_new_gdt() invocations

The only place where switch_to_new_gdt() is required is early boot to
switch from the early GDT to the direct GDT. Any other invocation is
completely redundant because it does not change anything.

Secondary CPUs come out of the ASM code with GDT and GSBASE correctly
set up. The same is true for XEN_PV.

Remove all the voodoo invocations which are leftovers from the ancient
past, rename the function to switch_gdt_and_percpu_base() and mark it
__init.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20220915111143.198076128@infradead.org
---
 arch/x86/include/asm/processor.h |  2 +-
 arch/x86/kernel/cpu/common.c     | 17 ++++++-----------
 arch/x86/kernel/setup_percpu.c   |  2 +-
 arch/x86/kernel/smpboot.c        |  6 +++++-
 arch/x86/xen/enlighten_pv.c      |  2 +-
 5 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e21ec970d41a..c660700ecfc6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -667,7 +667,7 @@ extern int sysenter_setup(void);
 /* Defined in head.S */
 extern struct desc_ptr	early_gdt_descr;
 
-extern void switch_to_new_gdt(int);
+extern void switch_gdt_and_percpu_base(int);
 extern void load_direct_gdt(int);
 extern void load_fixmap_gdt(int);
 extern void cpu_init(void);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c09abee6f4d5..f51928dd275a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -729,14 +729,15 @@ void load_fixmap_gdt(int cpu)
 EXPORT_SYMBOL_GPL(load_fixmap_gdt);
 
 /**
- * switch_to_new_gdt - Switch form early GDT to the direct one
+ * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base
  * @cpu: The CPU number for which this is invoked
  *
- * Invoked during early boot to switch from early GDT and early per CPU
- * (%fs on 32bit, GS_BASE on 64bit) to the direct GDT and the runtime per
- * CPU area.
+ * Invoked during early boot to switch from early GDT and early per CPU to
+ * the direct GDT and the runtime per CPU area. On 32-bit the percpu base
+ * switch is implicit by loading the direct GDT. On 64bit this requires
+ * to update GSBASE.
*/ -void switch_to_new_gdt(int cpu) +void __init switch_gdt_and_percpu_base(int cpu) { load_direct_gdt(cpu); @@ -2263,12 +2264,6 @@ void cpu_init(void) boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - switch_to_new_gdt(cpu); - if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 49325caa7307..555089a5b446 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -211,7 +211,7 @@ void __init setup_per_cpu_areas(void) * area. Reload any changed state for the boot CPU. */ if (!cpu) - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); } /* indicate the early static arrays will soon be gone */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3f3ea0287f69..ce8728d2e5ef 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1453,7 +1453,11 @@ void arch_thaw_secondary_cpus_end(void) void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - switch_to_new_gdt(me); + + /* SMP handles this from setup_per_cpu_areas() */ + if (!IS_ENABLED(CONFIG_SMP)) + switch_gdt_and_percpu_base(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); cpu_set_state_online(me); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f82857e48815..9b892079581b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1209,7 +1209,7 @@ static void __init xen_setup_gdt(int cpu) pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; pv_ops.cpu.load_gdt = xen_load_gdt_boot; - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; pv_ops.cpu.load_gdt = xen_load_gdt; -- cgit v1.2.3 From 2cb15faaedeb67f52f2ddc32b5ca152acfc422c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:43 +0200 Subject: x86/cpu: Re-enable stackprotector Commit 5416c2663517 ("x86: make sure load_percpu_segment has no stackprotector") disabled the stackprotector for cpu/common.c because of load_percpu_segment(). Back then the boot stack canary was initialized very early in start_kernel(). Switching the per CPU area by loading the GDT caused the stackprotector to fail with paravirt enabled kernels as the GSBASE was not updated yet. In hindsight a wrong change because it would have been sufficient to ensure that the canary is the same in both per CPU areas. Commit d55535232c3d ("random: move rand_initialize() earlier") moved the stack canary initialization to a later point in the init sequence. As a consequence the per CPU stack canary is 0 when switching the per CPU areas, so there is no requirement anymore to exclude this file. Add a comment to load_percpu_segment(). 
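
[ Note: the comment actually lands in switch_gdt_and_percpu_base();
  load_percpu_segment() itself was removed by the first patch of this
  series. ]

Why an identical canary value (0 in this case) in both per CPU areas is
sufficient: -fstack-protector snapshots the guard in the prologue and
compares it against the current guard in the epilogue, so the check can
only fire if the value visible to the function changes between the two
reads. A minimal userspace sketch of that pattern; the names below are
stand-ins, not kernel symbols:

	#include <assert.h>
	#include <string.h>

	/* stand-in for the per CPU canary slot (GS-relative in the kernel) */
	static unsigned long stack_chk_guard_demo;	/* 0 in both "areas" */

	static void protected_demo(void)
	{
		unsigned long canary = stack_chk_guard_demo;	/* prologue */
		char buf[16];

		memset(buf, 0, sizeof(buf));			/* body */

		/* epilogue: fails only if the guard changed underneath */
		assert(canary == stack_chk_guard_demo);
	}

	int main(void)
	{
		protected_demo();	/* passes: guard is 0 before and after */
		return 0;
	}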
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.303010511@infradead.org --- arch/x86/kernel/cpu/Makefile | 3 --- arch/x86/kernel/cpu/common.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index f10a921ee756..d7e3ceaf75c1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,9 +17,6 @@ KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n -# Make sure load_percpu_segment has no stackprotector -CFLAGS_common.o := -fno-stack-protector - obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f51928dd275a..8e873181759a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -752,6 +752,9 @@ void __init switch_gdt_and_percpu_base(int cpu) * early mapping is still valid. That means the GSBASE update will * lose any prior per CPU data which was not copied over in * setup_per_cpu_areas(). + * + * This works even with stackprotector enabled because the + * per CPU stack canary is 0 in both per CPU areas. */ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #else -- cgit v1.2.3 From 4c4eb3ecc91f4fee6d6bf7cfbc1e21f2e38d19ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:44 +0200 Subject: x86/modules: Set VM_FLUSH_RESET_PERMS in module_alloc() Instead of resetting permissions all over the place when freeing module memory tell the vmalloc code to do so. Avoids the exercise for the next upcoming user. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.406703869@infradead.org --- arch/x86/kernel/ftrace.c | 2 -- arch/x86/kernel/kprobes/core.c | 1 - arch/x86/kernel/module.c | 9 +++++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bd165004776d..00eac455a3a1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -413,8 +413,6 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; - set_vm_flush_reset_perms(trampoline); - if (likely(system_state != SYSTEM_BOOTING)) set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index eb8bc82846b9..01b8d956aa76 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -414,7 +414,6 @@ void *alloc_insn_page(void) if (!page) return NULL; - set_vm_flush_reset_perms(page); /* * First make the page read-only, and only then make it executable to * prevent it from being W+X in between. 
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index c032edcd3d95..43f011277219 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -74,10 +74,11 @@ void *module_alloc(unsigned long size)
 		return NULL;
 
 	p = __vmalloc_node_range(size, MODULE_ALIGN,
-				    MODULES_VADDR + get_module_load_offset(),
-				    MODULES_END, gfp_mask,
-				    PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
-				    __builtin_return_address(0));
+				 MODULES_VADDR + get_module_load_offset(),
+				 MODULES_END, gfp_mask, PAGE_KERNEL,
+				 VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
+				 NUMA_NO_NODE, __builtin_return_address(0));
+
 	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
 		vfree(p);
 		return NULL;
--
cgit v1.2.3


From b26d66f8dace32c46ce58147002964ce8cdfde5f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 15 Sep 2022 13:10:45 +0200
Subject: x86/vdso: Ensure all kernel code is seen by objtool

extable.c is kernel code and not part of the VDSO.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20220915111143.512144110@infradead.org
---
 arch/x86/entry/vdso/Makefile | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 3e88b9df8c8f..3ef611044c8f 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -33,11 +33,12 @@ vobjs32-y += vdso32/vclock_gettime.o
 vobjs-$(CONFIG_X86_SGX)	+= vsgx.o
 
 # files to link into kernel
-obj-y				+= vma.o extable.o
-KASAN_SANITIZE_vma.o		:= y
-UBSAN_SANITIZE_vma.o		:= y
-KCSAN_SANITIZE_vma.o		:= y
-OBJECT_FILES_NON_STANDARD_vma.o	:= n
+obj-y					+= vma.o extable.o
+KASAN_SANITIZE_vma.o			:= y
+UBSAN_SANITIZE_vma.o			:= y
+KCSAN_SANITIZE_vma.o			:= y
+OBJECT_FILES_NON_STANDARD_vma.o		:= n
+OBJECT_FILES_NON_STANDARD_extable.o	:= n
 
 # vDSO images to build
 vdso_img-$(VDSO64-y)		+= 64
--
cgit v1.2.3


From 24a9c543d2114d416f84e386c2fa90089bd97e4c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 15 Sep 2022 13:10:46 +0200
Subject: x86: Sanitize linker script

The section ordering in the text section is more than suboptimal:

    ALIGN_ENTRY_TEXT_BEGIN
    ENTRY_TEXT
    ALIGN_ENTRY_TEXT_END
    SOFTIRQENTRY_TEXT
    STATIC_CALL_TEXT
    INDIRECT_THUNK_TEXT

ENTRY_TEXT is in a separate PMD so it can be mapped into the cpu entry
area when KPTI is enabled. That means the sections after it are also in
a separate PMD. That's wasteful, especially as the indirect thunk text
is a hotpath on retpoline enabled systems and the static call text is
fairly hot on 32bit.

Move the entry text section last so that the other sections share a PMD
with the text before it. This is obviously just best effort and not
guaranteed when the previous text is just at a PMD boundary.

The text section placement needs an overhaul in general. There is e.g.
no point in having debugfs, sysfs, cpuhotplug and other rarely used
functions next to hot path text.
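
To make the PMD sharing argument concrete, a small userspace sketch;
the 2 MiB PMD size matches x86_64, and the two addresses are invented
stand-ins for values one might read out of 'nm vmlinux':

	#include <stdio.h>

	#define PMD_SIZE	(2UL << 20)	/* 2 MiB on x86_64 */

	static int same_pmd(unsigned long a, unsigned long b)
	{
		return (a & ~(PMD_SIZE - 1)) == (b & ~(PMD_SIZE - 1));
	}

	int main(void)
	{
		/* hypothetical symbol addresses from a vmlinux build */
		unsigned long lock_text      = 0xffffffff81e00040UL;
		unsigned long indirect_thunk = 0xffffffff81e4c000UL;

		/* with ENTRY_TEXT moved last, hot sections can land in
		 * the same 2 MiB mapping as the text preceding them */
		printf("share a PMD: %s\n",
		       same_pmd(lock_text, indirect_thunk) ? "yes" : "no");
		return 0;
	}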
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.614728935@infradead.org --- arch/x86/kernel/vmlinux.lds.S | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 15f29053cec4..0e9fc080c417 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -132,18 +132,19 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT - ALIGN_ENTRY_TEXT_BEGIN - ENTRY_TEXT - ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT - STATIC_CALL_TEXT - *(.gnu.warning) - #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; *(.text.__x86.*) __indirect_thunk_end = .; #endif + STATIC_CALL_TEXT + + ALIGN_ENTRY_TEXT_BEGIN + ENTRY_TEXT + ALIGN_ENTRY_TEXT_END + *(.gnu.warning) + } :text =0xcccc /* End of text section, which should occupy whole number of pages */ -- cgit v1.2.3 From d49a0626216b95cd4bf696f6acf55f39a16ab0bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:10:47 +0200 Subject: arch: Introduce CONFIG_FUNCTION_ALIGNMENT Generic function-alignment infrastructure. Architectures can select FUNCTION_ALIGNMENT_xxB symbols; the FUNCTION_ALIGNMENT symbol is then set to the largest such selected size, 0 otherwise. From this the -falign-functions compiler argument and __ALIGN macro are set. This incorporates the DEBUG_FORCE_FUNCTION_ALIGN_64B knob and future alignment requirements for x86_64 (later in this series) into a single place. NOTE: also removes the 0x90 filler byte from the generic __ALIGN primitive, that value makes no sense outside of x86. NOTE: .balign 0 reverts to a no-op. Requested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.719248727@infradead.org --- Makefile | 4 ++-- arch/Kconfig | 24 ++++++++++++++++++++++++ arch/ia64/Kconfig | 1 + arch/ia64/Makefile | 2 +- arch/x86/Kconfig | 2 ++ arch/x86/boot/compressed/head_64.S | 8 ++++++++ arch/x86/include/asm/linkage.h | 4 +--- include/asm-generic/vmlinux.lds.h | 4 ++-- include/linux/linkage.h | 4 ++-- lib/Kconfig.debug | 1 + 10 files changed, 44 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index f41ec8c8426b..141e1bcc0671 100644 --- a/Makefile +++ b/Makefile @@ -1004,8 +1004,8 @@ KBUILD_CFLAGS += $(CC_FLAGS_CFI) export CC_FLAGS_CFI endif -ifdef CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B -KBUILD_CFLAGS += -falign-functions=64 +ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0) +KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT) endif # arch Makefile may override CC so keep this after arch Makefile is included diff --git a/arch/Kconfig b/arch/Kconfig index 8f138e580d1a..402580253802 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1428,4 +1428,28 @@ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" +config FUNCTION_ALIGNMENT_4B + bool + +config FUNCTION_ALIGNMENT_8B + bool + +config FUNCTION_ALIGNMENT_16B + bool + +config FUNCTION_ALIGNMENT_32B + bool + +config FUNCTION_ALIGNMENT_64B + bool + +config FUNCTION_ALIGNMENT + int + default 64 if FUNCTION_ALIGNMENT_64B + default 32 if FUNCTION_ALIGNMENT_32B + default 16 if FUNCTION_ALIGNMENT_16B + default 8 if FUNCTION_ALIGNMENT_8B + default 4 if FUNCTION_ALIGNMENT_4B + default 0 + endmenu diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index c6e06cdc738f..d7e4a24e8644 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -63,6 +63,7 @@ config IA64 select NUMA if !FLATMEM select PCI_MSI_ARCH_FALLBACKS if PCI_MSI select ZONE_DMA32 + select 
FUNCTION_ALIGNMENT_32B default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 56c4bb276b6e..d553ab7022fe 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -23,7 +23,7 @@ KBUILD_AFLAGS_KERNEL := -mconstant-gp EXTRA := cflags-y := -pipe $(EXTRA) -ffixed-r13 -mfixed-range=f12-f15,f32-f127 \ - -falign-functions=32 -frename-registers -fno-optimize-sibling-calls + -frename-registers -fno-optimize-sibling-calls KBUILD_CFLAGS_KERNEL := -mconstant-gp GAS_STATUS = $(shell $(srctree)/arch/ia64/scripts/check-gas "$(CC)" "$(OBJDUMP)") diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6d1879ef933a..f408fa87ed94 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -290,6 +290,8 @@ config X86 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX + select FUNCTION_ALIGNMENT_16B if X86_64 || X86_ALIGNMENT_16 + select FUNCTION_ALIGNMENT_4B imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d33f060900d2..190b803eb787 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -37,6 +37,14 @@ #include #include "pgtable.h" +/* + * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result + * in assembly errors due to trying to move .org backward due to the excessive + * alignment. + */ +#undef __ALIGN +#define __ALIGN .balign 16, 0x90 + /* * Locally defined symbols should be marked hidden: */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index f484d656d34e..9ee0e2851742 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -14,10 +14,8 @@ #ifdef __ASSEMBLY__ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16) -#define __ALIGN .p2align 4, 0x90 +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) -#endif #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c15de165ec8f..335b5711a7ed 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -81,8 +81,8 @@ #define RO_EXCEPTION_TABLE #endif -/* Align . to a 8 byte boundary equals to maximum function alignment. */ -#define ALIGN_FUNCTION() . = ALIGN(8) +/* Align . function alignment. */ +#define ALIGN_FUNCTION() . 
= ALIGN(CONFIG_FUNCTION_ALIGNMENT) /* * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 1feab6136b5b..5c8865bb59d9 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -69,8 +69,8 @@ #endif #ifndef __ALIGN -#define __ALIGN .align 4,0x90 -#define __ALIGN_STR ".align 4,0x90" +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT +#define __ALIGN_STR __stringify(__ALIGN) #endif #ifdef __ASSEMBLY__ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3fc7abffc7aa..e90dc6738534 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -467,6 +467,7 @@ config SECTION_MISMATCH_WARN_ONLY config DEBUG_FORCE_FUNCTION_ALIGN_64B bool "Force all function address 64B aligned" depends on EXPERT && (X86_64 || ARM64 || PPC32 || PPC64 || ARC) + select FUNCTION_ALIGNMENT_64B help There are cases that a commit from one domain changes the function address alignment of other domains, and cause magic performance -- cgit v1.2.3 From 8eb5d34e77c63fde8af21c691bcf6e3cd87f7829 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:48 +0200 Subject: x86/asm: Differentiate between code and function alignment Create SYM_F_ALIGN to differentiate alignment requirements between SYM_CODE and SYM_FUNC. This distinction is useful later when adding padding in front of functions; IOW this allows following the compiler's patchable-function-entry option. [peterz: Changelog] Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.824822743@infradead.org --- arch/x86/include/asm/linkage.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 9ee0e2851742..c2d6e2733b11 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -12,11 +12,15 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ -#ifdef __ASSEMBLY__ - #define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) +#define ASM_FUNC_ALIGN __ALIGN_STR +#define __FUNC_ALIGN __ALIGN +#define SYM_F_ALIGN __FUNC_ALIGN + +#ifdef __ASSEMBLY__ + #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk #else /* CONFIG_RETPOLINE */ @@ -55,7 +59,7 @@ /* SYM_FUNC_START -- use for global functions */ #define SYM_FUNC_START(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */ @@ -65,7 +69,7 @@ /* SYM_FUNC_START_LOCAL -- use for local functions */ #define SYM_FUNC_START_LOCAL(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */ @@ -75,7 +79,7 @@ /* SYM_FUNC_START_WEAK -- use for weak functions */ #define SYM_FUNC_START_WEAK(name) \ - SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */ -- cgit v1.2.3 From 1934dc9a8a92fda36ab80cfd55edab1708dcdf9a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:10:49 +0200 Subject: x86/error_inject: Align function properly Ensure inline asm functions are consistently aligned with compiler generated and SYM_FUNC_START*() 
functions. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.930201368@infradead.org --- arch/x86/lib/error-inject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c index 1e3de0769b81..b5a6d83106bc 100644 --- a/arch/x86/lib/error-inject.c +++ b/arch/x86/lib/error-inject.c @@ -11,6 +11,7 @@ asm( ".text\n" ".type just_return_func, @function\n" ".globl just_return_func\n" + ASM_FUNC_ALIGN "just_return_func:\n" ANNOTATE_NOENDBR ASM_RET -- cgit v1.2.3 From 1d293758e548aa6ff65e4dd3f5a9bc2a34b38ce3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:50 +0200 Subject: x86/paravirt: Properly align PV functions Ensure inline asm functions are consistently aligned with compiler generated and SYM_FUNC_START*() functions. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220915111144.038540008@infradead.org --- arch/x86/include/asm/paravirt.h | 1 + arch/x86/include/asm/qspinlock_paravirt.h | 2 +- arch/x86/kernel/kvm.c | 1 + arch/x86/kernel/paravirt.c | 2 ++ 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2a0b8dd4ec33..1be66c15ecbd 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -665,6 +665,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ + ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 60ece592b220..082551b3c75e 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -40,7 +40,7 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); asm (".pushsection .spinlock.text;" ".globl " PV_UNLOCK ";" ".type " PV_UNLOCK ", @function;" - ".align 4,0x90;" + ASM_FUNC_ALIGN PV_UNLOCK ": " ASM_ENDBR FRAME_BEGIN diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d4e48b4a438b..95fb85bea111 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -802,6 +802,7 @@ asm( ".pushsection .text;" ".global __raw_callee_save___kvm_vcpu_is_preempted;" ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +ASM_FUNC_ALIGN "__raw_callee_save___kvm_vcpu_is_preempted:" ASM_ENDBR "movq __per_cpu_offset(,%rdi,8), %rax;" diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 7ca2d46c08cc..e244c49b52d7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -40,6 +40,7 @@ extern void _paravirt_nop(void); asm (".pushsection .entry.text, \"ax\"\n" ".global _paravirt_nop\n" + ASM_FUNC_ALIGN "_paravirt_nop:\n\t" ASM_ENDBR ASM_RET @@ -50,6 +51,7 @@ asm (".pushsection .entry.text, \"ax\"\n" /* stub always returning 0. */ asm (".pushsection .entry.text, \"ax\"\n" ".global paravirt_ret0\n" + ASM_FUNC_ALIGN "paravirt_ret0:\n\t" ASM_ENDBR "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" -- cgit v1.2.3 From 67e93ddd5d0b84ac17bddb13d98533e425282421 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:51 +0200 Subject: x86/entry: Align SYM_CODE_START() variants Explicitly align a bunch of commonly called SYM_CODE_START() symbols. 
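
The __irqentry_text_* markers touched below exist so that unwinders and
stack tracers can do address range checks; switching their ".align 16"
to __ALIGN keeps them consistent with CONFIG_FUNCTION_ALIGNMENT. A
sketch of how such markers are typically consumed (an illustration, not
a quote of a specific kernel function):

	#include <linux/types.h>

	extern char __irqentry_text_start[], __irqentry_text_end[];

	/* classify an address against the irqentry text range */
	static bool in_irqentry_text_demo(unsigned long addr)
	{
		return addr >= (unsigned long)__irqentry_text_start &&
		       addr <  (unsigned long)__irqentry_text_end;
	}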
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.144068841@infradead.org --- arch/x86/entry/entry_64.S | 16 ++++++++++------ arch/x86/entry/thunk_64.S | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9953d966d124..e635f962afb8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -284,7 +284,8 @@ SYM_FUNC_END(__switch_to_asm) * r12: kernel thread arg */ .pushsection .text, "ax" -SYM_CODE_START(ret_from_fork) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(ret_from_fork) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR // copy_thread movq %rax, %rdi @@ -600,13 +601,13 @@ SYM_CODE_END(\asmsym) * shared between 32 and 64 bit and emit the __irqentry_text_* markers * so the stacktrace boundary checks work. */ - .align 16 + __ALIGN .globl __irqentry_text_start __irqentry_text_start: #include - .align 16 + __ALIGN .globl __irqentry_text_end __irqentry_text_end: ANNOTATE_NOENDBR @@ -828,7 +829,8 @@ EXPORT_SYMBOL(asm_load_gs_index) * * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs) */ -SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback) + __FUNC_ALIGN +SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback) /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will @@ -856,7 +858,8 @@ SYM_CODE_END(exc_xen_hypervisor_callback) * We distinguish between categories by comparing each saved segment register * with its current contents: any discrepancy means we in category 1. */ -SYM_CODE_START(xen_failsafe_callback) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(xen_failsafe_callback) UNWIND_HINT_EMPTY ENDBR movl %ds, %ecx @@ -1516,7 +1519,8 @@ SYM_CODE_END(ignore_sysret) #endif .pushsection .text, "ax" -SYM_CODE_START(rewind_stack_and_make_dead) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index f38b07d2768b..5e37f41e5f14 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -11,7 +11,7 @@ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func -SYM_FUNC_START_NOALIGN(\name) +SYM_FUNC_START(\name) pushq %rbp movq %rsp, %rbp @@ -36,7 +36,7 @@ SYM_FUNC_END(\name) EXPORT_SYMBOL(preempt_schedule_thunk) EXPORT_SYMBOL(preempt_schedule_notrace_thunk) -SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore) +SYM_CODE_START_LOCAL(__thunk_restore) popq %r11 popq %r10 popq %r9 -- cgit v1.2.3 From f6dabc817e1f0da8f4088734ad0b9814adad0bce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:52 +0200 Subject: crypto: x86/camellia: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. 
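
For reference, what "already imply alignment" means after the linkage.h
change earlier in this series: assuming CONFIG_FUNCTION_ALIGNMENT=16
and IBT enabled, a global SYM_FUNC_START(foo) now expands to roughly

	.globl foo
	.balign 16, 0x90
	foo:
	endbr64

(the name is illustrative), so a hand-written ".align 8" in front of it
cannot add anything.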
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.248229966@infradead.org --- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 2 -- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 4 ---- 2 files changed, 6 deletions(-) diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 2e1658ddbe1a..4a30618281ec 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -712,7 +712,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text -.align 8 SYM_FUNC_START_LOCAL(__camellia_enc_blk16) /* input: * %rdi: ctx, CTX @@ -799,7 +798,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16) jmp .Lenc_done; SYM_FUNC_END(__camellia_enc_blk16) -.align 8 SYM_FUNC_START_LOCAL(__camellia_dec_blk16) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0e4e9abbf4de..deaf62aa73a6 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -221,7 +221,6 @@ * Size optimization... with inlined roundsm32 binary would be over 5 times * larger and would only marginally faster. */ -.align 8 SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, @@ -229,7 +228,6 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c RET; SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) -.align 8 SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, @@ -748,7 +746,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text -.align 8 SYM_FUNC_START_LOCAL(__camellia_enc_blk32) /* input: * %rdi: ctx, CTX @@ -835,7 +832,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32) jmp .Lenc_done; SYM_FUNC_END(__camellia_enc_blk32) -.align 8 SYM_FUNC_START_LOCAL(__camellia_dec_blk32) /* input: * %rdi: ctx, CTX -- cgit v1.2.3 From 88cdf02551f9aef9284d778ecd375c400555d900 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:53 +0200 Subject: crypto: x86/cast5: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. 
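
The 16 bytes these routines now inherit is the same figure the compiler
applies to C code via -falign-functions=16 (FUNCTION_ALIGNMENT_16B on
x86_64). A userspace analogue, using the GCC function attribute to
force per function what the flag does globally:

	#include <stdio.h>

	__attribute__((aligned(16))) static void demo(void)
	{
	}

	int main(void)
	{
		/* prints 0: the low four address bits are clear */
		printf("entry misalignment: %lu\n",
		       (unsigned long)demo & 15);
		return 0;
	}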
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.353555711@infradead.org --- arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b258af420c92..0326a01503c3 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -208,7 +208,6 @@ .text -.align 16 SYM_FUNC_START_LOCAL(__cast5_enc_blk16) /* input: * %rdi: ctx @@ -282,7 +281,6 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) RET; SYM_FUNC_END(__cast5_enc_blk16) -.align 16 SYM_FUNC_START_LOCAL(__cast5_dec_blk16) /* input: * %rdi: ctx -- cgit v1.2.3 From ba1b270c20dfb7f7b7a076b1a97ef4b7dcb539b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:54 +0200 Subject: crypto: x86/crct10dif-pcl: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.456602381@infradead.org --- arch/x86/crypto/crct10dif-pcl-asm_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index 721474abfb71..5286db5b8165 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -94,7 +94,6 @@ # # Assumes len >= 16. # -.align 16 SYM_FUNC_START(crc_t10dif_pcl) movdqa .Lbswap_mask(%rip), BSWAP_MASK -- cgit v1.2.3 From 8b44221671ec45d725a4558ff7aa5ea90ecfc885 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:55 +0200 Subject: crypto: x86/serpent: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.558544791@infradead.org --- arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 2 -- arch/x86/crypto/serpent-avx2-asm_64.S | 2 -- 2 files changed, 4 deletions(-) diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 82f2313f512b..97e283621851 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -550,7 +550,6 @@ #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -.align 8 SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) /* input: * %rdi: ctx, CTX @@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) RET; SYM_FUNC_END(__serpent_enc_blk8_avx) -.align 8 SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index 8ea34c9b9316..6d60c50593a9 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -550,7 +550,6 @@ #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -.align 8 SYM_FUNC_START_LOCAL(__serpent_enc_blk16) /* input: * %rdi: ctx, CTX @@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16) RET; SYM_FUNC_END(__serpent_enc_blk16) -.align 8 SYM_FUNC_START_LOCAL(__serpent_dec_blk16) /* input: * %rdi: ctx, CTX -- cgit v1.2.3 From c2a3ce6fdb122b12bc4cfffd28ecf8a9fb0d6736 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:56 +0200 Subject: crypto: x86/sha1: Remove custom alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.662580589@infradead.org --- arch/x86/crypto/sha1_ni_asm.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 2f94ec0e763b..cd943b2af2c4 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -92,7 +92,6 @@ * numBlocks: Number of blocks to process */ .text -.align 32 SYM_FUNC_START(sha1_ni_transform) push %rbp mov %rsp, %rbp -- cgit v1.2.3 From 3ba56d0b87113785413dfc5b9910d45001cc4eeb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:57 +0200 Subject: crypto: x86/sha256: Remove custom alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.766564176@infradead.org --- arch/x86/crypto/sha256-avx-asm.S | 1 - arch/x86/crypto/sha256-avx2-asm.S | 1 - arch/x86/crypto/sha256-ssse3-asm.S | 1 - arch/x86/crypto/sha256_ni_asm.S | 1 - 4 files changed, 4 deletions(-) diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 3baa1ec39097..3649370690c5 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -347,7 +347,6 @@ a = TMP_ ######################################################################## .text SYM_FUNC_START(sha256_transform_avx) -.align 32 pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9bcdbc47b8b4..c4c1dc5ee078 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -524,7 +524,6 @@ STACK_SIZE = _CTX + _CTX_SIZE ######################################################################## .text SYM_FUNC_START(sha256_transform_rorx) -.align 32 pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index c4a5db612c32..96b7dcdeaebe 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -356,7 +356,6 @@ a = TMP_ ######################################################################## .text SYM_FUNC_START(sha256_transform_ssse3) -.align 32 pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 94d50dd27cb5..b3f1a1a12027 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -96,7 +96,6 @@ */ .text -.align 32 SYM_FUNC_START(sha256_ni_transform) shl $6, NUM_BLKS /* convert to bytes */ -- cgit v1.2.3 From 2f93238b87ddbbe1b050ec48ab5843fc61346adb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:58 +0200 Subject: crypto: x86/sm[34]: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. 
( this code couldn't seem to make up it's mind about what alignment it actually wanted, randomly mixing 8 and 16 bytes ) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.868540856@infradead.org --- arch/x86/crypto/sm3-avx-asm_64.S | 1 - arch/x86/crypto/sm4-aesni-avx-asm_64.S | 7 ------- arch/x86/crypto/sm4-aesni-avx2-asm_64.S | 6 ------ 3 files changed, 14 deletions(-) diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S index b12b9efb5ec5..b28d804ee10d 100644 --- a/arch/x86/crypto/sm3-avx-asm_64.S +++ b/arch/x86/crypto/sm3-avx-asm_64.S @@ -327,7 +327,6 @@ * void sm3_transform_avx(struct sm3_state *state, * const u8 *data, int nblocks); */ -.align 16 SYM_FUNC_START(sm3_transform_avx) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index 4767ab61ff48..e13c8537b2ec 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -139,13 +139,11 @@ .text -.align 16 /* * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, * const u8 *src, int nblocks) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_crypt4) /* input: * %rdi: round key array, CTX @@ -249,7 +247,6 @@ SYM_FUNC_START(sm4_aesni_avx_crypt4) RET; SYM_FUNC_END(sm4_aesni_avx_crypt4) -.align 8 SYM_FUNC_START_LOCAL(__sm4_crypt_blk8) /* input: * %rdi: round key array, CTX @@ -363,7 +360,6 @@ SYM_FUNC_END(__sm4_crypt_blk8) * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, * const u8 *src, int nblocks) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_crypt8) /* input: * %rdi: round key array, CTX @@ -419,7 +415,6 @@ SYM_FUNC_END(sm4_aesni_avx_crypt8) * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) /* input: * %rdi: round key array, CTX @@ -494,7 +489,6 @@ SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8) * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) /* input: * %rdi: round key array, CTX @@ -544,7 +538,6 @@ SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8) * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) /* input: * %rdi: round key array, CTX diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index 4732fe8bb65b..2212705f7da6 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -153,9 +153,6 @@ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef .text -.align 16 - -.align 8 SYM_FUNC_START_LOCAL(__sm4_crypt_blk16) /* input: * %rdi: round key array, CTX @@ -281,7 +278,6 @@ SYM_FUNC_END(__sm4_crypt_blk16) * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) /* input: * %rdi: round key array, CTX @@ -394,7 +390,6 @@ SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16) * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) /* input: * %rdi: round key array, CTX @@ -448,7 +443,6 @@ SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16) * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) /* input: * %rdi: round key array, CTX -- cgit v1.2.3 From e2c9475e88f70820270ca5cc81a1ae2fda262278 Mon 
Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:59 +0200 Subject: crypto: twofish: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.971229477@infradead.org --- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 31f9b2ec3857..12fde271cd3f 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -228,7 +228,6 @@ vpxor x2, wkey, x2; \ vpxor x3, wkey, x3; -.align 8 SYM_FUNC_START_LOCAL(__twofish_enc_blk8) /* input: * %rdi: ctx, CTX @@ -270,7 +269,6 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8) RET; SYM_FUNC_END(__twofish_enc_blk8) -.align 8 SYM_FUNC_START_LOCAL(__twofish_dec_blk8) /* input: * %rdi: ctx, CTX -- cgit v1.2.3 From fdc9ee7e97aa2c1dfa7ebb092fffec40ffa59108 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:00 +0200 Subject: crypto: x86/poly1305: Remove custom function alignment SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.073285765@infradead.org --- arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 2077ce7a5647..b9abcd79c1f4 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -108,7 +108,6 @@ if (!$kernel) { sub declare_function() { my ($name, $align, $nargs) = @_; if($kernel) { - $code .= ".align $align\n"; $code .= "SYM_FUNC_START($name)\n"; $code .= ".L$name:\n"; } else { -- cgit v1.2.3 From e57ef2ed97c1d078973298658a8096644a1e9e09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:01 +0200 Subject: x86: Put hot per CPU variables into a struct The layout of per-cpu variables is at the mercy of the compiler. This can lead to random performance fluctuations from build to build. Create a structure to hold some of the hottest per-cpu variables, starting with current_task. 
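
The union/pad construct does two jobs: the anonymous struct lets the
members keep their natural names (pcpu_hot.current_task), while the pad
plus the static_assert() pin the object to exactly one cache line, so a
later addition that overflows 64 bytes fails the build instead of
silently spilling onto a second line. A standalone model of the trick:

	#include <stdio.h>

	struct hot {
		union {
			struct {
				void	*current_task;	/* hot members */
			};
			char	pad[64];	/* reserve the full line */
		};
	} __attribute__((aligned(64)));

	_Static_assert(sizeof(struct hot) == 64, "must stay one cache line");

	int main(void)
	{
		struct hot h = { .current_task = (void *)1 };

		/* anonymous members read as if declared directly */
		printf("task=%p size=%zu\n", h.current_task, sizeof(h));
		return 0;
	}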
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.179707194@infradead.org --- arch/x86/include/asm/current.h | 19 ++++++++++++++++--- arch/x86/kernel/cpu/common.c | 14 +++++--------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 5 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 3e204e6140b5..63c42ac3cd86 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -3,16 +3,29 @@ #define _ASM_X86_CURRENT_H #include -#include #ifndef __ASSEMBLY__ + +#include +#include + struct task_struct; -DECLARE_PER_CPU(struct task_struct *, current_task); +struct pcpu_hot { + union { + struct { + struct task_struct *current_task; + }; + u8 pad[64]; + }; +}; +static_assert(sizeof(struct pcpu_hot) == 64); + +DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); static __always_inline struct task_struct *get_current(void) { - return this_cpu_read_stable(current_task); + return this_cpu_read_stable(pcpu_hot.current_task); } #define current get_current() diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8e873181759a..52071539a14c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2012,18 +2012,16 @@ static __init int setup_clearcpuid(char *arg) } __setup("clearcpuid=", setup_clearcpuid); +DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { + .current_task = &init_task, +}; +EXPORT_PER_CPU_SYMBOL(pcpu_hot); + #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); -/* - * The following percpu variables are hot. Align current_task to - * cacheline size such that they fall in the same cacheline. - */ -DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = - &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); @@ -2083,8 +2081,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2f314b170c9f..807da45d84c7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - this_cpu_write(current_task, next_p); + raw_cpu_write(pcpu_hot.current_task, next_p); switch_fpu_finish(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6b3418bff326..c4f6cacf6599 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -617,7 +617,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. 
*/ - this_cpu_write(current_task, next_p); + raw_cpu_write(pcpu_hot.current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce8728d2e5ef..05f315777691 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1046,7 +1046,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); - per_cpu(current_task, cpu) = idle; + per_cpu(pcpu_hot.current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ -- cgit v1.2.3 From 64701838bf0575ef8acb1ad2db5934e864f3e6c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:02 +0200 Subject: x86/percpu: Move preempt_count next to current_task Add preempt_count to pcpu_hot, since it is once of the most used per-cpu variables. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.284170644@infradead.org --- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/preempt.h | 27 ++++++++++++++------------- arch/x86/kernel/cpu/common.c | 8 +------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 63c42ac3cd86..0f4b46293c6c 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -15,6 +15,7 @@ struct pcpu_hot { union { struct { struct task_struct *current_task; + int preempt_count; }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 5f6daea1ee24..2d13f25b1bd8 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -4,11 +4,11 @@ #include #include +#include + #include #include -DECLARE_PER_CPU(int, __preempt_count); - /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -24,7 +24,7 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) @@ -32,10 +32,10 @@ static __always_inline void preempt_count_set(int pc) int old, new; do { - old = raw_cpu_read_4(__preempt_count); + old = raw_cpu_read_4(pcpu_hot.preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); + } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old); } /* @@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ + per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* @@ -58,17 +58,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(raw_cpu_read_4(__preempt_count) & 
PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -77,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - raw_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(pcpu_hot.preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - raw_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(pcpu_hot.preempt_count, -val); } /* @@ -92,7 +92,8 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + __percpu_arg([var])); } /* @@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(int preempt_offset) { - return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); + return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52071539a14c..cafb6bd90d10 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2014,6 +2014,7 @@ __setup("clearcpuid=", setup_clearcpuid); DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .current_task = &init_task, + .preempt_count = INIT_PREEMPT_COUNT, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); @@ -2022,13 +2023,9 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); - DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; static void wrmsrl_cstar(unsigned long val) @@ -2081,9 +2078,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - /* * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find * the top of the kernel stack. Use an extra percpu variable to track the -- cgit v1.2.3 From 7443b296e699e6922f5be243c8d2e316de8cacbe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:03 +0200 Subject: x86/percpu: Move cpu_number next to current_task Also add cpu_number to the pcpu_hot structure, it is often referenced and this cacheline is there. 
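
At this point every load behind get_current(), preempt_count() and
smp_processor_id() hits the same 64-byte line. A quick userspace model
(field set as of this patch) that makes the layout visible:

	#include <stdio.h>
	#include <stddef.h>

	struct pcpu_hot_demo {
		union {
			struct {
				void	*current_task;
				int	preempt_count;
				int	cpu_number;
			};
			char	pad[64];
		};
	};

	int main(void)
	{
		/* prints: size=64 task@0 preempt@8 cpu@12 on LP64 */
		printf("size=%zu task@%zu preempt@%zu cpu@%zu\n",
		       sizeof(struct pcpu_hot_demo),
		       offsetof(struct pcpu_hot_demo, current_task),
		       offsetof(struct pcpu_hot_demo, preempt_count),
		       offsetof(struct pcpu_hot_demo, cpu_number));
		return 0;
	}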
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.387678283@infradead.org --- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/smp.h | 12 +++++------- arch/x86/kernel/setup_percpu.c | 5 +---- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 0f4b46293c6c..8ac6589e9a1b 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -16,6 +16,7 @@ struct pcpu_hot { struct { struct task_struct *current_task; int preempt_count; + int cpu_number; }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index a73bced40e24..b4dbb20dab1a 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -3,10 +3,10 @@ #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include -#include -#include #include +#include +#include extern int smp_num_siblings; extern unsigned int num_processors; @@ -19,7 +19,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); @@ -150,11 +149,10 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); /* * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. + * from the initial startup. */ -#define raw_smp_processor_id() this_cpu_read(cpu_number) -#define __smp_processor_id() __this_cpu_read(cpu_number) +#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number) +#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 555089a5b446..c2fc4c41c164 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -23,9 +23,6 @@ #include #include -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); - #ifdef CONFIG_X86_64 #define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) #else @@ -172,7 +169,7 @@ void __init setup_per_cpu_areas(void) for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(cpu_number, cpu) = cpu; + per_cpu(pcpu_hot.cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* * Copy data used in early init routines from the -- cgit v1.2.3 From c063a217bc0726c2560138229de5673dbb253a02 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:04 +0200 Subject: x86/percpu: Move current_top_of_stack next to current_task Extend the struct pcpu_hot cacheline with current_top_of_stack; another very frequently used value. 
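The layout trick carrying this series is visible in the current.h hunks: the hot members live in an anonymous struct overlaid, via a union, with a 64-byte pad, so the whole object is exactly one cache line. A stand-alone sketch (the _Static_assert is an assumption added for illustration, not something these patches carry) shows how growing past the cache line would be caught at build time:

	struct hot_sketch {
		union {
			struct {
				void		*current_task;
				int		preempt_count;
				int		cpu_number;
				unsigned long	top_of_stack;
			};
			unsigned char	pad[64];
		};
	};

	_Static_assert(sizeof(struct hot_sketch) == 64,
		       "hot members must fit in one cache line");

If the anonymous struct ever outgrows pad[], the union (and therefore the sizeof) grows with it, which is exactly the signal that a member no longer shares the hot cache line.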
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.493038635@infradead.org --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 6 +++--- arch/x86/entry/entry_64_compat.S | 6 +++--- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/processor.h | 4 +--- arch/x86/kernel/asm-offsets.c | 2 ++ arch/x86/kernel/cpu/common.c | 12 +----------- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- 11 files changed, 19 insertions(+), 28 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e309e7156038..91397f58ac30 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1181,7 +1181,7 @@ SYM_CODE_START(asm_exc_nmi) * is using the thread stack right now, so it's safe for us to use it. */ movl %esp, %ebx - movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp call exc_nmi movl %ebx, %esp @@ -1243,7 +1243,7 @@ SYM_CODE_START(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movl PER_CPU_VAR(cpu_current_top_of_stack), %esi + movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp call make_task_dead diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e635f962afb8..9249a45cf53f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64) /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -1209,7 +1209,7 @@ SYM_CODE_START(asm_exc_nmi) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ @@ -1525,7 +1525,7 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. 
*/ xorl %ebp, %ebp - movq PER_CPU_VAR(cpu_current_top_of_stack), %rax + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4dd19819053a..1dfee868d4a1 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -58,7 +58,7 @@ SYM_CODE_START(entry_SYSENTER_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax popq %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ @@ -191,7 +191,7 @@ SYM_CODE_START(entry_SYSCALL_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* Switch to the kernel stack */ - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -332,7 +332,7 @@ SYM_CODE_START(entry_INT80_compat) ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV movq %rsp, %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp pushq 5*8(%rax) /* regs->ss */ pushq 4*8(%rax) /* regs->rsp */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 8ac6589e9a1b..2dd013128f1e 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -17,6 +17,7 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; + unsigned long top_of_stack; }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c660700ecfc6..c345f3096c80 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -426,8 +426,6 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); - #ifdef CONFIG_X86_64 struct fixed_percpu_data { /* @@ -566,7 +564,7 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. 
*/ - return this_cpu_read_stable(cpu_current_top_of_stack); + return this_cpu_read_stable(pcpu_hot.top_of_stack); } static __always_inline bool on_thread_stack(void) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cb50589a7102..a9824318e1c5 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -109,6 +109,8 @@ static void __used common(void) OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); + OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); + if (IS_ENABLED(CONFIG_KVM_INTEL)) { BLANK(); OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cafb6bd90d10..408245c2eead 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2015,6 +2015,7 @@ __setup("clearcpuid=", setup_clearcpuid); DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .current_task = &init_task, .preempt_count = INIT_PREEMPT_COUNT, + .top_of_stack = TOP_OF_INIT_STACK, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); @@ -2026,8 +2027,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; - static void wrmsrl_cstar(unsigned long val) { /* @@ -2078,15 +2077,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -/* - * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find - * the top of the kernel stack. Use an extra percpu variable to track the - * top of the kernel stack directly. - */ -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); - #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 807da45d84c7..470c128759ea 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -191,13 +191,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0 and cpu_current_top_of_stack. This changes + * Reload esp0 and pcpu_hot.top_of_stack. This changes * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ update_task_stack(next_p); refresh_sysenter_cs(next); - this_cpu_write(cpu_current_top_of_stack, + this_cpu_write(pcpu_hot.top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c4f6cacf6599..7f807e8bc923 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -618,7 +618,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. 
*/ raw_cpu_write(pcpu_hot.current_task, next_p); - this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 05f315777691..87863a93e918 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1056,7 +1056,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 178015a820f0..7ac19aba8983 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -851,7 +851,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; @@ -869,7 +869,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { - sp = this_cpu_read(cpu_current_top_of_stack); + sp = this_cpu_read(pcpu_hot.top_of_stack); goto sync; } -- cgit v1.2.3 From d7b6d709a76a4f4ef3108ac41e1b39eb80f5c084 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:05 +0200 Subject: x86/percpu: Move irq_stack variables next to current_task Further extend struct pcpu_hot with the hard and soft irq stack pointers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.599170752@infradead.org --- arch/x86/include/asm/current.h | 6 ++++++ arch/x86/include/asm/irq_stack.h | 12 ++++++------ arch/x86/include/asm/processor.h | 4 ---- arch/x86/kernel/cpu/common.c | 3 --- arch/x86/kernel/dumpstack_32.c | 4 ++-- arch/x86/kernel/dumpstack_64.c | 2 +- arch/x86/kernel/irq_32.c | 13 +++++-------- arch/x86/kernel/irq_64.c | 6 +++--- arch/x86/kernel/process_64.c | 2 +- 9 files changed, 24 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 2dd013128f1e..ac3090ddf34e 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -18,6 +18,12 @@ struct pcpu_hot { int preempt_count; int cpu_number; unsigned long top_of_stack; + void *hardirq_stack_ptr; +#ifdef CONFIG_X86_64 + bool hardirq_stack_inuse; +#else + void *softirq_stack_ptr; +#endif }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 147cb8fdda92..798183867d78 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -116,7 +116,7 @@ ASM_CALL_ARG2 #define call_on_irqstack(func, asm_call, argconstr...) \ - call_on_stack(__this_cpu_read(hardirq_stack_ptr), \ + call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \ func, asm_call, argconstr) /* Macros to assert type correctness for run_*_on_irqstack macros */ @@ -135,7 +135,7 @@ * User mode entry and interrupt on the irq stack do not \ * switch stacks. If from user mode the task stack is empty. 
\ */ \ - if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ + if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \ irq_enter_rcu(); \ func(c_args); \ irq_exit_rcu(); \ @@ -146,9 +146,9 @@ * places. Invoke the stack switch macro with the call \ * sequence which matches the above direct invocation. \ */ \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(func, asm_call, constr); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } \ } @@ -212,9 +212,9 @@ */ #define do_softirq_own_stack() \ { \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c345f3096c80..bdde68744eb3 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -448,8 +448,6 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(void *, hardirq_stack_ptr); -DECLARE_PER_CPU(bool, hardirq_stack_inuse); extern asmlinkage void ignore_sysret(void); /* Save actual FS/GS selectors and bases to current->thread */ @@ -458,8 +456,6 @@ void current_save_fsgs(void); #ifdef CONFIG_STACKPROTECTOR DECLARE_PER_CPU(unsigned long, __stack_chk_guard); #endif -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* !X86_64 */ struct perf_event; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 408245c2eead..2bec4b4b2c50 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2024,9 +2024,6 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); -DEFINE_PER_CPU(void *, hardirq_stack_ptr); -DEFINE_PER_CPU(bool, hardirq_stack_inuse); - static void wrmsrl_cstar(unsigned long val) { /* diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 722fd712e1cf..b4905d5173fd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6c5defd6569a..f05339fee778 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool 
in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 01833ebf5e8e..dc1049c01f9b 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -52,9 +52,6 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -77,7 +74,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -115,7 +112,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) int node = cpu_to_node(cpu); struct page *ph, *ps; - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); @@ -127,8 +124,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return -ENOMEM; } - per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); - per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); return 0; } @@ -138,7 +135,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1c0fb96b9e39..fe0c859873d1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -50,7 +50,7 @@ static int map_irq_stack(unsigned int cpu) return -ENOMEM; /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -63,14 +63,14 @@ static int map_irq_stack(unsigned int cpu) void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif int irq_init_percpu_irqstack(unsigned int cpu) { - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7f807e8bc923..1312de5b76aa 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -563,7 +563,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(hardirq_stack_inuse)); + this_cpu_read(pcpu_hot.hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); -- cgit v1.2.3 From 7fcecafebed90d03f35bec6e147fc0b5f6e1bc71 Mon Sep 17 00:00:00 2001 From: 
Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:06 +0200 Subject: x86/softirq: Move softirq pending next to current task Another hot variable which is strictly per CPU and benefits from being in the same cache line. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.702133710@infradead.org --- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/hardirq.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index ac3090ddf34e..b89aba077b84 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -19,6 +19,7 @@ struct pcpu_hot { int cpu_number; unsigned long top_of_stack; void *hardirq_stack_ptr; + u16 softirq_pending; #ifdef CONFIG_X86_64 bool hardirq_stack_inuse; #else diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 275e7fd20310..66837b8c67f1 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,9 +3,9 @@ #define _ASM_X86_HARDIRQ_H #include +#include typedef struct { - u16 __softirq_pending; #if IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif @@ -60,6 +60,7 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat +#define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) static inline void kvm_set_cpu_l1tf_flush_l1d(void) -- cgit v1.2.3 From 5b71ac8a2a3185da34a6556e791b533b48183a41 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 17 Oct 2022 16:41:06 +0200 Subject: x86: Fixup asm-offsets duplicate It turns out that 'stack_canary_offset' is a variable name; shadowing that with a #define is ripe for failure when the asm-offsets.h header gets included. Rename the thing. Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/asm-offsets_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9249a45cf53f..5c578a7dfcd7 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -252,7 +252,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary #endif /* diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 9b698215d261..bb65371ea9df 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -57,7 +57,7 @@ int main(void) BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); + OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); BLANK(); #endif return 0; -- cgit v1.2.3 From 61c6065ef7ec0447a280179d04b2d81c80c2f479 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:07 +0200 Subject: objtool: Allow !PC relative relocations Objtool doesn't currently much like per-cpu usage in alternatives: arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xf: unsupported relocation in alternatives section f: 65 c7 04 25 00 00 00 00 00 00 00 80 movl $0x80000000,%gs:0x0 13: R_X86_64_32S __x86_call_depth Since the R_X86_64_32S relocation is location invariant (its computation doesn't include P - the address of the location itself), it can be trivially allowed.
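For reference, the two relocation families differ only in whether P, the address of the location being patched, enters the computation; S is the symbol value and A the addend. A small sketch of the resolution formulas (function names are illustrative; the semantics follow the ELF psABI):

	#include <stdint.h>

	static inline uint64_t resolve_absolute(uint64_t S, int64_t A)
	{
		return S + A;		/* e.g. R_X86_64_32S: the same bytes work anywhere */
	}

	static inline uint64_t resolve_pc_relative(uint64_t S, int64_t A, uint64_t P)
	{
		return S + A - P;	/* e.g. R_X86_64_PC32: breaks when the bytes move */
	}

Alternatives copy replacement bytes to a different address, which changes P after the value was baked in at link time; only the PC-relative family cares, hence the arch_pc_relative_reloc() filter below.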
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.806607235@infradead.org --- tools/objtool/arch/x86/decode.c | 24 ++++++++++++++++++++++++ tools/objtool/check.c | 2 +- tools/objtool/include/objtool/arch.h | 2 ++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 1c253b4b7ce0..f0943830add7 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -73,6 +73,30 @@ unsigned long arch_jump_destination(struct instruction *insn) return insn->offset + insn->len + insn->immediate; } +bool arch_pc_relative_reloc(struct reloc *reloc) +{ + /* + * All relocation types where P (the address of the target) + * is included in the computation. + */ + switch (reloc->type) { + case R_X86_64_PC8: + case R_X86_64_PC16: + case R_X86_64_PC32: + case R_X86_64_PC64: + + case R_X86_64_PLT32: + case R_X86_64_GOTPC32: + case R_X86_64_GOTPCREL: + return true; + + default: + break; + } + + return false; +} + #define ADD_OP(op) \ if (!(op = calloc(1, sizeof(*op)))) \ return -1; \ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 43ec14c29a60..7174bba14494 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1645,7 +1645,7 @@ static int handle_group_alt(struct objtool_file *file, * accordingly. */ alt_reloc = insn_reloc(file, insn); - if (alt_reloc && + if (alt_reloc && arch_pc_relative_reloc(alt_reloc) && !arch_support_alt_relocation(special_alt, insn, alt_reloc)) { WARN_FUNC("unsupported relocation in alternatives section", diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index beb2f3aa94ff..fe2ea4b892c3 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -93,4 +93,6 @@ bool arch_is_rethunk(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); +bool arch_pc_relative_reloc(struct reloc *reloc); + #endif /* _ARCH_H */ -- cgit v1.2.3 From 6644ee846cb983437063da8fd24b7cae671fd019 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:08 +0200 Subject: objtool: Track init section For future usage of .init.text exclusion, track the init section in the instruction decoder and use the result in retpoline validation.
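The mechanics, reduced to a stand-alone sketch (struct and function names are illustrative, not objtool's own): classify each section once at decode time, then test a flag per instruction instead of strcmp()ing the section name inside the validation loop:

	#include <stdbool.h>
	#include <string.h>

	struct sec_sketch  { const char *name; bool init; };
	struct insn_sketch { struct sec_sketch *sec; };

	/* decode time: one strcmp() per section; module objects keep
	 * init == false because their .init.text runs late and still
	 * needs retpolines */
	static void classify_section(struct sec_sketch *sec, bool module)
	{
		sec->init = !module && !strcmp(sec->name, ".init.text");
	}

	/* validation time: a flag test per instruction */
	static bool skip_retpoline_check(const struct insn_sketch *insn)
	{
		return insn->sec->init;
	}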
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.910334431@infradead.org --- tools/objtool/check.c | 17 ++++++++++------- tools/objtool/include/objtool/elf.h | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7174bba14494..bb7c8196bf57 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -382,6 +382,15 @@ static int decode_instructions(struct objtool_file *file) !strncmp(sec->name, ".text.__x86.", 12)) sec->noinstr = true; + /* + * .init.text code is ran before userspace and thus doesn't + * strictly need retpolines, except for modules which are + * loaded late, they very much do need retpoline in their + * .init.text + */ + if (!strcmp(sec->name, ".init.text") && !opts.module) + sec->init = true; + for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { insn = malloc(sizeof(*insn)); if (!insn) { @@ -3748,13 +3757,7 @@ static int validate_retpoline(struct objtool_file *file) if (insn->retpoline_safe) continue; - /* - * .init.text code is ran before userspace and thus doesn't - * strictly need retpolines, except for modules which are - * loaded late, they very much do need retpoline in their - * .init.text - */ - if (!strcmp(insn->sec->name, ".init.text") && !opts.module) + if (insn->sec->init) continue; if (insn->type == INSN_RETURN) { diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 16f4067b82ae..baa808583c4f 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -38,7 +38,7 @@ struct section { Elf_Data *data; char *name; int idx; - bool changed, text, rodata, noinstr; + bool changed, text, rodata, noinstr, init; }; struct symbol { -- cgit v1.2.3 From 00abd38408127a57861698a8bffba65849de6bbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:09 +0200 Subject: objtool: Add .call_sites section In preparation for call depth tracking provide a section which collects all direct calls. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.016511961@infradead.org --- arch/x86/kernel/vmlinux.lds.S | 7 +++++ tools/objtool/check.c | 51 +++++++++++++++++++++++++++++++++ tools/objtool/include/objtool/objtool.h | 1 + tools/objtool/objtool.c | 1 + 4 files changed, 60 insertions(+) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0e9fc080c417..b69df9e013cc 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,6 +291,13 @@ SECTIONS *(.return_sites) __return_sites_end = .; } + + . 
= ALIGN(8); + .call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) { + __call_sites = .; + *(.call_sites) + __call_sites_end = .; + } #endif #ifdef CONFIG_X86_KERNEL_IBT diff --git a/tools/objtool/check.c b/tools/objtool/check.c index bb7c8196bf57..f578e030e8bb 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -902,6 +902,49 @@ static int create_mcount_loc_sections(struct objtool_file *file) return 0; } +static int create_direct_call_sections(struct objtool_file *file) +{ + struct instruction *insn; + struct section *sec; + unsigned int *loc; + int idx; + + sec = find_section_by_name(file->elf, ".call_sites"); + if (sec) { + INIT_LIST_HEAD(&file->call_list); + WARN("file already has .call_sites section, skipping"); + return 0; + } + + if (list_empty(&file->call_list)) + return 0; + + idx = 0; + list_for_each_entry(insn, &file->call_list, call_node) + idx++; + + sec = elf_create_section(file->elf, ".call_sites", 0, sizeof(unsigned int), idx); + if (!sec) + return -1; + + idx = 0; + list_for_each_entry(insn, &file->call_list, call_node) { + + loc = (unsigned int *)sec->data->d_buf + idx; + memset(loc, 0, sizeof(unsigned int)); + + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(unsigned int), + R_X86_64_PC32, + insn->sec, insn->offset)) + return -1; + + idx++; + } + + return 0; +} + /* * Warnings shouldn't be reported for ignored functions. */ @@ -1279,6 +1322,9 @@ static void annotate_call_site(struct objtool_file *file, return; } + if (insn->type == INSN_CALL && !insn->sec->init) + list_add_tail(&insn->call_node, &file->call_list); + if (!sibling && dead_end_function(file, sym)) insn->dead_end = true; } @@ -4305,6 +4351,11 @@ int check(struct objtool_file *file) if (ret < 0) goto out; warnings += ret; + + ret = create_direct_call_sections(file); + if (ret < 0) + goto out; + warnings += ret; } if (opts.mcount) { diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index 7f2d1b095333..6b40977bcdb1 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -28,6 +28,7 @@ struct objtool_file { struct list_head static_call_list; struct list_head mcount_loc_list; struct list_head endbr_list; + struct list_head call_list; bool ignore_unreachables, hints, rodata; unsigned int nr_endbr; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index a7ecc32e3512..6affd8067f83 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -106,6 +106,7 @@ struct objtool_file *objtool_open_read(const char *_objname) INIT_LIST_HEAD(&file.static_call_list); INIT_LIST_HEAD(&file.mcount_loc_list); INIT_LIST_HEAD(&file.endbr_list); + INIT_LIST_HEAD(&file.call_list); file.ignore_unreachables = opts.no_unreachable; file.hints = false; -- cgit v1.2.3 From 0c0a6d8934e2081df93ba0bfc0cf615cc9c06988 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:10 +0200 Subject: objtool: Add --hacks=skylake Make the call/func sections selectable via the --hacks option. 
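Illustrative invocations (assuming the single-command objtool interface of this era; these example lines are not from the patch):

	$ objtool --hacks vmlinux.o                   # no argument: all hacks, skylake included
	$ objtool --hacks=skylake vmlinux.o           # only the .call_sites hack
	$ objtool --hacks=noinstr,skylake vmlinux.o   # comma list

Note that parse_hacks() above matches with strstr(), so the argument is treated as a substring match rather than a strict comma-separated list, and a missing argument enables every hack.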
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.120821440@infradead.org --- scripts/Makefile.lib | 1 + tools/objtool/builtin-check.c | 7 ++++++- tools/objtool/check.c | 10 ++++++---- tools/objtool/include/objtool/builtin.h | 1 + 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3aa384cec76b..85f02756dc9c 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -254,6 +254,7 @@ objtool := $(objtree)/tools/objtool/objtool objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr +objtool-args-$(CONFIG_CALL_DEPTH_TRACKING) += --hacks=skylake objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount objtool-args-$(CONFIG_UNWINDER_ORC) += --orc diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 24fbe803a0d3..0a04f8ea4432 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -57,12 +57,17 @@ static int parse_hacks(const struct option *opt, const char *str, int unset) found = true; } + if (!str || strstr(str, "skylake")) { + opts.hack_skylake = true; + found = true; + } + return found ? 0 : -1; } const struct option check_options[] = { OPT_GROUP("Actions:"), - OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr", "patch toolchain bugs/limitations", parse_hacks), + OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f578e030e8bb..1461c8894fb7 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4352,10 +4352,12 @@ int check(struct objtool_file *file) goto out; warnings += ret; - ret = create_direct_call_sections(file); - if (ret < 0) - goto out; - warnings += ret; + if (opts.hack_skylake) { + ret = create_direct_call_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } } if (opts.mcount) { diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 42a52f1a0add..22092a9f3cf6 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -14,6 +14,7 @@ struct opts { bool dump_orc; bool hack_jump_label; bool hack_noinstr; + bool hack_skylake; bool ibt; bool mcount; bool noinstr; -- cgit v1.2.3 From 5da6aea375cde499fdfac3cde4f26df4a840eb9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:12 +0200 Subject: objtool: Fix find_{symbol,func}_containing() The current find_{symbol,func}_containing() functions are broken in the face of overlapping symbols, exactly the case that is needed for a new ibt/endbr suppression. Import interval_tree_generic.h into the tools tree and convert the symbol tree to an interval tree to support proper range stabbing queries.
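The shape of the generated API, using the names this patch instantiates (ITPREFIX=__sym); this is a usage sketch only, and handle() is a placeholder. The property that matters is that a stabbing query visits every symbol whose [offset, offset+len-1] range contains the address, including an outer symbol that completely overlaps an inner one, which is precisely what a single ordered rb_find() could miss:

	struct symbol *sym;

	__sym_insert(new_sym, &sec->symbol_tree);		/* O(log n) insert */

	/* stab: iterate over *all* symbols containing addr */
	__sym_for_each(sym, &sec->symbol_tree, addr, addr) {
		handle(sym);
	}

	__sym_remove(old_sym, &sec->symbol_tree);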
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.330203761@infradead.org --- tools/include/linux/interval_tree_generic.h | 187 ++++++++++++++++++++++++++++ tools/objtool/elf.c | 93 ++++++-------- tools/objtool/include/objtool/elf.h | 3 +- 3 files changed, 229 insertions(+), 54 deletions(-) create mode 100644 tools/include/linux/interval_tree_generic.h diff --git a/tools/include/linux/interval_tree_generic.h b/tools/include/linux/interval_tree_generic.h new file mode 100644 index 000000000000..aaa8a0767aa3 --- /dev/null +++ b/tools/include/linux/interval_tree_generic.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + Interval Trees + (C) 2012 Michel Lespinasse + + + include/linux/interval_tree_generic.h +*/ + +#include + +/* + * Template for implementing interval trees + * + * ITSTRUCT: struct type of the interval tree nodes + * ITRB: name of struct rb_node field within ITSTRUCT + * ITTYPE: type of the interval endpoints + * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree + * ITSTART(n): start endpoint of ITSTRUCT node n + * ITLAST(n): last endpoint of ITSTRUCT node n + * ITSTATIC: 'static' or empty + * ITPREFIX: prefix to use for the inline tree definitions + * + * Note - before using this, please consider if generic version + * (interval_tree.h) would work for you... + */ + +#define INTERVAL_TREE_DEFINE(ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, \ + ITSTART, ITLAST, ITSTATIC, ITPREFIX) \ + \ +/* Callbacks for augmented rbtree insert and remove */ \ + \ +RB_DECLARE_CALLBACKS_MAX(static, ITPREFIX ## _augment, \ + ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, ITLAST) \ + \ +/* Insert / remove interval nodes from the tree */ \ + \ +ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, \ + struct rb_root_cached *root) \ +{ \ + struct rb_node **link = &root->rb_root.rb_node, *rb_parent = NULL; \ + ITTYPE start = ITSTART(node), last = ITLAST(node); \ + ITSTRUCT *parent; \ + bool leftmost = true; \ + \ + while (*link) { \ + rb_parent = *link; \ + parent = rb_entry(rb_parent, ITSTRUCT, ITRB); \ + if (parent->ITSUBTREE < last) \ + parent->ITSUBTREE = last; \ + if (start < ITSTART(parent)) \ + link = &parent->ITRB.rb_left; \ + else { \ + link = &parent->ITRB.rb_right; \ + leftmost = false; \ + } \ + } \ + \ + node->ITSUBTREE = last; \ + rb_link_node(&node->ITRB, rb_parent, link); \ + rb_insert_augmented_cached(&node->ITRB, root, \ + leftmost, &ITPREFIX ## _augment); \ +} \ + \ +ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ + struct rb_root_cached *root) \ +{ \ + rb_erase_augmented_cached(&node->ITRB, root, &ITPREFIX ## _augment); \ +} \ + \ +/* \ + * Iterate over intervals intersecting [start;last] \ + * \ + * Note that a node's interval intersects [start;last] iff: \ + * Cond1: ITSTART(node) <= last \ + * and \ + * Cond2: start <= ITLAST(node) \ + */ \ + \ +static ITSTRUCT * \ +ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ +{ \ + while (true) { \ + /* \ + * Loop invariant: start <= node->ITSUBTREE \ + * (Cond2 is satisfied by one of the subtree nodes) \ + */ \ + if (node->ITRB.rb_left) { \ + ITSTRUCT *left = rb_entry(node->ITRB.rb_left, \ + ITSTRUCT, ITRB); \ + if (start <= left->ITSUBTREE) { \ + /* \ + * Some nodes in left subtree satisfy Cond2. \ + * Iterate to find the leftmost such node N. \ + * If it also satisfies Cond1, that's the \ + * match we are looking for. 
Otherwise, there \ + * is no matching interval as nodes to the \ + * right of N can't satisfy Cond1 either. \ + */ \ + node = left; \ + continue; \ + } \ + } \ + if (ITSTART(node) <= last) { /* Cond1 */ \ + if (start <= ITLAST(node)) /* Cond2 */ \ + return node; /* node is leftmost match */ \ + if (node->ITRB.rb_right) { \ + node = rb_entry(node->ITRB.rb_right, \ + ITSTRUCT, ITRB); \ + if (start <= node->ITSUBTREE) \ + continue; \ + } \ + } \ + return NULL; /* No match */ \ + } \ +} \ + \ +ITSTATIC ITSTRUCT * \ +ITPREFIX ## _iter_first(struct rb_root_cached *root, \ + ITTYPE start, ITTYPE last) \ +{ \ + ITSTRUCT *node, *leftmost; \ + \ + if (!root->rb_root.rb_node) \ + return NULL; \ + \ + /* \ + * Fastpath range intersection/overlap between A: [a0, a1] and \ + * B: [b0, b1] is given by: \ + * \ + * a0 <= b1 && b0 <= a1 \ + * \ + * ... where A holds the lock range and B holds the smallest \ + * 'start' and largest 'last' in the tree. For the later, we \ + * rely on the root node, which by augmented interval tree \ + * property, holds the largest value in its last-in-subtree. \ + * This allows mitigating some of the tree walk overhead for \ + * for non-intersecting ranges, maintained and consulted in O(1). \ + */ \ + node = rb_entry(root->rb_root.rb_node, ITSTRUCT, ITRB); \ + if (node->ITSUBTREE < start) \ + return NULL; \ + \ + leftmost = rb_entry(root->rb_leftmost, ITSTRUCT, ITRB); \ + if (ITSTART(leftmost) > last) \ + return NULL; \ + \ + return ITPREFIX ## _subtree_search(node, start, last); \ +} \ + \ +ITSTATIC ITSTRUCT * \ +ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ +{ \ + struct rb_node *rb = node->ITRB.rb_right, *prev; \ + \ + while (true) { \ + /* \ + * Loop invariants: \ + * Cond1: ITSTART(node) <= last \ + * rb == node->ITRB.rb_right \ + * \ + * First, search right subtree if suitable \ + */ \ + if (rb) { \ + ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); \ + if (start <= right->ITSUBTREE) \ + return ITPREFIX ## _subtree_search(right, \ + start, last); \ + } \ + \ + /* Move up the tree until we come from a node's left child */ \ + do { \ + rb = rb_parent(&node->ITRB); \ + if (!rb) \ + return NULL; \ + prev = &node->ITRB; \ + node = rb_entry(rb, ITSTRUCT, ITRB); \ + rb = node->ITRB.rb_right; \ + } while (prev == rb); \ + \ + /* Check if the node intersects [start;last] */ \ + if (last < ITSTART(node)) /* !Cond1 */ \ + return NULL; \ + else if (start <= ITLAST(node)) /* Cond2 */ \ + return node; \ + } \ +} diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7e24b09b1163..89b37cd4ab1d 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -50,38 +51,22 @@ static inline u32 str_hash(const char *str) __elf_table(name); \ }) -static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b) +static inline unsigned long __sym_start(struct symbol *s) { - struct symbol *sa = rb_entry(a, struct symbol, node); - struct symbol *sb = rb_entry(b, struct symbol, node); - - if (sa->offset < sb->offset) - return true; - if (sa->offset > sb->offset) - return false; - - if (sa->len < sb->len) - return true; - if (sa->len > sb->len) - return false; - - sa->alias = sb; - - return false; + return s->offset; } -static int symbol_by_offset(const void *key, const struct rb_node *node) +static inline unsigned long __sym_last(struct symbol *s) { - const struct symbol *s = rb_entry(node, struct symbol, node); - const unsigned long *o = key; + return s->offset + s->len - 1; +} - if (*o < 
s->offset) - return -1; - if (*o >= s->offset + s->len) - return 1; +INTERVAL_TREE_DEFINE(struct symbol, node, unsigned long, __subtree_last, + __sym_start, __sym_last, static, __sym) - return 0; -} +#define __sym_for_each(_iter, _tree, _start, _end) \ + for (_iter = __sym_iter_first((_tree), (_start), (_end)); \ + _iter; _iter = __sym_iter_next(_iter, (_start), (_end))) struct symbol_hole { unsigned long key; @@ -147,13 +132,12 @@ static struct symbol *find_symbol_by_index(struct elf *elf, unsigned int idx) struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) { - struct rb_node *node; - - rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { - struct symbol *s = rb_entry(node, struct symbol, node); + struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; + struct symbol *iter; - if (s->offset == offset && s->type != STT_SECTION) - return s; + __sym_for_each(iter, tree, offset, offset) { + if (iter->offset == offset && iter->type != STT_SECTION) + return iter; } return NULL; @@ -161,13 +145,12 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) { - struct rb_node *node; + struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; + struct symbol *iter; - rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { - struct symbol *s = rb_entry(node, struct symbol, node); - - if (s->offset == offset && s->type == STT_FUNC) - return s; + __sym_for_each(iter, tree, offset, offset) { + if (iter->offset == offset && iter->type == STT_FUNC) + return iter; } return NULL; @@ -175,13 +158,12 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset) { - struct rb_node *node; - - rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { - struct symbol *s = rb_entry(node, struct symbol, node); + struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; + struct symbol *iter; - if (s->type != STT_SECTION) - return s; + __sym_for_each(iter, tree, offset, offset) { + if (iter->type != STT_SECTION) + return iter; } return NULL; @@ -202,7 +184,7 @@ int find_symbol_hole_containing(const struct section *sec, unsigned long offset) /* * Find the rightmost symbol for which @offset is after it. 
*/ - n = rb_find(&hole, &sec->symbol_tree, symbol_hole_by_offset); + n = rb_find(&hole, &sec->symbol_tree.rb_root, symbol_hole_by_offset); /* found a symbol that contains @offset */ if (n) @@ -224,13 +206,12 @@ int find_symbol_hole_containing(const struct section *sec, unsigned long offset) struct symbol *find_func_containing(struct section *sec, unsigned long offset) { - struct rb_node *node; - - rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { - struct symbol *s = rb_entry(node, struct symbol, node); + struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; + struct symbol *iter; - if (s->type == STT_FUNC) - return s; + __sym_for_each(iter, tree, offset, offset) { + if (iter->type == STT_FUNC) + return iter; } return NULL; @@ -373,6 +354,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) { struct list_head *entry; struct rb_node *pnode; + struct symbol *iter; INIT_LIST_HEAD(&sym->pv_target); sym->alias = sym; @@ -386,7 +368,12 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) sym->offset = sym->sym.st_value; sym->len = sym->sym.st_size; - rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); + __sym_for_each(iter, &sym->sec->symbol_tree, sym->offset, sym->offset) { + if (iter->offset == sym->offset && iter->type == sym->type) + iter->alias = sym; + } + + __sym_insert(sym, &sym->sec->symbol_tree); pnode = rb_prev(&sym->node); if (pnode) entry = &rb_entry(pnode, struct symbol, node)->list; @@ -401,7 +388,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) * can exist within a function, confusing the sorting. */ if (!sym->len) - rb_erase(&sym->node, &sym->sec->symbol_tree); + __sym_remove(sym, &sym->sec->symbol_tree); } static int read_symbols(struct elf *elf) diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index baa808583c4f..d28533106b78 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -30,7 +30,7 @@ struct section { struct hlist_node hash; struct hlist_node name_hash; GElf_Shdr sh; - struct rb_root symbol_tree; + struct rb_root_cached symbol_tree; struct list_head symbol_list; struct list_head reloc_list; struct section *base, *reloc; @@ -53,6 +53,7 @@ struct symbol { unsigned char bind, type; unsigned long offset; unsigned int len; + unsigned long __subtree_last; struct symbol *pfunc, *cfunc, *alias; u8 uaccess_safe : 1; u8 static_call_tramp : 1; -- cgit v1.2.3 From 08ef8c40112b8cd157515cd532f65cb82c934a76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:13 +0200 Subject: objtool: Allow symbol range comparisons for IBT/ENDBR A semi-common pattern is where code checks if a code address is within a specific range. All text addresses require either ENDBR or ANNOTATE_NOENDBR; however, an ANNOTATE_NOENDBR just past the range is unnatural. Instead, suppress this warning when the reference is exactly at the end of a symbol that itself starts with either ENDBR or ANNOTATE_NOENDBR.
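The entry_64_compat.S hunk below is an instance of that pattern: taking the address of __end_entry_SYSENTER_compat creates a relocation to the first byte past the symbol, which is only ever compared against, never branched to. Roughly the shape of the consumer, as a hedged sketch (the real user is is_sysenter_singlestep(); the helper name here is made up):

	#include <stdbool.h>

	extern const char entry_SYSENTER_compat[], __end_entry_SYSENTER_compat[];

	static bool within_sysenter_compat(unsigned long ip)
	{
		/* [start, end) check: 'end' points one past the last byte,
		 * i.e. at a !ENDBR location, which noendbr_range() accepts */
		return ip >= (unsigned long)entry_SYSENTER_compat &&
		       ip <  (unsigned long)__end_entry_SYSENTER_compat;
	}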
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.434642471@infradead.org --- arch/x86/entry/entry_64_compat.S | 1 - tools/objtool/check.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 1dfee868d4a1..bc45ea7d08ee 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -128,7 +128,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) popfq jmp .Lsysenter_flags_fixed SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL) - ANNOTATE_NOENDBR // is_sysenter_singlestep SYM_CODE_END(entry_SYSENTER_compat) /* diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 1461c8894fb7..3f46f46ab85c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4033,6 +4033,24 @@ static void mark_endbr_used(struct instruction *insn) list_del_init(&insn->call_node); } +static bool noendbr_range(struct objtool_file *file, struct instruction *insn) +{ + struct symbol *sym = find_symbol_containing(insn->sec, insn->offset-1); + struct instruction *first; + + if (!sym) + return false; + + first = find_insn(file, sym->sec, sym->offset); + if (!first) + return false; + + if (first->type != INSN_ENDBR && !first->noendbr) + return false; + + return insn->offset == sym->offset + sym->len; +} + static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn) { struct instruction *dest; @@ -4105,9 +4123,19 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn) continue; } + /* + * Accept anything ANNOTATE_NOENDBR. + */ if (dest->noendbr) continue; + /* + * Accept if this is the instruction after a symbol + * that is (no)endbr -- typical code-range usage. + */ + if (noendbr_range(file, dest)) + continue; WARN_FUNC("relocation to !ENDBR: %s", insn->sec, insn->offset, offstr(dest->sec, dest->offset)); -- cgit v1.2.3 From dbcdbdfdf137b49144204571f1a5e5dc01b8aaad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Sep 2022 22:03:50 +0200 Subject: objtool: Rework instruction -> symbol mapping Currently insn->func contains an instruction -> symbol link for STT_FUNC symbols. A NULL value is assumed to mean STT_NOTYPE. However, there are also instructions not covered by any symbol at all. This can happen due to __weak symbols for example. Since the current scheme cannot differentiate between no symbol and STT_NOTYPE symbol, change things around. Make insn->sym point to any symbol type such that !insn->sym means no symbol and add a helper insn_func() that checks the sym->type to retain the old functionality. This then prepares the way to add code that depends on the distinction between STT_NOTYPE and no symbol at all.
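The three states the rework distinguishes, as a sketch (the enum and helper are illustrative; insn->sym, insn_func() and STT_FUNC are from the patch):

	enum sym_state { NO_SYM, SYM_NOTYPE, SYM_FUNC };

	static enum sym_state insn_sym_state(struct instruction *insn)
	{
		if (!insn->sym)
			return NO_SYM;		/* e.g. a hole left by a __weak symbol */
		if (insn->sym->type == STT_FUNC)
			return SYM_FUNC;	/* insn_func() returns the symbol */
		return SYM_NOTYPE;		/* bare label: insn_func() returns NULL */
	}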
Signed-off-by: Peter Zijlstra (Intel) --- tools/objtool/check.c | 105 ++++++++++++++++++---------------- tools/objtool/include/objtool/check.h | 12 +++- 2 files changed, 66 insertions(+), 51 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3f46f46ab85c..e532efb9b5ab 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -62,12 +62,12 @@ static struct instruction *next_insn_same_func(struct objtool_file *file, struct instruction *insn) { struct instruction *next = list_next_entry(insn, list); - struct symbol *func = insn->func; + struct symbol *func = insn_func(insn); if (!func) return NULL; - if (&next->list != &file->insn_list && next->func == func) + if (&next->list != &file->insn_list && insn_func(next) == func) return next; /* Check if we're already in the subfunction: */ @@ -83,7 +83,7 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, { struct instruction *prev = list_prev_entry(insn, list); - if (&prev->list != &file->insn_list && prev->func == insn->func) + if (&prev->list != &file->insn_list && insn_func(prev) == insn_func(insn)) return prev; return NULL; @@ -133,7 +133,7 @@ static bool is_sibling_call(struct instruction *insn) * sibling call detection consistency between vmlinux.o and individual * objects. */ - if (!insn->func) + if (!insn_func(insn)) return false; /* An indirect jump is either a sibling call or a jump to a table. */ @@ -207,7 +207,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, return false; insn = find_insn(file, func->sec, func->offset); - if (!insn->func) + if (!insn_func(insn)) return false; func_for_each_insn(file, func, insn) { @@ -243,7 +243,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, return false; } - return __dead_end_function(file, dest->func, recursion+1); + return __dead_end_function(file, insn_func(dest), recursion+1); } } @@ -427,7 +427,10 @@ static int decode_instructions(struct objtool_file *file) } list_for_each_entry(func, &sec->symbol_list, list) { - if (func->type != STT_FUNC || func->alias != func) + if (func->type != STT_NOTYPE && func->type != STT_FUNC) + continue; + + if (func->return_thunk || func->alias != func) continue; if (!find_insn(file, sec, func->offset)) { @@ -437,9 +440,11 @@ static int decode_instructions(struct objtool_file *file) } sym_for_each_insn(file, func, insn) { - insn->func = func; - if (insn->type == INSN_ENDBR && list_empty(&insn->call_node)) { - if (insn->offset == insn->func->offset) { + insn->sym = func; + if (func->type == STT_FUNC && + insn->type == INSN_ENDBR && + list_empty(&insn->call_node)) { + if (insn->offset == func->offset) { list_add_tail(&insn->call_node, &file->endbr_list); file->nr_endbr++; } else { @@ -1397,19 +1402,19 @@ static void add_return_call(struct objtool_file *file, struct instruction *insn, static bool same_function(struct instruction *insn1, struct instruction *insn2) { - return insn1->func->pfunc == insn2->func->pfunc; + return insn_func(insn1)->pfunc == insn_func(insn2)->pfunc; } static bool is_first_func_insn(struct objtool_file *file, struct instruction *insn) { - if (insn->offset == insn->func->offset) + if (insn->offset == insn_func(insn)->offset) return true; if (opts.ibt) { struct instruction *prev = prev_insn_same_sym(file, insn); if (prev && prev->type == INSN_ENDBR && - insn->offset == insn->func->offset + prev->len) + insn->offset == insn_func(insn)->offset + prev->len) return true; } @@ -1450,7 +1455,7 @@ static int 
add_jump_destinations(struct objtool_file *file) } else if (reloc->sym->return_thunk) { add_return_call(file, insn, true); continue; - } else if (insn->func) { + } else if (insn_func(insn)) { /* * External sibling call or internal sibling call with * STT_FUNC reloc. @@ -1492,8 +1497,8 @@ static int add_jump_destinations(struct objtool_file *file) /* * Cross-function jump. */ - if (insn->func && jump_dest->func && - insn->func != jump_dest->func) { + if (insn_func(insn) && insn_func(jump_dest) && + insn_func(insn) != insn_func(jump_dest)) { /* * For GCC 8+, create parent/child links for any cold @@ -1510,10 +1515,10 @@ static int add_jump_destinations(struct objtool_file *file) * case where the parent function's only reference to a * subfunction is through a jump table. */ - if (!strstr(insn->func->name, ".cold") && - strstr(jump_dest->func->name, ".cold")) { - insn->func->cfunc = jump_dest->func; - jump_dest->func->pfunc = insn->func; + if (!strstr(insn_func(insn)->name, ".cold") && + strstr(insn_func(jump_dest)->name, ".cold")) { + insn_func(insn)->cfunc = insn_func(jump_dest); + insn_func(jump_dest)->pfunc = insn_func(insn); } else if (!same_function(insn, jump_dest) && is_first_func_insn(file, jump_dest)) { @@ -1521,7 +1526,7 @@ static int add_jump_destinations(struct objtool_file *file) * Internal sibling call without reloc or with * STT_SECTION reloc. */ - add_call_dest(file, insn, jump_dest->func, true); + add_call_dest(file, insn, insn_func(jump_dest), true); continue; } } @@ -1572,7 +1577,7 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - if (insn->func && insn->call_dest->type != STT_FUNC) { + if (insn_func(insn) && insn->call_dest->type != STT_FUNC) { WARN_FUNC("unsupported call to non-function", insn->sec, insn->offset); return -1; @@ -1668,7 +1673,7 @@ static int handle_group_alt(struct objtool_file *file, nop->offset = special_alt->new_off + special_alt->new_len; nop->len = special_alt->orig_len - special_alt->new_len; nop->type = INSN_NOP; - nop->func = orig_insn->func; + nop->sym = orig_insn->sym; nop->alt_group = new_alt_group; nop->ignore = orig_insn->ignore_alts; } @@ -1688,7 +1693,7 @@ static int handle_group_alt(struct objtool_file *file, last_new_insn = insn; insn->ignore = orig_insn->ignore_alts; - insn->func = orig_insn->func; + insn->sym = orig_insn->sym; insn->alt_group = new_alt_group; /* @@ -1882,7 +1887,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, struct reloc *reloc = table; struct instruction *dest_insn; struct alternative *alt; - struct symbol *pfunc = insn->func->pfunc; + struct symbol *pfunc = insn_func(insn)->pfunc; unsigned int prev_offset = 0; /* @@ -1909,7 +1914,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, break; /* Make sure the destination is in the same function: */ - if (!dest_insn->func || dest_insn->func->pfunc != pfunc) + if (!insn_func(dest_insn) || insn_func(dest_insn)->pfunc != pfunc) break; alt = malloc(sizeof(*alt)); @@ -1949,7 +1954,7 @@ static struct reloc *find_jump_table(struct objtool_file *file, * it. 
*/ for (; - insn && insn->func && insn->func->pfunc == func; + insn && insn_func(insn) && insn_func(insn)->pfunc == func; insn = insn->first_jump_src ?: prev_insn_same_sym(file, insn)) { if (insn != orig_insn && insn->type == INSN_JUMP_DYNAMIC) @@ -1966,7 +1971,7 @@ static struct reloc *find_jump_table(struct objtool_file *file, if (!table_reloc) continue; dest_insn = find_insn(file, table_reloc->sym->sec, table_reloc->addend); - if (!dest_insn || !dest_insn->func || dest_insn->func->pfunc != func) + if (!dest_insn || !insn_func(dest_insn) || insn_func(dest_insn)->pfunc != func) continue; return table_reloc; @@ -2415,6 +2420,13 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be before add_{jump_call}_destination. + */ + ret = classify_symbols(file); + if (ret) + return ret; + ret = decode_instructions(file); if (ret) return ret; @@ -2433,13 +2445,6 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - /* - * Must be before add_{jump_call}_destination. - */ - ret = classify_symbols(file); - if (ret) - return ret; - /* * Must be before add_jump_destinations(), which depends on 'func' * being set for alternatives, to enable proper sibling call detection. @@ -2648,7 +2653,7 @@ static int update_cfi_state(struct instruction *insn, /* stack operations don't make sense with an undefined CFA */ if (cfa->base == CFI_UNDEFINED) { - if (insn->func) { + if (insn_func(insn)) { WARN_FUNC("undefined stack state", insn->sec, insn->offset); return -1; } @@ -2994,7 +2999,7 @@ static int update_cfi_state(struct instruction *insn, } /* detect when asm code uses rbp as a scratch register */ - if (opts.stackval && insn->func && op->src.reg == CFI_BP && + if (opts.stackval && insn_func(insn) && op->src.reg == CFI_BP && cfa->base != CFI_BP) cfi->bp_scratch = true; break; @@ -3390,13 +3395,13 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, while (1) { next_insn = next_insn_to_validate(file, insn); - if (func && insn->func && func != insn->func->pfunc) { + if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6)) return 0; WARN("%s() falls through to next function %s()", - func->name, insn->func->name); + func->name, insn_func(insn)->name); return 1; } @@ -3638,7 +3643,7 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec) while (&insn->list != &file->insn_list && (!sec || insn->sec == sec)) { if (insn->hint && !insn->visited && !insn->ignore) { - ret = validate_branch(file, insn->func, insn, state); + ret = validate_branch(file, insn_func(insn), insn, state); if (ret && opts.backtrace) BT_FUNC("<=== (hint)", insn); warnings += ret; @@ -3861,7 +3866,7 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio * In this case we'll find a piece of code (whole function) that is not * covered by a !section symbol. Ignore them. */ - if (opts.link && !insn->func) { + if (opts.link && !insn_func(insn)) { int size = find_symbol_hole_containing(insn->sec, insn->offset); unsigned long end = insn->offset + size; @@ -3885,10 +3890,10 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio /* * If this hole jumps to a .cold function, mark it ignore too. 
*/ - if (insn->jump_dest && insn->jump_dest->func && - strstr(insn->jump_dest->func->name, ".cold")) { + if (insn->jump_dest && insn_func(insn->jump_dest) && + strstr(insn_func(insn->jump_dest)->name, ".cold")) { struct instruction *dest = insn->jump_dest; - func_for_each_insn(file, dest->func, dest) + func_for_each_insn(file, insn_func(dest), dest) dest->ignore = true; } } @@ -3896,10 +3901,10 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio return false; } - if (!insn->func) + if (!insn_func(insn)) return false; - if (insn->func->static_call_tramp) + if (insn_func(insn)->static_call_tramp) return true; /* @@ -3930,7 +3935,7 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio if (insn->type == INSN_JUMP_UNCONDITIONAL) { if (insn->jump_dest && - insn->jump_dest->func == insn->func) { + insn_func(insn->jump_dest) == insn_func(insn)) { insn = insn->jump_dest; continue; } @@ -3938,7 +3943,7 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio break; } - if (insn->offset + insn->len >= insn->func->offset + insn->func->len) + if (insn->offset + insn->len >= insn_func(insn)->offset + insn_func(insn)->len) break; insn = list_next_entry(insn, list); @@ -3967,7 +3972,7 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, state->uaccess = sym->uaccess_safe; - ret = validate_branch(file, insn->func, insn, *state); + ret = validate_branch(file, insn_func(insn), insn, *state); if (ret && opts.backtrace) BT_FUNC("<=== (sym)", insn); return ret; @@ -4104,7 +4109,7 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn continue; } - if (dest->func && dest->func == insn->func) { + if (insn_func(dest) && insn_func(dest) == insn_func(insn)) { /* * Anything from->to self is either _THIS_IP_ or * IRET-to-self. diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 036129cebeee..acd7fae59348 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -67,11 +67,21 @@ struct instruction { struct reloc *jump_table; struct reloc *reloc; struct list_head alts; - struct symbol *func; + struct symbol *sym; struct list_head stack_ops; struct cfi_state *cfi; }; +static inline struct symbol *insn_func(struct instruction *insn) +{ + struct symbol *sym = insn->sym; + + if (sym && sym->type != STT_FUNC) + sym = NULL; + + return sym; +} + #define VISITED_BRANCH 0x01 #define VISITED_BRANCH_UACCESS 0x02 #define VISITED_BRANCH_MASK 0x03 -- cgit v1.2.3 From 5a9c361a416fc3a3301e859ff09587cc1b933eb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jul 2022 11:49:50 +0200 Subject: objtool: Allow STT_NOTYPE -> STT_FUNC+0 sibling-calls Teach objtool about STT_NOTYPE -> STT_FUNC+0 sibling calls. Doing so allows slightly simpler .S files. There is a slight complication in that we specifically do not want to allow sibling calls from symbol holes (previously covered by STT_WEAK symbols) -- such things exist where a weak function has a .cold subfunction for example. Additionally, STT_NOTYPE tail-calls are allowed to happen with a modified stack frame; they don't need to obey the normal rules after all.
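For illustration, an editor's sketch (hypothetical symbols, not objtool source) of how a tail-call jump is classified after this change:

	func_a:	jmp func_b	# func_b is STT_FUNC, targeted at offset +0 -> sibling call
	func_a:	jmp func_a	# same symbol, i.e. a recursive tail-call   -> not a sibling call
	func_a:	jmp func_b+8	# not the first instruction of the target   -> plain jump
	func_a:	jmp notype_sym	# target symbol is STT_NOTYPE               -> not a sibling call

A jump from or into a symbol hole (an address not covered by any symbol) is never treated as a sibling call.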
Signed-off-by: Peter Zijlstra (Intel) --- tools/objtool/check.c | 74 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e532efb9b5ab..7936312e10c7 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -129,16 +129,13 @@ static bool is_jump_table_jump(struct instruction *insn) static bool is_sibling_call(struct instruction *insn) { /* - * Assume only ELF functions can make sibling calls. This ensures - * sibling call detection consistency between vmlinux.o and individual - * objects. + * Assume only STT_FUNC calls have jump-tables. */ - if (!insn_func(insn)) - return false; - - /* An indirect jump is either a sibling call or a jump to a table. */ - if (insn->type == INSN_JUMP_DYNAMIC) - return !is_jump_table_jump(insn); + if (insn_func(insn)) { + /* An indirect jump is either a sibling call or a jump to a table. */ + if (insn->type == INSN_JUMP_DYNAMIC) + return !is_jump_table_jump(insn); + } /* add_jump_destinations() sets insn->call_dest for sibling calls. */ return (is_static_jump(insn) && insn->call_dest); @@ -1400,27 +1397,50 @@ static void add_return_call(struct objtool_file *file, struct instruction *insn, list_add_tail(&insn->call_node, &file->return_thunk_list); } -static bool same_function(struct instruction *insn1, struct instruction *insn2) -{ - return insn_func(insn1)->pfunc == insn_func(insn2)->pfunc; -} - -static bool is_first_func_insn(struct objtool_file *file, struct instruction *insn) +static bool is_first_func_insn(struct objtool_file *file, + struct instruction *insn, struct symbol *sym) { - if (insn->offset == insn_func(insn)->offset) + if (insn->offset == sym->offset) return true; + /* Allow direct CALL/JMP past ENDBR */ if (opts.ibt) { struct instruction *prev = prev_insn_same_sym(file, insn); if (prev && prev->type == INSN_ENDBR && - insn->offset == insn_func(insn)->offset + prev->len) + insn->offset == sym->offset + prev->len) return true; } return false; } +/* + * A sibling call is a tail-call to another symbol -- to differentiate from a + * recursive tail-call which is to the same symbol. + */ +static bool jump_is_sibling_call(struct objtool_file *file, + struct instruction *from, struct instruction *to) +{ + struct symbol *fs = from->sym; + struct symbol *ts = to->sym; + + /* Not a sibling call if from/to a symbol hole */ + if (!fs || !ts) + return false; + + /* Not a sibling call if not targeting the start of a symbol. */ + if (!is_first_func_insn(file, to, ts)) + return false; + + /* Disallow sibling calls into STT_NOTYPE */ + if (ts->type == STT_NOTYPE) + return false; + + /* Must not be self to be a sibling */ + return fs->pfunc != ts->pfunc; +} + /* * Find the destination instructions for all jumps. */ @@ -1519,18 +1539,18 @@ static int add_jump_destinations(struct objtool_file *file) strstr(insn_func(jump_dest)->name, ".cold")) { insn_func(insn)->cfunc = insn_func(jump_dest); insn_func(jump_dest)->pfunc = insn_func(insn); - - } else if (!same_function(insn, jump_dest) && - is_first_func_insn(file, jump_dest)) { - /* - * Internal sibling call without reloc or with - * STT_SECTION reloc. - */ - add_call_dest(file, insn, insn_func(jump_dest), true); - continue; } } + if (jump_is_sibling_call(file, insn, jump_dest)) { + /* + * Internal sibling call without reloc or with + * STT_SECTION reloc. 
+ */ + add_call_dest(file, insn, insn_func(jump_dest), true); + continue; + } + insn->jump_dest = jump_dest; } @@ -3309,7 +3329,7 @@ static int validate_sibling_call(struct objtool_file *file, struct instruction *insn, struct insn_state *state) { - if (has_modified_stack_frame(insn, state)) { + if (insn_func(insn) && has_modified_stack_frame(insn, state)) { WARN_FUNC("sibling call from callable instruction with modified stack frame", insn->sec, insn->offset); return 1; -- cgit v1.2.3 From ef79ed20e3ae9ee9ac2e0f3a4e12814893972e63 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:14 +0200 Subject: x86/entry: Make sync_regs() invocation a tail call No point in having a call there. Spare the call/ret overhead. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.539578813@infradead.org --- arch/x86/entry/entry_64.S | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5c578a7dfcd7..b24b84b3425f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1062,11 +1062,8 @@ SYM_CODE_START_LOCAL(error_entry) UNTRAIN_RET leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ -.Lerror_entry_from_usermode_after_swapgs: - /* Put us onto the real thread stack. */ - call sync_regs - RET + jmp sync_regs /* * There are two places in the kernel that can potentially fault with @@ -1124,7 +1121,7 @@ SYM_CODE_START_LOCAL(error_entry) leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ call fixup_bad_iret mov %rax, %rdi - jmp .Lerror_entry_from_usermode_after_swapgs + jmp sync_regs SYM_CODE_END(error_entry) SYM_CODE_START_LOCAL(error_return) -- cgit v1.2.3 From cb855971d717a2dd752241f66fedad9dc178388c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:16 +0200 Subject: x86/putuser: Provide room for padding Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.746429822@infradead.org --- arch/x86/lib/putuser.S | 62 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index b7dfd60243b7..32125224fcca 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -47,8 +47,6 @@ SYM_FUNC_START(__put_user_1) LOAD_TASK_SIZE_MINUS_N(0) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) - ENDBR ASM_STAC 1: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -56,54 +54,87 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) RET SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) + +SYM_FUNC_START(__put_user_nocheck_1) + ENDBR + ASM_STAC +2: movb %al,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) LOAD_TASK_SIZE_MINUS_N(1) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL) - ENDBR ASM_STAC -2: movw %ax,(%_ASM_CX) +3: movw %ax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) + +SYM_FUNC_START(__put_user_nocheck_2) + ENDBR + ASM_STAC +4: movw %ax,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) LOAD_TASK_SIZE_MINUS_N(3) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL) 
- ENDBR ASM_STAC -3: movl %eax,(%_ASM_CX) +5: movl %eax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) + +SYM_FUNC_START(__put_user_nocheck_4) + ENDBR + ASM_STAC +6: movl %eax,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) LOAD_TASK_SIZE_MINUS_N(7) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL) - ENDBR ASM_STAC -4: mov %_ASM_AX,(%_ASM_CX) +7: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 -5: movl %edx,4(%_ASM_CX) +8: movl %edx,4(%_ASM_CX) #endif xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) + +SYM_FUNC_START(__put_user_nocheck_8) + ENDBR + ASM_STAC +9: mov %_ASM_AX,(%_ASM_CX) +#ifdef CONFIG_X86_32 +10: movl %edx,4(%_ASM_CX) +#endif + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_8) EXPORT_SYMBOL(__put_user_nocheck_8) SYM_CODE_START_LOCAL(.Lbad_put_user_clac) @@ -117,6 +148,11 @@ SYM_CODE_END(.Lbad_put_user_clac) _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac) -#ifdef CONFIG_X86_32 _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(6b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(7b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(9b, .Lbad_put_user_clac) +#ifdef CONFIG_X86_32 + _ASM_EXTABLE_UA(8b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(10b, .Lbad_put_user_clac) #endif -- cgit v1.2.3 From 8f7c0d8b23c3f5f740a48db31ebadef28af17a22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:17 +0200 Subject: x86/Kconfig: Add CONFIG_CALL_THUNKS In preparation for mitigating the Intel SKL RSB underflow issue in software, add a new configuration symbol which allows building the required call thunk infrastructure conditionally. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.849523555@infradead.org --- arch/x86/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f408fa87ed94..e18963e77cb1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2446,6 +2446,14 @@ config CC_HAS_SLS config CC_HAS_RETURN_THUNK def_bool $(cc-option,-mfunction-return=thunk-extern) +config HAVE_CALL_THUNKS + def_bool y + depends on RETHUNK && OBJTOOL + +config CALL_THUNKS + def_bool n + select FUNCTION_ALIGNMENT_16B + menuconfig SPECULATION_MITIGATIONS bool "Mitigations for speculative execution vulnerabilities" default y -- cgit v1.2.3 From bea75b33895f7f87f0c40023e36a2d087e87ffa1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:18 +0200 Subject: x86/Kconfig: Introduce function padding Now that all functions are 16-byte aligned, add 16 bytes of NOP padding in front of each function. This prepares things for software call stack tracking and kCFI/FineIBT. This significantly increases kernel .text size, around 5.1% on an x86_64-defconfig-ish build. However, per the random access argument used for alignment, these 16 extra bytes are code that wouldn't be used. Performance measurements back this up by showing no significant performance regressions.
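As an illustration of the assumed -fpatchable-function-entry=16,16 semantics (an editor's sketch, not verbatim compiler output), all 16 padding bytes are emitted in front of the symbol and none after the entry point:

	.align 16
	.skip 16, 0x90		# NOP padding, later reusable by call thunk patching
func:
	...			# actual function body

A call site can therefore later be repointed at 'func - 16' without moving the function itself.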
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.950884492@infradead.org --- arch/x86/Kconfig | 20 ++++++++++++++++- arch/x86/Makefile | 6 +++++ arch/x86/entry/vdso/Makefile | 3 ++- arch/x86/include/asm/linkage.h | 51 +++++++++++++++++++++++++++++++++++++++--- include/linux/bpf.h | 4 ++++ 5 files changed, 79 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e18963e77cb1..e368fc0daa4a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2446,9 +2446,27 @@ config CC_HAS_SLS config CC_HAS_RETURN_THUNK def_bool $(cc-option,-mfunction-return=thunk-extern) +config CC_HAS_ENTRY_PADDING + def_bool $(cc-option,-fpatchable-function-entry=16,16) + +config FUNCTION_PADDING_CFI + int + default 59 if FUNCTION_ALIGNMENT_64B + default 27 if FUNCTION_ALIGNMENT_32B + default 11 if FUNCTION_ALIGNMENT_16B + default 3 if FUNCTION_ALIGNMENT_8B + default 0 + +# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG +# except Kconfig can't do arithmetic :/ +config FUNCTION_PADDING_BYTES + int + default FUNCTION_PADDING_CFI if CFI_CLANG + default FUNCTION_ALIGNMENT + config HAVE_CALL_THUNKS def_bool y - depends on RETHUNK && OBJTOOL + depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL config CALL_THUNKS def_bool n diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 415a5d138de4..1640e005092b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,6 +208,12 @@ ifdef CONFIG_SLS KBUILD_CFLAGS += -mharden-sls=all endif +ifdef CONFIG_CALL_THUNKS +PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES) +KBUILD_CFLAGS += $(PADDING_CFLAGS) +export PADDING_CFLAGS +endif + KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) ifdef CONFIG_LTO_CLANG diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 3ef611044c8f..838613ac15b8 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -95,7 +95,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO # @@ -158,6 +158,7 @@ KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(PADDING_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index c2d6e2733b11..45e0df850645 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -15,8 +15,19 @@ #define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) -#define ASM_FUNC_ALIGN __ALIGN_STR -#define __FUNC_ALIGN __ALIGN +#if defined(CONFIG_CALL_THUNKS) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90; +#else +#define FUNCTION_PADDING +#endif + +#if 
(CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING +#else +# define __FUNC_ALIGN __ALIGN +#endif + +#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN) #define SYM_F_ALIGN __FUNC_ALIGN #ifdef __ASSEMBLY__ @@ -45,11 +56,45 @@ #endif /* __ASSEMBLY__ */ +/* + * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_THUNKS) the + * CFI symbol layout changes. + * + * Without CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .skip FUNCTION_PADDING, 0x90 + * .byte 0xb8 + * .long __kcfi_typeid_##name + * name: + * + * With CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .byte 0xb8 + * .long __kcfi_typeid_##name + * .skip FUNCTION_PADDING, 0x90 + * name: + * + * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized. + */ + +#ifdef CONFIG_CALL_THUNKS +#define CFI_PRE_PADDING +#define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#else +#define CFI_PRE_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#define CFI_POST_PADDING +#endif + #define __CFI_TYPE(name) \ SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \ - .fill 11, 1, 0x90 ASM_NL \ + CFI_PRE_PADDING \ .byte 0xb8 ASM_NL \ .long __kcfi_typeid_##name ASM_NL \ + CFI_POST_PADDING \ SYM_FUNC_END(__cfi_##name) /* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032..5296aea9b5b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -984,7 +984,11 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } #ifdef CONFIG_X86_64 +#ifdef CONFIG_CALL_THUNKS +#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5+CONFIG_FUNCTION_PADDING_BYTES,CONFIG_FUNCTION_PADDING_BYTES))) +#else #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) +#endif #else #define BPF_DISPATCHER_ATTRIBUTES #endif -- cgit v1.2.3 From 80e4c1cd42fff110bfdae8fce7ac4f22465f9664 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:19 +0200 Subject: x86/retbleed: Add X86_FEATURE_CALL_DEPTH Intel SKL CPUs fall back to other predictors when the RSB underflows. The only microcode mitigation is IBRS which is insanely expensive. It comes with performance drops of up to 30% depending on the workload. A way less expensive, but nevertheless horrible mitigation is to track the call depth in software and overeagerly fill the RSB when returns underflow the software counter. Provide a configuration symbol and a CPU misfeature bit. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.056176424@infradead.org --- arch/x86/Kconfig | 19 +++++++++++++++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h | 9 ++++++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e368fc0daa4a..6ae7fa4b8eb7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2523,6 +2523,25 @@ config CPU_UNRET_ENTRY help Compile the kernel with support for the retbleed=unret mitigation. +config CALL_DEPTH_TRACKING + bool "Mitigate RSB underflow with call depth tracking" + depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS + select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + select CALL_THUNKS + default y + help + Compile the kernel with call depth tracking to mitigate the Intel + SKL Return-Speculation-Buffer (RSB) underflow issue.
The + mitigation is off by default and needs to be enabled on the + kernel command line via the retbleed=stuff option. For + non-affected systems the overhead of this option is marginal as + the call depth tracking is using run-time generated call thunks + in a compiler generated padding area and call patching. This + increases text size by ~5%. For non affected systems this space + is unused. On affected SKL systems this results in a significant + performance gain over the IBRS mitigation. + + config CPU_IBPB_ENTRY bool "Enable IBPB on kernel entry" depends on CPU_SUP_AMD && X86_64 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b71f4f2ecdd5..aefd0816a333 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -304,6 +304,7 @@ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_CALL_DEPTH (11*32+18) /* "" Call depth tracking for RSB stuffing */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 33d2cd04d254..bbb03b25263e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -69,6 +69,12 @@ # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif +#ifdef CONFIG_CALL_DEPTH_TRACKING +# define DISABLE_CALL_DEPTH_TRACKING 0 +#else +# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -101,7 +107,8 @@ #define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ + DISABLE_CALL_DEPTH_TRACKING) #define DISABLED_MASK12 0 #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 -- cgit v1.2.3 From fe54d0793796ccdb213d8ea7bff0b49903b6afaa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:20 +0200 Subject: x86/alternatives: Provide text_poke_copy_locked() The upcoming call thunk patching must hold text_mutex and needs access to text_poke_copy(), which takes text_mutex. Provide a _locked postfixed variant to expose the inner workings. 
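A hedged usage sketch of the new interface (this mirrors how the call thunk patching later in this series is expected to use it; 'pad', 'thunk_template' and 'tsize' are illustrative names):

	mutex_lock(&text_mutex);
	/* core_ok == true permits patching core kernel text */
	text_poke_copy_locked(pad, thunk_template, tsize, true);
	mutex_unlock(&text_mutex);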
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.159977224@infradead.org --- arch/x86/include/asm/text-patching.h | 1 + arch/x86/kernel/alternative.c | 37 +++++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 1cc15528ce29..f4b87f08f5c5 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5cadcea035e0..fad3c0e4838e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1236,27 +1236,15 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) return __text_poke(text_poke_memcpy, addr, opcode, len); } -/** - * text_poke_copy - Copy instructions into (an unused part of) RX memory - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy, could be more than 2x PAGE_SIZE - * - * Not safe against concurrent execution; useful for JITs to dump - * new code blocks into unused regions of RX memory. Can be used in - * conjunction with synchronize_rcu_tasks() to wait for existing - * execution to quiesce after having made sure no existing functions - * pointers are live. - */ -void *text_poke_copy(void *addr, const void *opcode, size_t len) +void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, + bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; - if (WARN_ON_ONCE(core_kernel_text(start))) + if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; - mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; @@ -1266,6 +1254,25 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } + return addr; +} + +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. 
+ */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + mutex_lock(&text_mutex); + addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } -- cgit v1.2.3 From c22cf380c79c4bb0e502b0343f57271b17626424 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:21 +0200 Subject: x86/entry: Make some entry symbols global paranoid_entry(), error_entry() and xen_error_entry() have to be exempted from call accounting by thunk patching because they are before UNTRAIN_RET. Expose them so they are available in the alternative code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.265598113@infradead.org --- arch/x86/entry/entry_64.S | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b24b84b3425f..4cc0125fdfdc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -327,7 +327,8 @@ SYM_CODE_END(ret_from_fork) #endif .endm -SYM_CODE_START_LOCAL(xen_error_entry) +SYM_CODE_START(xen_error_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 @@ -906,7 +907,8 @@ SYM_CODE_END(xen_failsafe_callback) * R14 - old CR3 * R15 - old SPEC_CTRL */ -SYM_CODE_START_LOCAL(paranoid_entry) +SYM_CODE_START(paranoid_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 @@ -1041,7 +1043,8 @@ SYM_CODE_END(paranoid_exit) /* * Switch GS and CR3 if needed. */ -SYM_CODE_START_LOCAL(error_entry) +SYM_CODE_START(error_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 -- cgit v1.2.3 From 239f2e248ef12840178a3ed1a217f19b5fbfde26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:22 +0200 Subject: x86/paravirt: Make struct paravirt_call_site unconditionally available For the upcoming call thunk patching it's less ifdeffery when the data structure is unconditionally available. The code can then be trivially fenced off with IS_ENABLED(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.367853167@infradead.org --- arch/x86/include/asm/paravirt.h | 4 ++-- arch/x86/include/asm/paravirt_types.h | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1be66c15ecbd..2851bc2339d5 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -4,13 +4,13 @@ /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ +#include + #ifdef CONFIG_PARAVIRT #include #include #include -#include - #ifndef __ASSEMBLY__ #include #include diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index f3d601574730..e137d9412123 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,6 +2,17 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H +#ifndef __ASSEMBLY__ +/* These all sit in the .parainstructions section to tell us what to patch. */ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 type; /* type of this instruction */ + u8 len; /* length of original instruction */ +}; +#endif + +#ifdef CONFIG_PARAVIRT + /* Bitmask of what can be clobbered: usually at least eax. 
*/ #define CLBR_EAX (1 << 0) #define CLBR_ECX (1 << 1) @@ -593,16 +604,9 @@ unsigned long paravirt_ret0(void); #define paravirt_nop ((void *)_paravirt_nop) -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -}; - extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; #endif /* __ASSEMBLY__ */ - +#endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ -- cgit v1.2.3 From e81dc127ef69887c72735a3e3868930e2bf313ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:23 +0200 Subject: x86/callthunks: Add call patching for call depth tracking Mitigating the Intel SKL RSB underflow issue in software requires tracking the call depth. That is, every CALL and every RET needs to be intercepted and additional code injected. The existing retbleed mitigations already include means of redirecting RET to __x86_return_thunk; this can be re-purposed and RET can be redirected to another function doing RET accounting. CALL accounting will use the function padding introduced in prior patches. For each CALL instruction, the destination symbol's padding is rewritten to do the accounting and the CALL instruction is adjusted to call into the padding. This ensures only affected CPUs pay the overhead of this accounting. Unaffected CPUs will leave the padding unused and have their 'JMP __x86_return_thunk' replaced with an actual 'RET' instruction. Objtool has been modified to supply a .call_sites section that lists all the 'CALL' instructions. Additionally, the paravirt instruction sites are iterated since they will have been patched from an indirect call to direct calls (or direct instructions in which case it'll be ignored). Module handling and the actual thunk code for SKL will be added in subsequent steps. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.470877038@infradead.org --- arch/x86/Kconfig | 12 ++ arch/x86/include/asm/alternative.h | 12 ++ arch/x86/kernel/Makefile | 2 + arch/x86/kernel/alternative.c | 6 + arch/x86/kernel/callthunks.c | 251 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/head_64.S | 1 + arch/x86/kernel/relocate_kernel_64.S | 5 +- arch/x86/kernel/vmlinux.lds.S | 8 -- 8 files changed, 287 insertions(+), 10 deletions(-) create mode 100644 arch/x86/kernel/callthunks.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6ae7fa4b8eb7..a1dae9d5e3da 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2541,6 +2541,18 @@ config CALL_DEPTH_TRACKING is unused. On affected SKL systems this results in a significant performance gain over the IBRS mitigation. +config CALL_THUNKS_DEBUG + bool "Enable call thunks and call depth tracking debugging" + depends on CALL_DEPTH_TRACKING + select FUNCTION_ALIGNMENT_32B + default n + help + Enable call/ret counters for imbalance detection and build in + a noisy dmesg about callthunks generation and call patching for + troubleshooting. The debug prints need to be enabled on the + kernel command line with 'debug-callthunks'. + Only enable this when you are debugging call thunks as this + creates a noticeable runtime overhead. If unsure say N.
config CPU_IBPB_ENTRY bool "Enable IBPB on kernel entry" diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9542c582d546..6b7bbd0db248 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -80,6 +80,18 @@ extern void apply_returns(s32 *start, s32 *end); extern void apply_ibt_endbr(s32 *start, s32 *end); struct module; +struct paravirt_patch_site; + +struct callthunk_sites { + s32 *call_start, *call_end; + struct paravirt_patch_site *pv_start, *pv_end; +}; + +#ifdef CONFIG_CALL_THUNKS +extern void callthunks_patch_builtin_calls(void); +#else +static __always_inline void callthunks_patch_builtin_calls(void) {} +#endif #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f901658d9f7c..c2739a5886fa 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CALL_THUNKS) += callthunks.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index fad3c0e4838e..963872d17707 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -947,6 +947,12 @@ void __init alternative_instructions(void) */ apply_alternatives(__alt_instructions, __alt_instructions_end); + /* + * Now all calls are established. Apply the call thunks if + * required. + */ + callthunks_patch_builtin_calls(); + apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c new file mode 100644 index 000000000000..e5275d6e674d --- /dev/null +++ b/arch/x86/kernel/callthunks.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "callthunks: " fmt + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int __initdata_or_module debug_callthunks; + +#define prdbg(fmt, args...) 
\ +do { \ + if (debug_callthunks) \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ +} while(0) + +static int __init debug_thunks(char *str) +{ + debug_callthunks = 1; + return 1; +} +__setup("debug-callthunks", debug_thunks); + +extern s32 __call_sites[], __call_sites_end[]; + +struct thunk_desc { + void *template; + unsigned int template_size; +}; + +struct core_text { + unsigned long base; + unsigned long end; + const char *name; +}; + +static bool thunks_initialized __ro_after_init; + +static const struct core_text builtin_coretext = { + .base = (unsigned long)_text, + .end = (unsigned long)_etext, + .name = "builtin", +}; + +static struct thunk_desc callthunk_desc __ro_after_init; + +extern void error_entry(void); +extern void xen_error_entry(void); +extern void paranoid_entry(void); + +static inline bool within_coretext(const struct core_text *ct, void *addr) +{ + unsigned long p = (unsigned long)addr; + + return ct->base <= p && p < ct->end; +} + +static inline bool within_module_coretext(void *addr) +{ + bool ret = false; + +#ifdef CONFIG_MODULES + struct module *mod; + + preempt_disable(); + mod = __module_address((unsigned long)addr); + if (mod && within_module_core((unsigned long)addr, mod)) + ret = true; + preempt_enable(); +#endif + return ret; +} + +static bool is_coretext(const struct core_text *ct, void *addr) +{ + if (ct && within_coretext(ct, addr)) + return true; + if (within_coretext(&builtin_coretext, addr)) + return true; + return within_module_coretext(addr); +} + +static __init_or_module bool skip_addr(void *dest) +{ + if (dest == error_entry) + return true; + if (dest == paranoid_entry) + return true; + if (dest == xen_error_entry) + return true; + /* Does FILL_RSB... */ + if (dest == __switch_to_asm) + return true; + /* Accounts directly */ + if (dest == ret_from_fork) + return true; +#ifdef CONFIG_HOTPLUG_CPU + if (dest == start_cpu0) + return true; +#endif +#ifdef CONFIG_FUNCTION_TRACER + if (dest == __fentry__) + return true; +#endif +#ifdef CONFIG_KEXEC_CORE + if (dest >= (void *)relocate_kernel && + dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) + return true; +#endif +#ifdef CONFIG_XEN + if (dest >= (void *)hypercall_page && + dest < (void*)hypercall_page + PAGE_SIZE) + return true; +#endif + return false; +} + +static __init_or_module void *call_get_dest(void *addr) +{ + struct insn insn; + void *dest; + int ret; + + ret = insn_decode_kernel(&insn, addr); + if (ret) + return ERR_PTR(ret); + + /* Patched out call? */ + if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) + return NULL; + + dest = addr + insn.length + insn.immediate.value; + if (skip_addr(dest)) + return NULL; + return dest; +} + +static const u8 nops[] = { + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, +}; + +static __init_or_module void *patch_dest(void *dest, bool direct) +{ + unsigned int tsize = callthunk_desc.template_size; + u8 *pad = dest - tsize; + + /* Already patched? 
*/ + if (!bcmp(pad, callthunk_desc.template, tsize)) + return pad; + + /* Ensure there are nops */ + if (bcmp(pad, nops, tsize)) { + pr_warn_once("Invalid padding area for %pS\n", dest); + return NULL; + } + + if (direct) + memcpy(pad, callthunk_desc.template, tsize); + else + text_poke_copy_locked(pad, callthunk_desc.template, tsize, true); + return pad; +} + +static __init_or_module void patch_call(void *addr, const struct core_text *ct) +{ + void *pad, *dest; + u8 bytes[8]; + + if (!within_coretext(ct, addr)) + return; + + dest = call_get_dest(addr); + if (!dest || WARN_ON_ONCE(IS_ERR(dest))) + return; + + if (!is_coretext(ct, dest)) + return; + + pad = patch_dest(dest, within_coretext(ct, dest)); + if (!pad) + return; + + prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr, + dest, dest, pad); + __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE); + text_poke_early(addr, bytes, CALL_INSN_SIZE); +} + +static __init_or_module void +patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) +{ + s32 *s; + + for (s = start; s < end; s++) + patch_call((void *)s + *s, ct); +} + +static __init_or_module void +patch_paravirt_call_sites(struct paravirt_patch_site *start, + struct paravirt_patch_site *end, + const struct core_text *ct) +{ + struct paravirt_patch_site *p; + + for (p = start; p < end; p++) + patch_call(p->instr, ct); +} + +static __init_or_module void +callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) +{ + prdbg("Patching call sites %s\n", ct->name); + patch_call_sites(cs->call_start, cs->call_end, ct); + patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct); + prdbg("Patching call sites done%s\n", ct->name); +} + +void __init callthunks_patch_builtin_calls(void) +{ + struct callthunk_sites cs = { + .call_start = __call_sites, + .call_end = __call_sites_end, + .pv_start = __parainstructions, + .pv_end = __parainstructions_end + }; + + if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return; + + pr_info("Setting up call depth tracking\n"); + mutex_lock(&text_mutex); + callthunks_setup(&cs, &builtin_coretext); + thunks_initialized = true; + mutex_unlock(&text_mutex); +} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d860d437631b..222efd4a09bc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -370,6 +370,7 @@ SYM_CODE_END(secondary_startup_64) * start_secondary() via .Ljump_to_C_code. */ SYM_CODE_START(start_cpu0) + ANNOTATE_NOENDBR UNWIND_HINT_EMPTY movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4809c0dc4eb0..4a73351f87f8 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -41,6 +41,7 @@ .text .align PAGE_SIZE .code64 +SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR @@ -312,5 +313,5 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) int3 SYM_CODE_END(swap_pages) - .globl kexec_control_code_size -.set kexec_control_code_size, . - relocate_kernel + .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc +SYM_CODE_END(relocate_range); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b69df9e013cc..49f3f86433c7 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -501,11 +501,3 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_KEXEC_CORE -#include - -. 
= ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big"); #endif - -- cgit v1.2.3 From eaf44c816ed8d1ef94c354e3ed47d53cd5a5cb13 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:24 +0200 Subject: x86/modules: Add call patching As with the builtins, create call thunks and patch the call sites to call the thunk on Intel SKL CPUs for retbleed mitigation. Note that module init functions are ignored for the sake of simplicity because loading modules is not something which is done in high-frequency loops and the attacker does not really have a handle on when this happens in order to launch a matching attack. The depth tracking will still work for calls into the builtins and because the call is not accounted it will underflow faster and overstuff, but that's mitigated by the saturating counter and the side effect is only temporary. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.575673066@infradead.org --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/callthunks.c | 19 +++++++++++++++++++ arch/x86/kernel/module.c | 20 +++++++++++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 6b7bbd0db248..ef007fa33dc4 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -89,8 +89,13 @@ struct callthunk_sites { #ifdef CONFIG_CALL_THUNKS extern void callthunks_patch_builtin_calls(void); +extern void callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod); #else static __always_inline void callthunks_patch_builtin_calls(void) {} +static __always_inline void +callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod) {} #endif #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index e5275d6e674d..7b9d998ebd7d 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -249,3 +249,22 @@ void __init callthunks_patch_builtin_calls(void) thunks_initialized = true; mutex_unlock(&text_mutex); } + +#ifdef CONFIG_MODULES +void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, + struct module *mod) +{ + struct core_text ct = { + .base = (unsigned long)mod->core_layout.base, + .end = (unsigned long)mod->core_layout.base + mod->core_layout.size, + .name = mod->name, + }; + + if (!thunks_initialized) + return; + + mutex_lock(&text_mutex); + callthunks_setup(cs, &ct); + mutex_unlock(&text_mutex); +} +#endif /* CONFIG_MODULES */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 43f011277219..2fb9de2cef40 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -254,7 +254,8 @@ int module_finalize(const Elf_Ehdr *hdr, { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, - *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, + *calls = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -274,6 +275,8 @@ int module_finalize(const Elf_Ehdr *hdr, retpolines = s; if (!strcmp(".return_sites", secstrings + s->sh_name)) returns = s; + if (!strcmp(".call_sites", secstrings + s->sh_name)) + calls = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) ibt_endbr = s; } @@ -299,6 +302,21 @@ int
module_finalize(const Elf_Ehdr *hdr, void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } + if (calls || para) { + struct callthunk_sites cs = {}; + + if (calls) { + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; + } + + if (para) { + cs.pv_start = (void *)para->sh_addr; + cs.pv_end = (void *)para->sh_addr + para->sh_size; + } + + callthunks_patch_module_calls(&cs, me); + } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); -- cgit v1.2.3 From 770ae1b709528a6a173b5c7b183818ee9b45e376 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:25 +0200 Subject: x86/returnthunk: Allow different return thunks In preparation for call depth tracking on Intel SKL CPUs, make it possible to patch in a SKL specific return thunk. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.680469665@infradead.org --- arch/x86/include/asm/nospec-branch.h | 6 ++++++ arch/x86/kernel/alternative.c | 17 +++++++++++++---- arch/x86/kernel/ftrace.c | 2 +- arch/x86/kernel/static_call.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c936ce9f0c47..f10ca334dd75 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -208,6 +208,12 @@ extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_CALL_THUNKS +extern void (*x86_return_thunk)(void); +#else +#define x86_return_thunk (&__x86_return_thunk) +#endif + #ifdef CONFIG_RETPOLINE #define GEN(reg) \ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 963872d17707..04d1e3d35b0e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -518,6 +518,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } #ifdef CONFIG_RETHUNK + +#ifdef CONFIG_CALL_THUNKS +void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +#endif + /* * Rewrite the compiler generated return thunk tail-calls. 
* @@ -533,14 +538,18 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - return -1; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (x86_return_thunk == __x86_return_thunk) + return -1; - bytes[i++] = RET_INSN_OPCODE; + i = JMP32_INSN_SIZE; + __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); + } else { + bytes[i++] = RET_INSN_OPCODE; + } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; - return i; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 00eac455a3a1..4ac6692d5ef8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -359,7 +359,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE); + __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index aaaba85d6d7f..5d3844a98373 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -52,7 +52,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, case RET: if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk); + code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; break; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99620428ad78..0df391ecd4d8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -432,7 +432,7 @@ static void emit_return(u8 **pprog, u8 *ip) u8 *prog = *pprog; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { - emit_jump(&prog, &__x86_return_thunk, ip); + emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ if (IS_ENABLED(CONFIG_SLS)) -- cgit v1.2.3 From 52354973573cc260ff2fc661cb28ff8eaa7b879b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:26 +0200 Subject: x86/asm: Provide ALTERNATIVE_3 A fairly straightforward adaptation/extension of ALTERNATIVE_2. Required for call depth tracking. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.787711192@infradead.org --- arch/x86/include/asm/alternative.h | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ef007fa33dc4..4c416b21bac8 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -364,6 +364,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define old_len 141b-140b #define new_len1 144f-143f #define new_len2 145f-144f +#define new_len3 146f-145f /* * gas compatible max based on the idea from: * * The additional "-" is needed because gas uses a "true" value of -1.
*/ -#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) +#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) +#define alt_max_3(a, b, c) (alt_max_2(alt_max_2(a, b), c)) /* @@ -383,8 +385,8 @@ static inline int alternatives_text_reserved(void *start, void *end) 140: \oldinstr 141: - .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_short(new_len1, new_len2) - (old_len)),0x90 + .skip -((alt_max_2(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_2(new_len1, new_len2) - (old_len)),0x90 142: .pushsection .altinstructions,"a" @@ -401,6 +403,31 @@ static inline int alternatives_text_reserved(void *start, void *end) .popsection .endm +.macro ALTERNATIVE_3 oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3 +140: + \oldinstr +141: + .skip -((alt_max_3(new_len1, new_len2, new_len3) - (old_len)) > 0) * \ + (alt_max_3(new_len1, new_len2, new_len3) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f + altinstruction_entry 140b,145f,\feature3,142b-140b,146f-145f + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + \newinstr3 +146: + .popsection +.endm + /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */ #define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ -- cgit v1.2.3 From 5d8213864ade86b48fc492584ea86d65a62f892e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:27 +0200 Subject: x86/retbleed: Add SKL return thunk To address the Intel SKL RSB underflow issue in software it's required to do call depth tracking. Provide a return thunk for call depth tracking on Intel SKL CPUs. The tracking does not use a counter. It uses arithmetic shift right on call entry and logical shift left on return. The depth tracking variable is initialized to 0x8000.... when the call depth is zero. The arithmetic shift right sign extends the MSB and saturates after the 12th call. The shift count is 5 so the tracking covers 12 nested calls. On return the variable is shifted left logically so it becomes zero again. CALL RET 0: 0x8000000000000000 0x0000000000000000 1: 0xfc00000000000000 0xf000000000000000 ... 11: 0xfffffffffffffff8 0xfffffffffffffc00 12: 0xffffffffffffffff 0xffffffffffffffe0 After a return buffer fill the depth is credited 12 calls before the next stuffing has to take place. There is an inaccuracy for situations like this: 10 calls 5 returns 3 calls 4 returns 3 calls .... The shift count might cause this to be off by one in either direction, but there is still a cushion vs. the RSB depth. The algorithm does not claim to be perfect, but it should obfuscate the problem enough to make exploitation extremely difficult. The theory behind this is: RSB is a stack with depth 16 which is filled on every call. On the return path speculation "pops" entries to speculate down the call chain. Once the speculative RSB is empty it switches to other predictors, e.g. the Branch History Buffer, which can be mistrained by user space and misguide the speculation path to a gadget. Call depth tracking is designed to break this speculation path by stuffing speculation trap calls into the RSB which never get a corresponding return executed.
The stuffed calls stall the prediction path until it gets resteered. The assumption is that stuffing at the 12th return is sufficient to break the speculation before it hits the underflow and the fallback to the other predictors. Testing confirms that it works. Johannes, one of the retbleed researchers, tried to attack this approach but failed. There is obviously no scientific proof that this will withstand future research progress, but all we can do right now is to speculate about it. The SAR/SHL usage was suggested by Andi Kleen. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.890071690@infradead.org --- arch/x86/entry/entry_64.S | 10 +-- arch/x86/include/asm/current.h | 3 + arch/x86/include/asm/nospec-branch.h | 121 +++++++++++++++++++++++++++++++++-- arch/x86/kernel/asm-offsets.c | 3 + arch/x86/kvm/svm/vmenter.S | 1 + arch/x86/lib/retpoline.S | 31 +++++++++ 6 files changed, 159 insertions(+), 10 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4cc0125fdfdc..15739a2c0983 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -288,6 +288,7 @@ SYM_FUNC_END(__switch_to_asm) SYM_CODE_START_NOALIGN(ret_from_fork) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR // copy_thread + CALL_DEPTH_ACCOUNT movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -332,7 +333,7 @@ SYM_CODE_START(xen_error_entry) UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL RET SYM_CODE_END(xen_error_entry) @@ -977,7 +978,7 @@ SYM_CODE_START(paranoid_entry) * CR3 above, keep the old value in a callee saved register. */ IBRS_ENTER save_reg=%r15 - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL RET SYM_CODE_END(paranoid_entry) @@ -1062,7 +1063,7 @@ SYM_CODE_START(error_entry) /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax IBRS_ENTER - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ /* Put us onto the real thread stack. */ @@ -1097,6 +1098,7 @@ SYM_CODE_START(error_entry) */ .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY + CALL_DEPTH_ACCOUNT leaq 8(%rsp), %rax /* return pt_regs pointer */ ANNOTATE_UNRET_END RET @@ -1115,7 +1117,7 @@ SYM_CODE_START(error_entry) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax IBRS_ENTER - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL /* * Pretend that the exception came from user mode: set up pt_regs diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index b89aba077b84..a1168e7b69e5 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -17,6 +17,9 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; +#ifdef CONFIG_CALL_DEPTH_TRACKING + u64 call_depth; +#endif unsigned long top_of_stack; void *hardirq_stack_ptr; u16 softirq_pending; diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f10ca334dd75..d4be826a2282 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,8 +12,83 @@ #include #include #include +#include -#define RETPOLINE_THUNK_SIZE 32 +/* + * Call depth tracking for Intel SKL CPUs to address the RSB underflow + * issue in software. + * + * The tracking does not use a counter. It uses arithmetic shift + * right on call entry and logical shift left on return. + * + * The depth tracking variable is initialized to 0x8000.... when the call + * depth is zero.
The arithmetic shift right sign extends the MSB and + * saturates after the 12th call. The shift count is 5 for both directions + * so the tracking covers 12 nested calls. + * + * Call + * 0: 0x8000000000000000 0x0000000000000000 + * 1: 0xfc00000000000000 0xf000000000000000 + * ... + * 11: 0xfffffffffffffff8 0xfffffffffffffc00 + * 12: 0xffffffffffffffff 0xffffffffffffffe0 + * + * After a return buffer fill the depth is credited 12 calls before the + * next stuffing has to take place. + * + * There is an inaccuracy for situations like this: + * + * 10 calls + * 5 returns + * 3 calls + * 4 returns + * 3 calls + * .... + * + * The shift count might cause this to be off by one in either direction, + * but there is still a cushion vs. the RSB depth. The algorithm does not + * claim to be perfect and it can be speculated around by the CPU, but it + * is considered that it obfuscates the problem enough to make exploitation + * extremely difficult. + */ +#define RET_DEPTH_SHIFT 5 +#define RSB_RET_STUFF_LOOPS 16 +#define RET_DEPTH_INIT 0x8000000000000000ULL +#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL +#define RET_DEPTH_CREDIT 0xffffffffffffffffULL + +#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) + +#include + +#define CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define ASM_CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH \ + mov $0x80, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH_FROM_CALL \ + mov $0xfc, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define INCREMENT_CALL_DEPTH \ + sarq $5, %gs:pcpu_hot + X86_call_depth; + +#define ASM_INCREMENT_CALL_DEPTH \ + sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#else +#define CREDIT_CALL_DEPTH +#define RESET_CALL_DEPTH +#define INCREMENT_CALL_DEPTH +#define RESET_CALL_DEPTH_FROM_CALL +#endif /* * Fill the CPU return stack buffer. @@ -32,6 +107,7 @@ * from C via asm(".include ") but let's not go there. */ +#define RETPOLINE_THUNK_SIZE 32 #define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ /* @@ -60,7 +136,8 @@ dec reg; \ jnz 771b; \ /* barrier for jnz misprediction */ \ - lfence; + lfence; \ + ASM_CREDIT_CALL_DEPTH #else /* * i386 doesn't unconditionally have LFENCE, as such it can't @@ -185,11 +262,32 @@ * where we have a stack but before any RET instruction.
*/ .macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_X86_FEATURE_CALL_DEPTH) ANNOTATE_UNRET_END - ALTERNATIVE_2 "", \ - CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH +#endif +.endm + +.macro UNTRAIN_RET_FROM_CALL +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_X86_FEATURE_CALL_DEPTH) + ANNOTATE_UNRET_END + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH +#endif +.endm + + +.macro CALL_DEPTH_ACCOUNT +#ifdef CONFIG_CALL_DEPTH_TRACKING + ALTERNATIVE "", \ + __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm @@ -214,6 +312,17 @@ extern void (*x86_return_thunk)(void); #define x86_return_thunk (&__x86_return_thunk) #endif +#ifdef CONFIG_CALL_DEPTH_TRACKING +extern void __x86_return_skl(void); + +static inline void x86_set_skl_return_thunk(void) +{ + x86_return_thunk = &__x86_return_skl; +} +#else +static inline void x86_set_skl_return_thunk(void) {} +#endif + #ifdef CONFIG_RETPOLINE #define GEN(reg) \ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a9824318e1c5..13afdbbee349 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -110,6 +110,9 @@ static void __used common(void) OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); +#ifdef CONFIG_CALL_DEPTH_TRACKING + OFFSET(X86_call_depth, pcpu_hot, call_depth); +#endif if (IS_ENABLED(CONFIG_KVM_INTEL)) { BLANK(); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 723f8534986c..09eacf19d718 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include #include #include diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 073289a55f84..1e79eccc1d69 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -5,9 +5,11 @@ #include #include #include +#include #include #include #include +#include #include .section .text.__x86.indirect_thunk @@ -140,3 +142,32 @@ __EXPORT_THUNK(zen_untrain_ret) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_RETHUNK */ + +#ifdef CONFIG_CALL_DEPTH_TRACKING + + .align 64 +SYM_FUNC_START(__x86_return_skl) + ANNOTATE_NOENDBR + /* Keep the hotpath in a 16byte I-fetch */ + shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) + jz 1f + ANNOTATE_UNRET_SAFE + ret + int3 +1: + .rept 16 + ANNOTATE_INTRA_FUNCTION_CALL + call 2f + int3 +2: + .endr + add $(8*16), %rsp + + CREDIT_CALL_DEPTH + + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(__x86_return_skl) + +#endif /* CONFIG_CALL_DEPTH_TRACKING */ -- cgit v1.2.3 From 3b6c1747da48ff40ab746b0e860cffe83619f5c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:28 +0200 Subject: x86/retpoline: Add SKL retthunk retpolines Ensure that retpolines do the proper call accounting so that the return accounting works correctly. Specifically; retpolines are used to replace both 'jmp *%reg' and 'call *%reg', however these two cases do not have the same accounting requirements. 
Therefore split things up and provide two different retpoline arrays for SKL. The 'jmp *%reg' case needs no accounting, the __x86_indirect_jump_thunk_array[] covers this. The retpoline is changed to not use the return thunk; it's a simple call;ret construct. [ strictly speaking it should do: andq $(~0x1f), PER_CPU_VAR(__x86_call_depth) but we can argue this can be covered by the fuzz we already have in the accounting depth (12) vs the RSB depth (16) ] The 'call *%reg' case does need accounting, the __x86_indirect_call_thunk_array[] covers this. Again, this retpoline avoids the use of the return-thunk, in this case to avoid double accounting. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.996634749@infradead.org --- arch/x86/include/asm/nospec-branch.h | 12 ++++++ arch/x86/kernel/alternative.c | 59 ++++++++++++++++++++++++++++-- arch/x86/lib/retpoline.S | 71 ++++++++++++++++++++++++++++++++---- arch/x86/net/bpf_jit_comp.c | 5 ++- 4 files changed, 135 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index d4be826a2282..06ba7caa0cad 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -301,6 +301,8 @@ typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); @@ -330,6 +332,16 @@ static inline void x86_set_skl_return_thunk(void) {} #include #undef GEN +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg; +#include +#undef GEN + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg; +#include +#undef GEN + #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 04d1e3d35b0e..19221d77dc27 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -377,6 +377,56 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } +static inline bool is_jcc32(struct insn *insn) +{ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; +} + +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 op = insn->opcode.bytes[0]; + int i = 0; + + /* + * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional + * tail-calls. Deal with them. + */ + if (is_jcc32(insn)) { + bytes[i++] = op; + op = insn->opcode.bytes[1]; + goto clang_jcc; + } + + if (insn->length == 6) + bytes[i++] = 0x2e; /* CS-prefix */ + + switch (op) { + case CALL_INSN_OPCODE: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_call_thunk_array[reg], + CALL_INSN_SIZE); + i += CALL_INSN_SIZE; + break; + + case JMP32_INSN_OPCODE: +clang_jcc: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_jump_thunk_array[reg], + JMP32_INSN_SIZE); + i += JMP32_INSN_SIZE; + break; + + default: + WARN("%pS %px %*ph\n", addr, addr, 6, addr); + return -1; + } + + WARN_ON_ONCE(i != insn->length); + + return i; +} + /* * Rewrite the compiler generated retpoline thunk calls. 
* @@ -409,8 +459,12 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return emit_call_track_retpoline(addr, insn, reg, bytes); + return -1; + } op = insn->opcode.bytes[0]; @@ -427,8 +481,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) * [ NOP ] * 1: */ - /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ - if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 1e79eccc1d69..e00206077ae9 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -14,17 +14,18 @@ .section .text.__x86.indirect_thunk -.macro RETPOLINE reg + +.macro POLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ -.Lspec_trap_\@: - UNWIND_HINT_EMPTY - pause - lfence - jmp .Lspec_trap_\@ + int3 .Ldo_rop_\@: mov %\reg, (%_ASM_SP) UNWIND_HINT_FUNC +.endm + +.macro RETPOLINE reg + POLINE \reg RET .endm @@ -54,7 +55,6 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) */ #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) -#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) .align RETPOLINE_THUNK_SIZE SYM_CODE_START(__x86_indirect_thunk_array) @@ -66,10 +66,65 @@ SYM_CODE_START(__x86_indirect_thunk_array) .align RETPOLINE_THUNK_SIZE SYM_CODE_END(__x86_indirect_thunk_array) -#define GEN(reg) EXPORT_THUNK(reg) +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) +#include +#undef GEN + +#ifdef CONFIG_CALL_DEPTH_TRACKING +.macro CALL_THUNK reg + .align RETPOLINE_THUNK_SIZE + +SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + + CALL_DEPTH_ACCOUNT + POLINE \reg + ANNOTATE_UNRET_SAFE + ret + int3 +.endm + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_call_thunk_array) + +#define GEN(reg) CALL_THUNK reg #include #undef GEN + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_call_thunk_array) + +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_call_thunk_ ## reg) +#include +#undef GEN + +.macro JUMP_THUNK reg + .align RETPOLINE_THUNK_SIZE + +SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + POLINE \reg + ANNOTATE_UNRET_SAFE + ret + int3 +.endm + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_jump_thunk_array) + +#define GEN(reg) JUMP_THUNK reg +#include +#undef GEN + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_jump_thunk_array) + +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg) +#include +#undef GEN +#endif /* * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. 
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0df391ecd4d8..ad8cb7f15ab8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -417,7 +417,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { OPTIMIZER_HIDE_VAR(reg); - emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip); + else + emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); } else { EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS)) -- cgit v1.2.3 From bbaceb189a21d7245e8063701fe10985396028f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:29 +0200 Subject: x86/retbleed: Add SKL call thunk Add the actual SKL call thunk for call depth accounting. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.101125588@infradead.org --- arch/x86/kernel/callthunks.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 7b9d998ebd7d..01f6f6b5a93c 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -55,7 +56,21 @@ static const struct core_text builtin_coretext = { .name = "builtin", }; -static struct thunk_desc callthunk_desc __ro_after_init; +asm ( + ".pushsection .rodata \n" + ".global skl_call_thunk_template \n" + "skl_call_thunk_template: \n" + __stringify(INCREMENT_CALL_DEPTH)" \n" + ".global skl_call_thunk_tail \n" + "skl_call_thunk_tail: \n" + ".popsection \n" +); + +extern u8 skl_call_thunk_template[]; +extern u8 skl_call_thunk_tail[]; + +#define SKL_TMPL_SIZE \ + ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template)) extern void error_entry(void); extern void xen_error_entry(void); @@ -157,11 +172,11 @@ static const u8 nops[] = { static __init_or_module void *patch_dest(void *dest, bool direct) { - unsigned int tsize = callthunk_desc.template_size; + unsigned int tsize = SKL_TMPL_SIZE; u8 *pad = dest - tsize; /* Already patched? */ - if (!bcmp(pad, callthunk_desc.template, tsize)) + if (!bcmp(pad, skl_call_thunk_template, tsize)) return pad; /* Ensure there are nops */ @@ -171,9 +186,9 @@ static __init_or_module void *patch_dest(void *dest, bool direct) } if (direct) - memcpy(pad, callthunk_desc.template, tsize); + memcpy(pad, skl_call_thunk_template, tsize); else - text_poke_copy_locked(pad, callthunk_desc.template, tsize, true); + text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true); return pad; } -- cgit v1.2.3 From f5c1bb2afe93396d41c5cbdcb909b08a75b8dde4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:30 +0200 Subject: x86/calldepth: Add ret/call counting for debug Add a debugfs mechanism to validate the accounting, e.g. vs. call/ret balance and to gather statistics about the stuffing-to-call ratio.
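With debugfs mounted at the usual place, the counters can then be read per CPU, e.g. (illustrative values; the line format is the seq_printf() format from the patch below):

  # cat /sys/kernel/debug/callthunks/cpu0
  C:        123456789 R:        123456701 S:             1234 X:            56789

where C/R/S/X are the call, return, stuffing and context switch counts of that CPU.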
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.204285506@infradead.org --- arch/x86/include/asm/nospec-branch.h | 36 +++++++++++++++++++++--- arch/x86/kernel/callthunks.c | 53 ++++++++++++++++++++++++++++++++++++ arch/x86/lib/retpoline.S | 7 ++++- 3 files changed, 91 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 06ba7caa0cad..4771147c7c5a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -57,6 +57,22 @@ #define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL #define RET_DEPTH_CREDIT 0xffffffffffffffffULL +#ifdef CONFIG_CALL_THUNKS_DEBUG +# define CALL_THUNKS_DEBUG_INC_CALLS \ + incq %gs:__x86_call_count; +# define CALL_THUNKS_DEBUG_INC_RETS \ + incq %gs:__x86_ret_count; +# define CALL_THUNKS_DEBUG_INC_STUFFS \ + incq %gs:__x86_stuffs_count; +# define CALL_THUNKS_DEBUG_INC_CTXSW \ + incq %gs:__x86_ctxsw_count; +#else +# define CALL_THUNKS_DEBUG_INC_CALLS +# define CALL_THUNKS_DEBUG_INC_RETS +# define CALL_THUNKS_DEBUG_INC_STUFFS +# define CALL_THUNKS_DEBUG_INC_CTXSW +#endif + #if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) #include @@ -75,18 +91,23 @@ #define RESET_CALL_DEPTH_FROM_CALL \ mov $0xfc, %rax; \ shl $56, %rax; \ - movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, %gs:pcpu_hot + X86_call_depth; + sarq $5, %gs:pcpu_hot + X86_call_depth; \ + CALL_THUNKS_DEBUG_INC_CALLS #define ASM_INCREMENT_CALL_DEPTH \ - sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); + sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS #else #define CREDIT_CALL_DEPTH +#define ASM_CREDIT_CALL_DEPTH #define RESET_CALL_DEPTH #define INCREMENT_CALL_DEPTH +#define ASM_INCREMENT_CALL_DEPTH #define RESET_CALL_DEPTH_FROM_CALL #endif @@ -137,7 +158,8 @@ jnz 771b; \ /* barrier for jnz misprediction */ \ lfence; \ - ASM_CREDIT_CALL_DEPTH + ASM_CREDIT_CALL_DEPTH \ + CALL_THUNKS_DEBUG_INC_CTXSW #else /* * i386 doesn't unconditionally have LFENCE, as such it can't @@ -321,6 +343,12 @@ static inline void x86_set_skl_return_thunk(void) { x86_return_thunk = &__x86_return_skl; } +#ifdef CONFIG_CALL_THUNKS_DEBUG +DECLARE_PER_CPU(u64, __x86_call_count); +DECLARE_PER_CPU(u64, __x86_ret_count); +DECLARE_PER_CPU(u64, __x86_stuffs_count); +DECLARE_PER_CPU(u64, __x86_ctxsw_count); +#endif #else static inline void x86_set_skl_return_thunk(void) {} #endif diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 01f6f6b5a93c..dfe7ffff88b9 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -2,6 +2,7 @@ #define pr_fmt(fmt) "callthunks: " fmt +#include #include #include #include @@ -35,6 +36,15 @@ static int __init debug_thunks(char *str) } __setup("debug-callthunks", debug_thunks); +#ifdef CONFIG_CALL_THUNKS_DEBUG +DEFINE_PER_CPU(u64, __x86_call_count); +DEFINE_PER_CPU(u64, __x86_ret_count); +DEFINE_PER_CPU(u64, __x86_stuffs_count); +DEFINE_PER_CPU(u64, __x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_call_count); +#endif + extern s32 __call_sites[], __call_sites_end[]; struct thunk_desc { @@ -283,3 +293,46 @@ void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, mutex_unlock(&text_mutex); } #endif /* CONFIG_MODULES */ + +#if defined(CONFIG_CALL_THUNKS_DEBUG) && 
defined(CONFIG_DEBUG_FS) +static int callthunks_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + + seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n", + per_cpu(__x86_call_count, cpu), + per_cpu(__x86_ret_count, cpu), + per_cpu(__x86_stuffs_count, cpu), + per_cpu(__x86_ctxsw_count, cpu)); + return 0; +} + +static int callthunks_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, callthunks_debug_show, inode->i_private); +} + +static const struct file_operations dfs_ops = { + .open = callthunks_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init callthunks_debugfs_init(void) +{ + struct dentry *dir; + unsigned long cpu; + + dir = debugfs_create_dir("callthunks", NULL); + for_each_possible_cpu(cpu) { + void *arg = (void *)cpu; + char name[10]; + + sprintf(name, "cpu%lu", cpu); + debugfs_create_file(name, 0644, dir, arg, &dfs_ops); + } + return 0; +} +__initcall(callthunks_debugfs_init); +#endif diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index e00206077ae9..5f61c65322be 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -203,13 +203,18 @@ EXPORT_SYMBOL(__x86_return_thunk) .align 64 SYM_FUNC_START(__x86_return_skl) ANNOTATE_NOENDBR - /* Keep the hotpath in a 16byte I-fetch */ + /* + * Keep the hotpath in a 16byte I-fetch for the non-debug + * case. + */ + CALL_THUNKS_DEBUG_INC_RETS shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) jz 1f ANNOTATE_UNRET_SAFE ret int3 1: + CALL_THUNKS_DEBUG_INC_STUFFS .rept 16 ANNOTATE_INTRA_FUNCTION_CALL call 2f -- cgit v1.2.3 From 7825451fa4dc04660f1f53d236e4302161d0ebd1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:31 +0200 Subject: static_call: Add call depth tracking support When indirect calls are switched to direct calls then it has to be ensured that the call target is not the function, but the call thunk when call depth tracking is enabled. But static calls are available before call thunks have been set up. Ensure a second run through the static call patching code after call thunks have been created. When call thunks are not enabled this has no side effects.
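The intended ordering is then roughly the following (a simplified sketch derived from the hunks below, with locking and error handling elided):

  static_call_init()                /* first run: initialized 0 -> 1       */
  callthunks_patch_builtin_calls()
    static_call_force_reinit()      /* initialized 1 -> 2                  */
  static_call_init()                /* early_initcall: != 1, patches again */

Only the module notifier registration is guarded against the second pass, so the rerun re-transforms all sites with the call thunks in place.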
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.306100465@infradead.org --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/callthunks.c | 18 ++++++++++++++++++ arch/x86/kernel/static_call.c | 1 + include/linux/static_call.h | 2 ++ kernel/static_call_inline.c | 23 ++++++++++++++++++----- 5 files changed, 44 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4c416b21bac8..07ac25793a3f 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -91,11 +91,16 @@ struct callthunk_sites { extern void callthunks_patch_builtin_calls(void); extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); +extern void *callthunks_translate_call_dest(void *dest); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod) {} +static __always_inline void *callthunks_translate_call_dest(void *dest) +{ + return dest; +} #endif #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index dfe7ffff88b9..071003605a86 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -271,10 +272,27 @@ void __init callthunks_patch_builtin_calls(void) pr_info("Setting up call depth tracking\n"); mutex_lock(&text_mutex); callthunks_setup(&cs, &builtin_coretext); + static_call_force_reinit(); thunks_initialized = true; mutex_unlock(&text_mutex); } +void *callthunks_translate_call_dest(void *dest) +{ + void *target; + + lockdep_assert_held(&text_mutex); + + if (!thunks_initialized || skip_addr(dest)) + return dest; + + if (!is_coretext(NULL, dest)) + return dest; + + target = patch_dest(dest, false); + return target ? 
: dest; +} + #ifdef CONFIG_MODULES void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 5d3844a98373..2ebc338980bc 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -34,6 +34,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, switch (type) { case CALL: + func = callthunks_translate_call_dest(func); code = text_gen_insn(CALL_INSN_OPCODE, insn, func); if (func == &__static_call_return0) { emulate = code; diff --git a/include/linux/static_call.h b/include/linux/static_call.h index df53bed9d71f..141e6b176a1b 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -162,6 +162,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool extern int __init static_call_init(void); +extern void static_call_force_reinit(void); + struct static_call_mod { struct static_call_mod *next; struct module *mod; /* for vmlinux, mod == NULL */ diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index dc5665b62814..639397b5491c 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -15,7 +15,18 @@ extern struct static_call_site __start_static_call_sites[], extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; -static bool static_call_initialized; +static int static_call_initialized; + +/* + * Must be called before early_initcall() to be effective. + */ +void static_call_force_reinit(void) +{ + if (WARN_ON_ONCE(!static_call_initialized)) + return; + + static_call_initialized++; +} /* mutex to protect key modules/sites */ static DEFINE_MUTEX(static_call_mutex); @@ -475,7 +486,8 @@ int __init static_call_init(void) { int ret; - if (static_call_initialized) + /* See static_call_force_reinit(). */ + if (static_call_initialized == 1) return 0; cpus_read_lock(); @@ -490,11 +502,12 @@ int __init static_call_init(void) BUG(); } - static_call_initialized = true; - #ifdef CONFIG_MODULES - register_module_notifier(&static_call_module_nb); + if (!static_call_initialized) + register_module_notifier(&static_call_module_nb); #endif + + static_call_initialized = 1; return 0; } early_initcall(static_call_init); -- cgit v1.2.3 From f1389181622a08d6f1c71407a64df36b809d632b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:32 +0200 Subject: kallsyms: Take callthunks into account Since the pre-symbol function padding is an integral part of the symbol make kallsyms report it as part of the symbol by reporting it as sym-x instead of prev_sym+y. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.409656012@infradead.org --- kernel/kallsyms.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 60c20f301a6b..cc244c02b4cf 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -293,6 +293,12 @@ static unsigned long get_symbol_pos(unsigned long addr, return low; } +#ifdef CONFIG_FUNCTION_PADDING_BYTES +#define PADDING_BYTES CONFIG_FUNCTION_PADDING_BYTES +#else +#define PADDING_BYTES 0 +#endif + /* * Lookup an address but don't bother to find any names. 
*/ @@ -300,13 +306,25 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; + int ret; + + addr += PADDING_BYTES; if (is_ksym_addr(addr)) { get_symbol_pos(addr, symbolsize, offset); - return 1; + ret = 1; + goto found; + } + + ret = !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf); + if (!ret) { + ret = !!__bpf_address_lookup(addr, symbolsize, + offset, namebuf); } - return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || - !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); +found: + if (ret && offset) + *offset -= PADDING_BYTES; + return ret; } static const char *kallsyms_lookup_buildid(unsigned long addr, @@ -319,6 +337,8 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; + addr += PADDING_BYTES; + if (is_ksym_addr(addr)) { unsigned long pos; @@ -348,6 +368,8 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, found: cleanup_symbol_name(namebuf); + if (ret && offset) + *offset -= PADDING_BYTES; return ret; } @@ -374,6 +396,8 @@ int lookup_symbol_name(unsigned long addr, char *symname) symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; + addr += PADDING_BYTES; + if (is_ksym_addr(addr)) { unsigned long pos; @@ -401,6 +425,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, name[0] = '\0'; name[KSYM_NAME_LEN - 1] = '\0'; + addr += PADDING_BYTES; + if (is_ksym_addr(addr)) { unsigned long pos; @@ -417,6 +443,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, return res; found: + if (offset) + *offset -= PADDING_BYTES; cleanup_symbol_name(name); return 0; } @@ -442,8 +470,15 @@ static int __sprint_symbol(char *buffer, unsigned long address, len = strlen(buffer); offset -= symbol_offset; - if (add_offset) - len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); + if (add_offset) { + char s = '+'; + + if ((long)offset < 0) { + s = '-'; + offset = 0UL - offset; + } + len += sprintf(buffer + len, "%c%#lx/%#lx", s, offset, size); + } if (modname) { len += sprintf(buffer + len, " [%s", modname); -- cgit v1.2.3 From 396e0b8e09e86440c2119d12c2101110d3cd5bf9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:33 +0200 Subject: x86/orc: Make it callthunk aware Callthunks addresses on the stack would confuse the ORC unwinder. Handle them correctly and tell ORC to proceed further down the stack. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.511637628@infradead.org --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/callthunks.c | 13 +++++++++++++ arch/x86/kernel/unwind_orc.c | 21 ++++++++++++++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 07ac25793a3f..4b8cd256c95e 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -92,6 +92,7 @@ extern void callthunks_patch_builtin_calls(void); extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); +extern bool is_callthunk(void *addr); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void @@ -101,6 +102,10 @@ static __always_inline void *callthunks_translate_call_dest(void *dest) { return dest; } +static __always_inline bool is_callthunk(void *addr) +{ + return false; +} #endif #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 071003605a86..7f9788194eb5 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -293,6 +293,19 @@ void *callthunks_translate_call_dest(void *dest) return target ? : dest; } +bool is_callthunk(void *addr) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + unsigned long dest; + + dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); + if (!thunks_initialized || skip_addr((void *)dest)) + return false; + + return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); +} + #ifdef CONFIG_MODULES void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 0ea57da92940..cfac2b54b37b 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -136,6 +136,21 @@ static struct orc_entry null_orc_entry = { .type = UNWIND_HINT_TYPE_CALL }; +#ifdef CONFIG_CALL_THUNKS +static struct orc_entry *orc_callthunk_find(unsigned long ip) +{ + if (!is_callthunk((void *)ip)) + return NULL; + + return &null_orc_entry; +} +#else +static struct orc_entry *orc_callthunk_find(unsigned long ip) +{ + return NULL; +} +#endif + /* Fake frame pointer entry -- used as a fallback for generated code */ static struct orc_entry orc_fp_entry = { .type = UNWIND_HINT_TYPE_CALL, @@ -189,7 +204,11 @@ static struct orc_entry *orc_find(unsigned long ip) if (orc) return orc; - return orc_ftrace_find(ip); + orc = orc_ftrace_find(ip); + if (orc) + return orc; + + return orc_callthunk_find(ip); } #ifdef CONFIG_MODULES -- cgit v1.2.3 From b2e9dfe54be4d023124d588d6f03d16a9c0d2507 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:34 +0200 Subject: x86/bpf: Emit call depth accounting if required Ensure that calls in BPF jitted programs are emitting call depth accounting when enabled to keep the call/return balanced. The return thunk jump is already injected due to the earlier retbleed mitigations. 
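The offset handling matters because a near call's rel32 displacement is relative to the end of the call instruction itself; emitting the accounting template first moves the call site forward by the template size. A hypothetical helper showing what the jit effectively computes (emit_patch() in the code below does the equivalent):

  /* displacement of a 5 byte near call/jmp at 'ip' targeting 'func' */
  static int rel32(void *ip, void *func)
  {
  	return (int)((long)func - ((long)ip + 5));
  }

This is why the tail-call path below adds the size returned by x86_call_depth_emit_accounting() to the offset used for emitting the call.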
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.615413406@infradead.org --- arch/x86/include/asm/alternative.h | 6 ++++++ arch/x86/kernel/callthunks.c | 19 +++++++++++++++++++ arch/x86/net/bpf_jit_comp.c | 32 +++++++++++++++++++++++--------- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4b8cd256c95e..664c0779375c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -93,6 +93,7 @@ extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); extern bool is_callthunk(void *addr); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void @@ -106,6 +107,11 @@ static __always_inline bool is_callthunk(void *addr) { return false; } +static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, + void *func) +{ + return 0; +} #endif #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 7f9788194eb5..a03d646b5e69 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -306,6 +306,25 @@ bool is_callthunk(void *addr) return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); } +#ifdef CONFIG_BPF_JIT +int x86_call_depth_emit_accounting(u8 **pprog, void *func) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + + if (!thunks_initialized) + return 0; + + /* Is function call target a thunk? */ + if (is_callthunk(func)) + return 0; + + memcpy(*pprog, tmpl, tmpl_size); + *pprog += tmpl_size; + return tmpl_size; +} +#endif + #ifdef CONFIG_MODULES void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ad8cb7f15ab8..a6b46740ea30 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -340,6 +340,13 @@ static int emit_call(u8 **pprog, void *func, void *ip) return emit_patch(pprog, func, ip, 0xE8); } +static int emit_rsb_call(u8 **pprog, void *func, void *ip) +{ + OPTIMIZER_HIDE_VAR(func); + x86_call_depth_emit_accounting(pprog, func); + return emit_patch(pprog, func, ip, 0xE8); +} + static int emit_jump(u8 **pprog, void *func, void *ip) { return emit_patch(pprog, func, ip, 0xE9); @@ -1436,19 +1443,26 @@ st: if (is_imm8(insn->off)) break; /* call */ - case BPF_JMP | BPF_CALL: + case BPF_JMP | BPF_CALL: { + int offs; + func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ EMIT3_off32(0x48, 0x8B, 0x85, -round_up(bpf_prog->aux->stack_depth, 8) - 8); - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) + if (!imm32) return -EINVAL; + offs = 7 + x86_call_depth_emit_accounting(&prog, func); } else { - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) + if (!imm32) return -EINVAL; + offs = x86_call_depth_emit_accounting(&prog, func); } + if (emit_call(&prog, func, image + addrs[i - 1] + offs)) + return -EINVAL; break; + } case BPF_JMP | BPF_TAIL_CALL: if (imm32) @@ -1854,7 +1868,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, enter, prog)) + if 
(emit_rsb_call(&prog, enter, prog)) return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1875,7 +1889,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, (long) p->insnsi >> 32, (u32) (long) p->insnsi); /* call JITed bpf program or interpreter */ - if (emit_call(&prog, p->bpf_func, prog)) + if (emit_rsb_call(&prog, p->bpf_func, prog)) return -EINVAL; /* @@ -1899,7 +1913,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, exit, prog)) + if (emit_rsb_call(&prog, exit, prog)) return -EINVAL; *pprog = prog; @@ -2147,7 +2161,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); - if (emit_call(&prog, __bpf_tramp_enter, prog)) { + if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) { ret = -EINVAL; goto cleanup; } @@ -2179,7 +2193,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i EMIT2(0xff, 0xd0); /* call *rax */ } else { /* call original function */ - if (emit_call(&prog, orig_call, prog)) { + if (emit_rsb_call(&prog, orig_call, prog)) { ret = -EINVAL; goto cleanup; } @@ -2223,7 +2237,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i im->ip_epilogue = prog; /* arg1: mov rdi, im */ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); - if (emit_call(&prog, __bpf_tramp_exit, prog)) { + if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) { ret = -EINVAL; goto cleanup; } -- cgit v1.2.3 From eac828eaef295cd0cc8b58f55fa5c8401fdc2370 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:35 +0200 Subject: x86/ftrace: Remove ftrace_epilogue() Remove the weird jumps to RET and simply use RET. This then promotes ftrace_stub() to a real function; which becomes important for kcfi. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.719080593@infradead.org --- arch/x86/kernel/ftrace_64.S | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index dfeb227de561..a90c55a6b481 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -172,20 +172,14 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) -SYM_FUNC_START(ftrace_epilogue) -/* - * This is weak to keep gas from relaxing the jumps. - */ -SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) +SYM_FUNC_START(ftrace_stub) UNWIND_HINT_FUNC - ENDBR RET -SYM_FUNC_END(ftrace_epilogue) +SYM_FUNC_END(ftrace_stub) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -262,14 +256,11 @@ SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) popfq /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. + * The trampoline will add the return. 
*/ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - jmp ftrace_epilogue + RET /* Swap the flags with orig_rax */ 1: movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -280,7 +271,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) -- cgit v1.2.3 From 36b64f101219dd9e6e4f0ea880b64e8a90da547b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:36 +0200 Subject: x86/ftrace: Rebalance RSB ftrace_regs_caller() uses a PUSH;RET pattern to tail-call into a direct-call function, this unbalances the RSB, fix that. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.823216933@infradead.org --- arch/x86/kernel/ftrace_64.S | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index a90c55a6b481..b5b54f58957e 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -271,6 +271,17 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC + + /* + * The above left an extra return value on the stack; effectively + * doing a tail-call without using a register. This PUSH;RET + * pattern unbalances the RSB, inject a pointless CALL to rebalance. + */ + ANNOTATE_INTRA_FUNCTION_CALL + CALL .Ldo_rebalance + int3 +.Ldo_rebalance: + add $8, %rsp RET SYM_FUNC_END(ftrace_regs_caller) -- cgit v1.2.3 From ee3e2469b3463d28ca4cde20e0283319ac6a562d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:37 +0200 Subject: x86/ftrace: Make it call depth tracking aware Since ftrace has trampolines, don't use thunks for the __fentry__ site but instead require that every function called from there includes accounting. This very much includes all the direct-call functions. Additionally, ftrace uses ROP tricks in two places: - return_to_handler(), and - ftrace_regs_caller() when pt_regs->orig_ax is set by a direct-call. return_to_handler() already uses a retpoline to replace an indirect-jump to defeat IBT, since this is a jump-type retpoline, make sure there is no accounting done and ALTERNATIVE the RET into a ret. ftrace_regs_caller() does much the same and gets the same treatment. 
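A toy model helps to see what these ROP constructs do to the RSB; calls push predicted return targets, returns consume them (illustrative user-space C, not kernel code):

  #include <stdio.h>

  static int rsb;				/* occupied RSB entries, depth 16 */

  static void cpu_call(void)
  {
  	if (rsb < 16)
  		rsb++;				/* CALL pushes a predicted return */
  }

  static void cpu_ret(void)
  {
  	if (rsb > 0)
  		rsb--;				/* RET consumes one */
  	else
  		printf("RSB empty: fall back to other predictors\n");
  }

  int main(void)
  {
  	cpu_call();				/* regular call ...                    */
  	cpu_ret();				/* ... and its return: balanced        */
  	cpu_ret();				/* PUSH;RET: a return no call produced */
  	return 0;
  }

A PUSH;RET consumes an entry which no call produced; the intra-function CALL injected into ftrace_regs_caller() compensates for exactly that, and the jump-type retpolines skip the depth accounting because a tail-jump does not add a frame that a return would pop.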
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.927545073@infradead.org --- arch/x86/include/asm/nospec-branch.h | 9 +++++++++ arch/x86/kernel/callthunks.c | 2 +- arch/x86/kernel/ftrace.c | 16 ++++++++++++---- arch/x86/kernel/ftrace_64.S | 22 ++++++++++++++++++++-- arch/x86/net/bpf_jit_comp.c | 6 ++++++ kernel/trace/trace_selftest.c | 9 ++++++++- samples/ftrace/ftrace-direct-modify.c | 3 +++ samples/ftrace/ftrace-direct-multi-modify.c | 3 +++ samples/ftrace/ftrace-direct-multi.c | 2 ++ samples/ftrace/ftrace-direct-too.c | 2 ++ samples/ftrace/ftrace-direct.c | 2 ++ 11 files changed, 68 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 4771147c7c5a..82580adbca4b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -343,6 +343,12 @@ static inline void x86_set_skl_return_thunk(void) { x86_return_thunk = &__x86_return_skl; } + +#define CALL_DEPTH_ACCOUNT \ + ALTERNATIVE("", \ + __stringify(INCREMENT_CALL_DEPTH), \ + X86_FEATURE_CALL_DEPTH) + #ifdef CONFIG_CALL_THUNKS_DEBUG DECLARE_PER_CPU(u64, __x86_call_count); DECLARE_PER_CPU(u64, __x86_ret_count); @@ -351,6 +357,9 @@ DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif #else static inline void x86_set_skl_return_thunk(void) {} + +#define CALL_DEPTH_ACCOUNT "" + #endif #ifdef CONFIG_RETPOLINE diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index a03d646b5e69..7d2c75ec9a8c 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -316,7 +316,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) return 0; /* Is function call target a thunk? */ - if (is_callthunk(func)) + if (func && is_callthunk(func)) return 0; memcpy(*pprog, tmpl, tmpl_size); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4ac6692d5ef8..cf15ef5aecff 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -69,6 +69,10 @@ static const char *ftrace_nop_replace(void) static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) { + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting itself. + */ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); } @@ -317,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long size; unsigned long *ptr; void *trampoline; - void *ip; + void *ip, *dest; /* 48 8b 15 is movq (%rip), %rdx */ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; @@ -404,10 +408,14 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* put in the call to the function */ mutex_lock(&text_mutex); call_offset -= start_offset; + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting before the call already. 
+ */ + dest = ftrace_ops_get_func(ops); memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, - trampoline + call_offset, - ftrace_ops_get_func(ops)), CALL_INSN_SIZE); + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index b5b54f58957e..6a7e6d666a12 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -132,6 +133,7 @@ #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -140,6 +142,8 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + CALL_DEPTH_ACCOUNT + /* Stack - skipping return address of ftrace_caller */ leaq MCOUNT_REG_SIZE+8(%rsp), %rcx movq %rcx, RSP(%rsp) @@ -155,6 +159,9 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Only ops with REGS flag set should have CS register set */ movq $0, CS(%rsp) + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) ANNOTATE_NOENDBR call ftrace_stub @@ -189,6 +196,8 @@ SYM_FUNC_START(ftrace_regs_caller) save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Load the ftrace_ops into the 3rd parameter */ @@ -219,6 +228,9 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) /* regs go into 4th parameter */ leaq (%rsp), %rcx + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) ANNOTATE_NOENDBR call ftrace_stub @@ -282,7 +294,9 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) int3 .Ldo_rebalance: add $8, %rsp - RET + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) @@ -291,6 +305,8 @@ STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT + cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -347,6 +363,8 @@ SYM_CODE_START(return_to_handler) int3 .Ldo_rop: mov %rdi, (%rsp) - RET + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a6b46740ea30..f46b62029d91 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2135,6 +2136,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i prog = image; EMIT_ENDBR(); + /* + * This is the direct-call trampoline, as such it needs accounting + * for the __fentry__ call. 
+ */ + x86_call_depth_emit_accounting(&prog, NULL); EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a2d301f58ced..ff0536cea968 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -785,7 +785,14 @@ static struct fgraph_ops fgraph_ops __initdata = { }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -noinline __noclone static void trace_direct_tramp(void) { } +#ifndef CALL_DEPTH_ACCOUNT +#define CALL_DEPTH_ACCOUNT "" +#endif + +noinline __noclone static void trace_direct_tramp(void) +{ + asm(CALL_DEPTH_ACCOUNT); +} #endif /* diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index 39146fa83e20..de5a0f67f320 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -3,6 +3,7 @@ #include #include #include +#include extern void my_direct_func1(void); extern void my_direct_func2(void); @@ -34,6 +35,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " call my_direct_func1\n" " leave\n" " .size my_tramp1, .-my_tramp1\n" @@ -45,6 +47,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " call my_direct_func2\n" " leave\n" ASM_RET diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c index 65aa94d96f4e..d52370cad0b6 100644 --- a/samples/ftrace/ftrace-direct-multi-modify.c +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -3,6 +3,7 @@ #include #include #include +#include extern void my_direct_func1(unsigned long ip); extern void my_direct_func2(unsigned long ip); @@ -32,6 +33,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " movq 8(%rbp), %rdi\n" " call my_direct_func1\n" @@ -46,6 +48,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " movq 8(%rbp), %rdi\n" " call my_direct_func2\n" diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c index 41ded7c615c7..ec1088922517 100644 --- a/samples/ftrace/ftrace-direct-multi.c +++ b/samples/ftrace/ftrace-direct-multi.c @@ -5,6 +5,7 @@ #include #include #include +#include extern void my_direct_func(unsigned long ip); @@ -27,6 +28,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " movq 8(%rbp), %rdi\n" " call my_direct_func\n" diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index 6690468c5cc2..e13fb59a2b47 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -4,6 +4,7 @@ #include /* for handle_mm_fault() */ #include #include +#include extern void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags); @@ -29,6 +30,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " pushq %rsi\n" " pushq %rdx\n" diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index e8f1e440b9b8..1f769d0db20f 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -4,6 +4,7 @@ #include /* for wake_up_process() */ #include #include +#include extern void my_direct_func(struct task_struct *p); @@ -26,6 +27,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " call my_direct_func\n" " popq %rdi\n" -- cgit 
v1.2.3 From d82a0345cf218f5050f5ad913e1ae6c579105731 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:38 +0200 Subject: x86/retbleed: Add call depth tracking mitigation The fully secure mitigation for RSB underflow on Intel SKL CPUs is IBRS, which inflicts up to 30% penalty for pathological syscall heavy work loads. Software based call depth tracking and RSB refill is not perfect, but reduces the attack surface massively. The penalty for the pathological case is about 8% which is still annoying but definitely more palatable than IBRS. Add a retbleed=stuff command line option to enable the call depth tracking and software refill of the RSB. This gives admins a choice. IBeeRS are safe and cause headaches, call depth tracking is considered to be s(t)ufficiently safe. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111149.029587352@infradead.org --- arch/x86/kernel/cpu/bugs.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index da7c361f47e0..e6c23ead1617 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -787,6 +787,7 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, RETBLEED_MITIGATION_EIBRS, + RETBLEED_MITIGATION_STUFF, }; enum retbleed_mitigation_cmd { @@ -794,6 +795,7 @@ enum retbleed_mitigation_cmd { RETBLEED_CMD_AUTO, RETBLEED_CMD_UNRET, RETBLEED_CMD_IBPB, + RETBLEED_CMD_STUFF, }; static const char * const retbleed_strings[] = { @@ -802,6 +804,7 @@ static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", + [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing", }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = @@ -831,6 +834,8 @@ static int __init retbleed_parse_cmdline(char *str) retbleed_cmd = RETBLEED_CMD_UNRET; } else if (!strcmp(str, "ibpb")) { retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "stuff")) { + retbleed_cmd = RETBLEED_CMD_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else { @@ -879,6 +884,21 @@ static void __init retbleed_select_mitigation(void) } break; + case RETBLEED_CMD_STUFF: + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + + } else { + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + else + pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + + goto do_cmd_auto; + } + break; + do_cmd_auto: case RETBLEED_CMD_AUTO: default: @@ -916,6 +936,12 @@ do_cmd_auto: mitigate_smt = true; break; + case RETBLEED_MITIGATION_STUFF: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + x86_set_skl_return_thunk(); + break; + default: break; } @@ -926,7 +952,7 @@ do_cmd_auto: /* * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option. 
+ * retbleed= cmdline option except for call depth based stuffing */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { switch (spectre_v2_enabled) { @@ -939,7 +965,8 @@ do_cmd_auto: retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - pr_err(RETBLEED_INTEL_MSG); + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); } } @@ -1413,6 +1440,7 @@ static void __init spectre_v2_select_mitigation(void) if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && + retbleed_cmd != RETBLEED_CMD_STUFF && boot_cpu_has(X86_FEATURE_IBRS) && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { mode = SPECTRE_V2_IBRS; -- cgit v1.2.3 From 5c9a92dec3235b0c1d51e92860f8014753161593 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 17 Oct 2022 16:41:20 +0200 Subject: x86/bugs: Add retbleed=force Debug aid, allows running retbleed=force,stuff on non-affected uarchs Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/kernel/cpu/bugs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e6c23ead1617..b307b83e22be 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -838,6 +838,8 @@ static int __init retbleed_parse_cmdline(char *str) retbleed_cmd = RETBLEED_CMD_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_RETBLEED); } else { pr_err("Ignoring unknown retbleed option (%s).", str); } -- cgit v1.2.3 From dbf53a29b28b277fa952a000245b558536c6bdd7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 19 Oct 2022 18:59:45 +0200 Subject: x86/paravirt: Fix a !PARAVIRT build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix ./include/trace/events/xen.h:28:31: warning: ‘enum paravirt_lazy_mode’ \ declared inside parameter list will not be visible outside of this definition or declaration which turns into a build error: ./include/trace/events/xen.h:28:50: error: parameter 1 (‘mode’) has incomplete type 28 | TP_PROTO(enum paravirt_lazy_mode mode), \ due to enum paravirt_lazy_mode being visible only under CONFIG_PARAVIRT. Just pull it up where it is unconditionally visible. 
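For reference, the underlying C rule is that an enum type first named inside a prototype's parameter list gets prototype scope and can never be completed afterwards. A minimal standalone sketch of the failure mode (illustration only, not kernel code; 'lazy_mode' and 'trace_lazy' are stand-in names):

	/* With the enum definition not in scope (here: hidden behind
	 * CONFIG_PARAVIRT), this first names 'enum lazy_mode' inside the
	 * parameter list; its scope ends at the ')': */
	void trace_lazy(enum lazy_mode mode);
	/* -> warning: 'enum lazy_mode' declared inside parameter list
	 *    will not be visible outside of this definition or declaration
	 * and, once the tracepoint macros expand this into a definition:
	 * -> error: parameter 1 ('mode') has incomplete type */

	enum lazy_mode { LAZY_NONE, LAZY_MMU };	/* too late: a distinct type */

Hoisting the enum definition above the prototype, outside the CONFIG_PARAVIRT guard, is exactly what the diff below does.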
Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y1AtAXM8YjtBm2cj@zn.tnic --- arch/x86/include/asm/paravirt_types.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e137d9412123..27c692791b7e 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -9,6 +9,13 @@ struct paravirt_patch_site { u8 type; /* type of this instruction */ u8 len; /* length of original instruction */ }; + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; #endif #ifdef CONFIG_PARAVIRT @@ -582,13 +589,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - enum paravirt_lazy_mode paravirt_get_lazy_mode(void); void paravirt_start_context_switch(struct task_struct *prev); void paravirt_end_context_switch(struct task_struct *next); -- cgit v1.2.3 From ae25e00ba84073450c07d8ffd2d74f914a027230 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 25 Oct 2022 18:32:49 +0300 Subject: x86/retpoline: Fix crash printing warning The first argument of WARN() is a condition, so this will use "addr" as the format string and possibly crash. Fixes: 3b6c1747da48 ("x86/retpoline: Add SKL retthunk retpolines") Signed-off-by: Dan Carpenter Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/Y1gBoUZrRK5N%2FlCB@kili/ --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 19221d77dc27..b4ac4e58c010 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -418,7 +418,7 @@ clang_jcc: break; default: - WARN("%pS %px %*ph\n", addr, addr, 6, addr); + WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); return -1; } -- cgit v1.2.3 From b1f37ef655cf372f96015bf54abdb76a91aff27e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 Oct 2022 11:10:56 +0100 Subject: x86: Unconfuse CONFIG_ and X86_FEATURE_ namespaces Lukas reported someone fat fingered the CONFIG_ symbol; fix er up. 
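The mistake is easy to miss because defined() on a CONFIG_ symbol that no Kconfig option ever generates is not a build error, it is simply false, so the guard compiles cleanly and the call-depth variant of UNTRAIN_RET is silently never emitted. As the diff below shows, the broken guard was:

	#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
	    defined(CONFIG_X86_FEATURE_CALL_DEPTH)	/* no such CONFIG_ symbol */
		/* dead when only CALL_DEPTH_TRACKING=y is selected */
	#endif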
Fixes: 5d8213864ade ("x86/retbleed: Add SKL return thunk") Reported-by: Lukas Bulwahn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y1+fL4qQEIGZEEKB@hirez.programming.kicks-ass.net --- arch/x86/include/asm/nospec-branch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 82580adbca4b..3ab90f23e7f7 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -285,7 +285,7 @@ */ .macro UNTRAIN_RET #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_X86_FEATURE_CALL_DEPTH) + defined(CONFIG_CALL_DEPTH_TRACKING) ANNOTATE_UNRET_END ALTERNATIVE_3 "", \ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ @@ -296,7 +296,7 @@ .macro UNTRAIN_RET_FROM_CALL #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_X86_FEATURE_CALL_DEPTH) + defined(CONFIG_CALL_DEPTH_TRACKING) ANNOTATE_UNRET_END ALTERNATIVE_3 "", \ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ -- cgit v1.2.3 From 5ebddd7c4951c50142bcb239d4c6a82eff15759e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 15:26:51 +0200 Subject: kallsyms: Revert "Take callthunks into account" This is a full revert of commit: f1389181622a ("kallsyms: Take callthunks into account") The commit assumes a number of things that are not quite right. Notably it assumes every symbol has PADDING_BYTES in front of it that are not claimed by another symbol. This is not true; even when compiled with: -fpatchable-function-entry=${PADDING_BYTES},${PADDING_BYTES} Notably things like .cold subfunctions do not need to adhere to this change in ABI. It it also not true when build with CFI_CLANG, which claims these PADDING_BYTES in the __cfi_##name symbol. Once the prefix bytes are not consistent and or otherwise claimed the approach this patch takes goes out the window and kallsym resolution will report invalid symbol names. Therefore revert this to make room for another approach. Reported-by: Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lore.kernel.org/r/202210241614.2ae4c1f5-yujie.liu@intel.com Link: https://lkml.kernel.org/r/20221028194453.330970755@infradead.org --- kernel/kallsyms.c | 45 +++++---------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cc244c02b4cf..60c20f301a6b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -293,12 +293,6 @@ static unsigned long get_symbol_pos(unsigned long addr, return low; } -#ifdef CONFIG_FUNCTION_PADDING_BYTES -#define PADDING_BYTES CONFIG_FUNCTION_PADDING_BYTES -#else -#define PADDING_BYTES 0 -#endif - /* * Lookup an address but don't bother to find any names. 
*/ @@ -306,25 +300,13 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; - int ret; - - addr += PADDING_BYTES; if (is_ksym_addr(addr)) { get_symbol_pos(addr, symbolsize, offset); - ret = 1; - goto found; - } - - ret = !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf); - if (!ret) { - ret = !!__bpf_address_lookup(addr, symbolsize, - offset, namebuf); + return 1; } -found: - if (ret && offset) - *offset -= PADDING_BYTES; - return ret; + return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || + !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } static const char *kallsyms_lookup_buildid(unsigned long addr, @@ -337,8 +319,6 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; - addr += PADDING_BYTES; - if (is_ksym_addr(addr)) { unsigned long pos; @@ -368,8 +348,6 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, found: cleanup_symbol_name(namebuf); - if (ret && offset) - *offset -= PADDING_BYTES; return ret; } @@ -396,8 +374,6 @@ int lookup_symbol_name(unsigned long addr, char *symname) symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; - addr += PADDING_BYTES; - if (is_ksym_addr(addr)) { unsigned long pos; @@ -425,8 +401,6 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, name[0] = '\0'; name[KSYM_NAME_LEN - 1] = '\0'; - addr += PADDING_BYTES; - if (is_ksym_addr(addr)) { unsigned long pos; @@ -443,8 +417,6 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, return res; found: - if (offset) - *offset -= PADDING_BYTES; cleanup_symbol_name(name); return 0; } @@ -470,15 +442,8 @@ static int __sprint_symbol(char *buffer, unsigned long address, len = strlen(buffer); offset -= symbol_offset; - if (add_offset) { - char s = '+'; - - if ((long)offset < 0) { - s = '-'; - offset = 0UL - offset; - } - len += sprintf(buffer + len, "%c%#lx/%#lx", s, offset, size); - } + if (add_offset) + len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); if (modname) { len += sprintf(buffer + len, " [%s", modname); -- cgit v1.2.3 From 4c91be8e926c6b3734d59b9348e305431484d42b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 15:49:26 +0200 Subject: objtool: Slice up elf_create_section_symbol() In order to facilitate creation of more symbol types, slice up elf_create_section_symbol() to extract a generic helper that deals with adding ELF symbols. 
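After the split, every creator of a new symbol follows the same pattern: fill in a struct symbol, let the generic helper do the symtab/.symtab_shndx index juggling, then register the symbol with objtool's lookup structures. In outline (sketch mirroring the diff below):

	struct symbol *sym = calloc(1, sizeof(*sym));

	if (!sym) {
		perror("malloc");
		return NULL;
	}

	/* caller-specific identity: here a STB_LOCAL STT_SECTION symbol */
	sym->name = sec->name;
	sym->sec = sec;
	sym->sym.st_info = GELF_ST_INFO(STB_LOCAL, STT_SECTION);

	sym = __elf_create_symbol(elf, sym);	/* symtab index, sh_info */
	if (sym)
		elf_add_symbol(elf, sym);	/* hash/rbtree insertion */

This leaves __elf_create_symbol() free to grow support for other symbol types later.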
Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lkml.kernel.org/r/20221028194453.396634875@infradead.org --- tools/objtool/elf.c | 56 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 89b37cd4ab1d..3ad89d963e59 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -717,11 +717,11 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, } static struct symbol * -elf_create_section_symbol(struct elf *elf, struct section *sec) +__elf_create_symbol(struct elf *elf, struct symbol *sym) { struct section *symtab, *symtab_shndx; Elf32_Word first_non_local, new_idx; - struct symbol *sym, *old; + struct symbol *old; symtab = find_section_by_name(elf, ".symtab"); if (symtab) { @@ -731,27 +731,16 @@ elf_create_section_symbol(struct elf *elf, struct section *sec) return NULL; } - sym = calloc(1, sizeof(*sym)); - if (!sym) { - perror("malloc"); - return NULL; - } - - sym->name = sec->name; - sym->sec = sec; + new_idx = symtab->sh.sh_size / symtab->sh.sh_entsize; - // st_name 0 - sym->sym.st_info = GELF_ST_INFO(STB_LOCAL, STT_SECTION); - // st_other 0 - // st_value 0 - // st_size 0 + if (GELF_ST_BIND(sym->sym.st_info) != STB_LOCAL) + goto non_local; /* * Move the first global symbol, as per sh_info, into a new, higher * symbol index. This fees up a spot for a new local symbol. */ first_non_local = symtab->sh.sh_info; - new_idx = symtab->sh.sh_size / symtab->sh.sh_entsize; old = find_symbol_by_index(elf, first_non_local); if (old) { old->idx = new_idx; @@ -769,18 +758,43 @@ elf_create_section_symbol(struct elf *elf, struct section *sec) new_idx = first_non_local; } + /* + * Either way, we will add a LOCAL symbol. + */ + symtab->sh.sh_info += 1; + +non_local: sym->idx = new_idx; if (elf_update_symbol(elf, symtab, symtab_shndx, sym)) { WARN("elf_update_symbol"); return NULL; } - /* - * Either way, we added a LOCAL symbol. - */ - symtab->sh.sh_info += 1; + return sym; +} + +static struct symbol * +elf_create_section_symbol(struct elf *elf, struct section *sec) +{ + struct symbol *sym = calloc(1, sizeof(*sym)); + + if (!sym) { + perror("malloc"); + return NULL; + } - elf_add_symbol(elf, sym); + sym->name = sec->name; + sym->sec = sec; + + // st_name 0 + sym->sym.st_info = GELF_ST_INFO(STB_LOCAL, STT_SECTION); + // st_other 0 + // st_value 0 + // st_size 0 + + sym = __elf_create_symbol(elf, sym); + if (sym) + elf_add_symbol(elf, sym); return sym; } -- cgit v1.2.3 From 13f60e80e15dd0657c90bcca372ba045630ed9de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 20:29:51 +0200 Subject: objtool: Avoid O(bloody terrible) behaviour -- an ode to libelf Due to how gelf_update_sym*() requires an Elf_Data pointer, and how libelf keeps Elf_Data in a linked list per section, elf_update_symbol() ends up having to iterate this list on each update to find the correct Elf_Data for the index'ed symbol. By allocating one Elf_Data per new symbol, the list grows per new symbol, giving an effective O(n^2) insertion time. This is obviously bloody terrible. Therefore over-allocate the Elf_Data when an extention is needed. Except it turns out libelf disregards Elf_Scn::sh_size in favour of the sum of Elf_Data::d_size. IOW it will happily write out all the unused space and fill it with: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND entries (aka zeros). 
Which obviously violates the STB_LOCAL placement rule, and is a general pain in the backside for not being the desired behaviour. Manually fix-up the Elf_Data size to avoid this problem before calling elf_update(). This significantly improves performance when adding a significant number of symbols. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lkml.kernel.org/r/20221028194453.461658986@infradead.org --- tools/objtool/elf.c | 89 ++++++++++++++++++++++++++++++++++--- tools/objtool/include/objtool/elf.h | 2 +- 2 files changed, 84 insertions(+), 7 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 3ad89d963e59..36dc78796f58 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -634,6 +634,12 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, /* end-of-list */ if (!symtab_data) { + /* + * Over-allocate to avoid O(n^2) symbol creation + * behaviour. The down side is that libelf doesn't + * like this; see elf_truncate_section() for the fixup. + */ + int num = max(1U, sym->idx/3); void *buf; if (idx) { @@ -647,28 +653,34 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, if (t) shndx_data = elf_newdata(t); - buf = calloc(1, entsize); + buf = calloc(num, entsize); if (!buf) { WARN("malloc"); return -1; } symtab_data->d_buf = buf; - symtab_data->d_size = entsize; + symtab_data->d_size = num * entsize; symtab_data->d_align = 1; symtab_data->d_type = ELF_T_SYM; - symtab->sh.sh_size += entsize; symtab->changed = true; + symtab->truncate = true; if (t) { - shndx_data->d_buf = &sym->sec->idx; - shndx_data->d_size = sizeof(Elf32_Word); + buf = calloc(num, sizeof(Elf32_Word)); + if (!buf) { + WARN("malloc"); + return -1; + } + + shndx_data->d_buf = buf; + shndx_data->d_size = num * sizeof(Elf32_Word); shndx_data->d_align = sizeof(Elf32_Word); shndx_data->d_type = ELF_T_WORD; - symtab_shndx->sh.sh_size += sizeof(Elf32_Word); symtab_shndx->changed = true; + symtab_shndx->truncate = true; } break; @@ -770,6 +782,14 @@ non_local: return NULL; } + symtab->sh.sh_size += symtab->sh.sh_entsize; + symtab->changed = true; + + if (symtab_shndx) { + symtab_shndx->sh.sh_size += sizeof(Elf32_Word); + symtab_shndx->changed = true; + } + return sym; } @@ -1286,6 +1306,60 @@ int elf_write_reloc(struct elf *elf, struct reloc *reloc) return 0; } +/* + * When Elf_Scn::sh_size is smaller than the combined Elf_Data::d_size + * do you: + * + * A) adhere to the section header and truncate the data, or + * B) ignore the section header and write out all the data you've got? + * + * Yes, libelf sucks and we need to manually truncate if we over-allocate data. 
+ */ +static int elf_truncate_section(struct elf *elf, struct section *sec) +{ + u64 size = sec->sh.sh_size; + bool truncated = false; + Elf_Data *data = NULL; + Elf_Scn *s; + + s = elf_getscn(elf->elf, sec->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return -1; + } + + for (;;) { + /* get next data descriptor for the relevant section */ + data = elf_getdata(s, data); + + if (!data) { + if (size) { + WARN("end of section data but non-zero size left\n"); + return -1; + } + return 0; + } + + if (truncated) { + /* when we remove symbols */ + WARN("truncated; but more data\n"); + return -1; + } + + if (!data->d_size) { + WARN("zero size data"); + return -1; + } + + if (data->d_size > size) { + truncated = true; + data->d_size = size; + } + + size -= data->d_size; + } +} + int elf_write(struct elf *elf) { struct section *sec; @@ -1296,6 +1370,9 @@ int elf_write(struct elf *elf) /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { + if (sec->truncate) + elf_truncate_section(elf, sec); + if (sec->changed) { s = elf_getscn(elf->elf, sec->idx); if (!s) { diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index d28533106b78..9e96a613c50f 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -38,7 +38,7 @@ struct section { Elf_Data *data; char *name; int idx; - bool changed, text, rodata, noinstr, init; + bool changed, text, rodata, noinstr, init, truncate; }; struct symbol { -- cgit v1.2.3 From 9f2899fe36a623885d8576604cb582328ad32b3c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 15:50:42 +0200 Subject: objtool: Add option to generate prefix symbols When code is compiled with: -fpatchable-function-entry=${PADDING_BYTES},${PADDING_BYTES} functions will have PADDING_BYTES of NOP in front of them. Unwinders and other things that symbolize code locations will typically attribute these bytes to the preceding function. Given that these bytes nominally belong to the following symbol this mis-attribution is confusing. Inspired by the fact that CFI_CLANG emits __cfi_##name symbols to claim these bytes, allow objtool to emit __pfx_##name symbols to do the same. Therefore add the objtool --prefix=N argument, to conditionally place a __pfx_##name symbol at N bytes ahead of symbol 'name' when: all these preceding bytes are NOP and name-N is an instruction boundary. 
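With --prefix=16 and -fpatchable-function-entry=16,16 the intended result is, in assembler terms (illustration; 'foo' is a stand-in name):

	__pfx_foo:			/* synthesized by objtool */
		.skip 16, 0x90		/* the compiler's padding NOPs */
	foo:				/* original symbol, unchanged */
		...

so an address inside the padding symbolizes as __pfx_foo+off instead of being attributed to whatever function happens to sit in front of foo.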
Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lkml.kernel.org/r/20221028194453.526899822@infradead.org --- tools/objtool/builtin-check.c | 1 + tools/objtool/check.c | 33 ++++++++++++++++++++++++++++++++- tools/objtool/elf.c | 31 +++++++++++++++++++++++++++++++ tools/objtool/include/objtool/builtin.h | 1 + tools/objtool/include/objtool/elf.h | 2 ++ 5 files changed, 67 insertions(+), 1 deletion(-) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 0a04f8ea4432..95fcecee60ce 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -75,6 +75,7 @@ const struct option check_options[] = { OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), + OPT_INTEGER(0, "prefix", &opts.prefix, "generate prefix symbols"), OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7936312e10c7..27f35f5f831a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3417,7 +3417,8 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ - if (!strncmp(func->name, "__cfi_", 6)) + if (!strncmp(func->name, "__cfi_", 6) || + !strncmp(func->name, "__pfx_", 6)) return 0; WARN("%s() falls through to next function %s()", @@ -3972,6 +3973,34 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio return false; } +static int add_prefix_symbol(struct objtool_file *file, struct symbol *func, + struct instruction *insn) +{ + if (!opts.prefix) + return 0; + + for (;;) { + struct instruction *prev = list_prev_entry(insn, list); + u64 offset; + + if (&prev->list == &file->insn_list) + break; + + if (prev->type != INSN_NOP) + break; + + offset = func->offset - prev->offset; + if (offset >= opts.prefix) { + if (offset == opts.prefix) + elf_create_prefix_symbol(file->elf, func, opts.prefix); + break; + } + insn = prev; + } + + return 0; +} + static int validate_symbol(struct objtool_file *file, struct section *sec, struct symbol *sym, struct insn_state *state) { @@ -3990,6 +4019,8 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, if (!insn || insn->ignore || insn->visited) return 0; + add_prefix_symbol(file, sym, insn); + state->uaccess = sym->uaccess_safe; ret = validate_branch(file, insn_func(insn), insn, *state); diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 36dc78796f58..3d636d12d679 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -819,6 +819,37 @@ elf_create_section_symbol(struct elf *elf, struct section *sec) return sym; } +static int elf_add_string(struct elf *elf, struct section *strtab, char *str); + +struct symbol * +elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size) +{ + struct symbol *sym = calloc(1, sizeof(*sym)); + size_t namelen = strlen(orig->name) + sizeof("__pfx_"); + char *name = malloc(namelen); + + if (!sym || !name) { + perror("malloc"); + return NULL; + } + + snprintf(name, namelen, "__pfx_%s", orig->name); + + sym->name = name; + sym->sec = 
orig->sec; + + sym->sym.st_name = elf_add_string(elf, NULL, name); + sym->sym.st_info = orig->sym.st_info; + sym->sym.st_value = orig->sym.st_value - size; + sym->sym.st_size = size; + + sym = __elf_create_symbol(elf, sym); + if (sym) + elf_add_symbol(elf, sym); + + return sym; +} + int elf_add_reloc_to_insn(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct section *insn_sec, unsigned long insn_off) diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 22092a9f3cf6..f341b620dead 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -26,6 +26,7 @@ struct opts { bool stackval; bool static_call; bool uaccess; + int prefix; /* options: */ bool backtrace; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 9e96a613c50f..b6974e3173aa 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -146,6 +146,8 @@ static inline bool has_multiple_files(struct elf *elf) struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr); +struct symbol *elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size); + int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct symbol *sym, s64 addend); int elf_add_reloc_to_insn(struct elf *elf, struct section *sec, -- cgit v1.2.3 From b341b20d648bb7e9a3307c33163e7399f0913e66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 21:08:19 +0200 Subject: x86: Add prefix symbols for function padding When code is compiled with: -fpatchable-function-entry=${PADDING_BYTES},${PADDING_BYTES} functions will have PADDING_BYTES of NOP in front of them. Unwinders and other things that symbolize code locations will typically attribute these bytes to the preceding function. Given that these bytes nominally belong to the following symbol this mis-attribution is confusing. Inspired by the fact that CFI_CLANG emits __cfi_##name symbols to claim these bytes, use objtool to emit __pfx_##name symbols to do the same when CFI_CLANG is not used. 
This then shows the callthunk for symbol 'name' as: __pfx_##name+0x6/0x10 Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lkml.kernel.org/r/20221028194453.592512209@infradead.org --- arch/x86/Kconfig | 4 ++++ scripts/Makefile.lib | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b52ad13f0f44..32818aa1dca4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2471,6 +2471,10 @@ config CALL_THUNKS def_bool n select FUNCTION_ALIGNMENT_16B +config PREFIX_SYMBOLS + def_bool y + depends on CALL_THUNKS && !CFI_CLANG + menuconfig SPECULATION_MITIGATIONS bool "Mitigations for speculative execution vulnerabilities" default y diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 85f02756dc9c..2e03bcbf2b9b 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -265,6 +265,7 @@ objtool-args-$(CONFIG_STACK_VALIDATION) += --stackval objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE) += --static-call objtool-args-$(CONFIG_HAVE_UACCESS_VALIDATION) += --uaccess objtool-args-$(CONFIG_GCOV_KERNEL) += --no-unreachable +objtool-args-$(CONFIG_PREFIX_SYMBOLS) += --prefix=$(CONFIG_FUNCTION_PADDING_BYTES) objtool-args = $(objtool-args-y) \ $(if $(delay-objtool), --link) \ -- cgit v1.2.3 From 9a479f766be1dd777e12e3e57b6ee4c3028a40a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:13 +0200 Subject: objtool: Add --cfi to generate the .cfi_sites section Add the location of all __cfi_##name symbols (as generated by kCFI) to a section such that we might re-write things at kernel boot. Notably; boot time re-hashing and FineIBT are the intended use of this. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221027092842.568039454@infradead.org --- tools/objtool/builtin-check.c | 1 + tools/objtool/check.c | 69 +++++++++++++++++++++++++++++++++ tools/objtool/include/objtool/builtin.h | 1 + 3 files changed, 71 insertions(+) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 95fcecee60ce..868e3e363786 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -80,6 +80,7 @@ const struct option check_options[] = { OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), + OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 27f35f5f831a..55066c493570 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -861,6 +861,68 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) return 0; } +static int create_cfi_sections(struct objtool_file *file) +{ + struct section *sec, *s; + struct symbol *sym; + unsigned int *loc; + int idx; + + sec = find_section_by_name(file->elf, ".cfi_sites"); + if (sec) { + INIT_LIST_HEAD(&file->call_list); + WARN("file already has .cfi_sites section, skipping"); + return 0; + } + + idx = 0; + for_each_sec(file, s) { + if (!s->text) + continue; + + list_for_each_entry(sym, &s->symbol_list, list) { + if (sym->type != STT_FUNC) + continue; + + if (strncmp(sym->name, "__cfi_", 6)) + continue; + + idx++; + } + } + + sec = elf_create_section(file->elf, ".cfi_sites", 0, sizeof(unsigned int), idx); + if (!sec) + 
return -1; + + idx = 0; + for_each_sec(file, s) { + if (!s->text) + continue; + + list_for_each_entry(sym, &s->symbol_list, list) { + if (sym->type != STT_FUNC) + continue; + + if (strncmp(sym->name, "__cfi_", 6)) + continue; + + loc = (unsigned int *)sec->data->d_buf + idx; + memset(loc, 0, sizeof(unsigned int)); + + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(unsigned int), + R_X86_64_PC32, + s, sym->offset)) + return -1; + + idx++; + } + } + + return 0; +} + static int create_mcount_loc_sections(struct objtool_file *file) { struct section *sec; @@ -4430,6 +4492,13 @@ int check(struct objtool_file *file) warnings += ret; } + if (opts.cfi) { + ret = create_cfi_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } + if (opts.rethunk) { ret = create_return_sites_sections(file); if (ret < 0) diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index f341b620dead..c44ff39df80c 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -27,6 +27,7 @@ struct opts { bool static_call; bool uaccess; int prefix; + bool cfi; /* options: */ bool backtrace; -- cgit v1.2.3 From 931ab63664f02b17d2213ef36b83e1e50190a0aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:14 +0200 Subject: x86/ibt: Implement FineIBT Implement an alternative CFI scheme that merges both the fine-grained nature of kCFI but also takes full advantage of the coarse grained hardware CFI as provided by IBT. To contrast: kCFI is a pure software CFI scheme and relies on being able to read text -- specifically the instruction *before* the target symbol, and does the hash validation *before* doing the call (otherwise control flow is compromised already). FineIBT is a software and hardware hybrid scheme; by ensuring every branch target starts with a hash validation it is possible to place the hash validation after the branch. This has several advantages: o the (hash) load is avoided; no memop; no RX requirement. o IBT WAIT-FOR-ENDBR state is a speculation stop; by placing the hash validation in the immediate instruction after the branch target there is a minimal speculation window and the whole is a viable defence against SpectreBHB. o Kees feels obliged to mention it is slightly more vulnerable when the attacker can write code. Obviously this patch relies on kCFI, but additionally it also relies on the padding from the call-depth-tracking patches. It uses this padding to place the hash-validation while the call-sites are re-written to modify the indirect target to be 16 bytes in front of the original target, thus hitting this new preamble. Notably, there is no hardware that needs call-depth-tracking (Skylake) and supports IBT (Tigerlake and onwards). 
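Conceptually the two schemes validate the same hash on opposite sides of the indirect branch. A rough C model (illustrative only; abort() stands in for ud2/#CP and the names are stand-ins for the asm templates in the diff below):

	#include <stdlib.h>

	static unsigned int expected_hash = 0x12345678;

	/* kCFI: the caller validates *before* the branch; it must read the
	 * hash stored in text just in front of the target (RX requirement). */
	static void kcfi_call(void (*target)(unsigned int),
			      const unsigned int *text_hash)
	{
		if (*text_hash != expected_hash)
			abort();
		target(expected_hash);
	}

	/* FineIBT: the callee validates *after* landing on ENDBR; the hash
	 * arrives in a register (%r10d), so no load from text is needed. */
	static void fineibt_target(unsigned int incoming_hash)
	{
		if (incoming_hash != expected_hash)
			abort();
		/* function proper */
	}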
Suggested-by: Joao Moreira (Intel) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20221027092842.634714496@infradead.org --- arch/um/kernel/um_arch.c | 5 + arch/x86/Kconfig | 14 +- arch/x86/Makefile | 2 +- arch/x86/include/asm/alternative.h | 2 + arch/x86/include/asm/linkage.h | 6 +- arch/x86/kernel/alternative.c | 253 +++++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/module.c | 20 ++- arch/x86/kernel/vmlinux.lds.S | 9 ++ include/linux/bpf.h | 2 +- scripts/Makefile.lib | 1 + 11 files changed, 294 insertions(+), 21 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 8adf8e89b255..786b44dc20c9 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -444,6 +444,11 @@ void apply_returns(s32 *start, s32 *end) { } +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ +} + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 32818aa1dca4..479ee63898f5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2463,17 +2463,27 @@ config FUNCTION_PADDING_BYTES default FUNCTION_PADDING_CFI if CFI_CLANG default FUNCTION_ALIGNMENT +config CALL_PADDING + def_bool n + depends on CC_HAS_ENTRY_PADDING && OBJTOOL + select FUNCTION_ALIGNMENT_16B + +config FINEIBT + def_bool y + depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE + select CALL_PADDING + config HAVE_CALL_THUNKS def_bool y depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL config CALL_THUNKS def_bool n - select FUNCTION_ALIGNMENT_16B + select CALL_PADDING config PREFIX_SYMBOLS def_bool y - depends on CALL_THUNKS && !CFI_CLANG + depends on CALL_PADDING && !CFI_CLANG menuconfig SPECULATION_MITIGATIONS bool "Mitigations for speculative execution vulnerabilities" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1640e005092b..a3a07df8a609 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,7 @@ ifdef CONFIG_SLS KBUILD_CFLAGS += -mharden-sls=all endif -ifdef CONFIG_CALL_THUNKS +ifdef CONFIG_CALL_PADDING PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES) KBUILD_CFLAGS += $(PADDING_CFLAGS) export PADDING_CFLAGS diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 664c0779375c..7659217f4d49 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -78,6 +78,8 @@ extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); extern void apply_returns(s32 *start, s32 *end); extern void apply_ibt_endbr(s32 *start, s32 *end); +extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, + s32 *start_cfi, s32 *end_cfi); struct module; struct paravirt_patch_site; diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 45e0df850645..dd9b8118f784 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -15,7 +15,7 @@ #define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) -#if defined(CONFIG_CALL_THUNKS) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90; #else #define FUNCTION_PADDING @@ -57,7 +57,7 @@ #endif /* __ASSEMBLY__ */ /* - 
* Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_THUNKS) the + * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the * CFI symbol layout changes. * * Without CALL_THUNKS: @@ -81,7 +81,7 @@ * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized. */ -#ifdef CONFIG_CALL_THUNKS +#ifdef CONFIG_CALL_PADDING #define CFI_PRE_PADDING #define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; #else diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b4ac4e58c010..91b0e63a6238 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -116,6 +116,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; @@ -656,6 +657,28 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT +static void poison_endbr(void *addr, bool warn) +{ + u32 endbr, poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + return; + + if (!is_endbr(endbr)) { + WARN_ON_ONCE(warn); + return; + } + + DPRINTK("ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); +} + /* * Generated by: objtool --ibt */ @@ -664,31 +687,232 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) s32 *s; for (s = start; s < end; s++) { - u32 endbr, poison = gen_endbr_poison(); void *addr = (void *)s + *s; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) - continue; + poison_endbr(addr, true); + if (IS_ENABLED(CONFIG_FINEIBT)) + poison_endbr(addr - 16, false); + } +} + +#else + +void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } + +#endif /* CONFIG_X86_KERNEL_IBT */ + +#ifdef CONFIG_FINEIBT +/* + * kCFI FineIBT + * + * __cfi_\func: __cfi_\func: + * movl $0x12345678,%eax // 5 endbr64 // 4 + * nop subl $0x12345678,%r10d // 7 + * nop jz 1f // 2 + * nop ud2 // 2 + * nop 1: nop // 1 + * nop + * nop + * nop + * nop + * nop + * nop + * nop + * + * + * caller: caller: + * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 + * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * je 1f // 2 nop4 // 4 + * ud2 // 2 + * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * + */ + +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + " je fineibt_preamble_end \n" + " ud2 \n" + " nop \n" + "fineibt_preamble_end: \n" + ".popsection\n" +); + +extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_end[]; + +#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_hash 7 + +asm( ".pushsection .rodata \n" + "fineibt_caller_start: \n" + " movl $0x12345678, %r10d \n" + " sub $16, %r11 \n" + ASM_NOP4 + "fineibt_caller_end: \n" + ".popsection \n" +); + +extern u8 fineibt_caller_start[]; +extern u8 fineibt_caller_end[]; + +#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) +#define fineibt_caller_hash 2 + +#define fineibt_caller_jmp (fineibt_caller_size - 2) + +static u32 
decode_preamble_hash(void *addr) +{ + u8 *p = addr; + + /* b8 78 56 34 12 mov $0x12345678,%eax */ + if (p[0] == 0xb8) + return *(u32 *)(addr + 1); + + return 0; /* invalid hash value */ +} + +static u32 decode_caller_hash(void *addr) +{ + u8 *p = addr; + + /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + if (p[0] == 0x41 && p[1] == 0xba) + return -*(u32 *)(addr + 2); + + /* e8 0c 78 56 34 12 jmp.d8 +12 */ + if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) + return -*(u32 *)(addr + 2); + + return 0; /* invalid hash value */ +} + +/* .retpoline_sites */ +static int cfi_disable_callers(s32 *start, s32 *end) +{ + /* + * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate + * in tact for later usage. Also see decode_caller_hash() and + * cfi_rewrite_callers(). + */ + const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; + s32 *s; - if (WARN_ON_ONCE(!is_endbr(endbr))) + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + text_poke_early(addr, jmp, 2); + } - /* - * When we have IBT, the lack of ENDBR will trigger #CP - */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + return 0; +} + +/* .cfi_sites */ +static int cfi_rewrite_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); } + + return 0; +} + +/* .retpoline_sites */ +static int cfi_rewrite_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + } + /* rely on apply_retpolines() */ + } + + return 0; +} + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ + int ret; + + if (WARN_ONCE(fineibt_preamble_size != 16, + "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) + return; + + if (!HAS_KERNEL_IBT || !cpu_feature_enabled(X86_FEATURE_IBT)) + return; + + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. 
+ */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Using FineIBT CFI\n"); + + return; + +err: + pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } #else -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ +} -#endif /* CONFIG_X86_KERNEL_IBT */ +#endif + +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ + return __apply_fineibt(start_retpoline, end_retpoline, + start_cfi, end_cfi, + /* .builtin = */ false); +} #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, @@ -996,6 +1220,9 @@ void __init alternative_instructions(void) */ apply_paravirt(__parainstructions, __parainstructions_end); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, + __cfi_sites, __cfi_sites_end, true); + /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2bec4b4b2c50..423a760fa9de 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -609,6 +609,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) if (!ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); + wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); return; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 2fb9de2cef40..0142982e94c5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -255,7 +255,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, - *calls = NULL; + *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -277,6 +277,8 @@ int module_finalize(const Elf_Ehdr *hdr, returns = s; if (!strcmp(".call_sites", secstrings + s->sh_name)) calls = s; + if (!strcmp(".cfi_sites", secstrings + s->sh_name)) + cfi = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) ibt_endbr = s; } @@ -289,6 +291,22 @@ int module_finalize(const Elf_Ehdr *hdr, void *pseg = (void *)para->sh_addr; apply_paravirt(pseg, pseg + para->sh_size); } + if (retpolines || cfi) { + void *rseg = NULL, *cseg = NULL; + unsigned int rsize = 0, csize = 0; + + if (retpolines) { + rseg = (void *)retpolines->sh_addr; + rsize = retpolines->sh_size; + } + + if (cfi) { + cseg = (void *)cfi->sh_addr; + csize = cfi->sh_size; + } + + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 49f3f86433c7..2e0ee14229bf 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -309,6 +309,15 @@ SECTIONS } #endif +#ifdef CONFIG_FINEIBT + . = ALIGN(8); + .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) { + __cfi_sites = .; + *(.cfi_sites) + __cfi_sites_end = .; + } +#endif + /* * struct alt_inst entries. 
From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5296aea9b5b4..923a3d508047 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -984,7 +984,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } #ifdef CONFIG_X86_64 -#ifdef CONFIG_CALL_THUNKS +#ifdef CONFIG_CALL_PADDING #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5+CONFIG_FUNCTION_PADDING_BYTES,CONFIG_FUNCTION_PADDING_BYTES))) #else #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 2e03bcbf2b9b..2b2fab705a63 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -256,6 +256,7 @@ objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr objtool-args-$(CONFIG_CALL_DEPTH_TRACKING) += --hacks=skylake objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt +objtool-args-$(CONFIG_FINEIBT) += --cfi objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount objtool-args-$(CONFIG_UNWINDER_ORC) += --orc objtool-args-$(CONFIG_RETPOLINE) += --retpoline -- cgit v1.2.3 From 082c4c815252ea333b0f3a51e336df60c2314fe2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:15 +0200 Subject: x86/cfi: Boot time selection of CFI scheme Add the "cfi=" boot parameter to allow people to select a CFI scheme at boot time. Mostly useful for development / debugging. Requested-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20221027092842.699804264@infradead.org --- arch/x86/kernel/alternative.c | 99 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 91b0e63a6238..9d3b58748ca5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -702,6 +702,47 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_FINEIBT + +enum cfi_mode { + CFI_DEFAULT, + CFI_OFF, + CFI_KCFI, + CFI_FINEIBT, +}; + +static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; + +static __init int cfi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "auto")) { + cfi_mode = CFI_DEFAULT; + } else if (!strcmp(str, "off")) { + cfi_mode = CFI_OFF; + } else if (!strcmp(str, "kcfi")) { + cfi_mode = CFI_KCFI; + } else if (!strcmp(str, "fineibt")) { + cfi_mode = CFI_FINEIBT; + } else { + pr_err("Ignoring unknown cfi option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("cfi", cfi_parse_cmdline); + /* * kCFI FineIBT * @@ -868,30 +909,52 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) return; - if (!HAS_KERNEL_IBT || !cpu_feature_enabled(X86_FEATURE_IBT)) + if (cfi_mode == CFI_DEFAULT) { + cfi_mode = CFI_KCFI; + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + cfi_mode = CFI_FINEIBT; + } + + switch (cfi_mode) { + case CFI_OFF: + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Disabling CFI\n"); return; - /* - * Rewrite the callers to not use the __cfi_ stubs, such that we might 
- * rewrite them. This disables all CFI. If this succeeds but any of the - * later stages fails, we're without CFI. - */ - ret = cfi_disable_callers(start_retpoline, end_retpoline); - if (ret) - goto err; + case CFI_KCFI: + if (builtin) + pr_info("Using kCFI\n"); + return; - ret = cfi_rewrite_preamble(start_cfi, end_cfi); - if (ret) - goto err; + case CFI_FINEIBT: + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. + */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; - ret = cfi_rewrite_callers(start_retpoline, end_retpoline); - if (ret) - goto err; + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; - if (builtin) - pr_info("Using FineIBT CFI\n"); + if (builtin) + pr_info("Using FineIBT CFI\n"); + return; - return; + default: + break; + } err: pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); -- cgit v1.2.3 From 0c3e806ec0f9771fa1f34c60499097d9260a8bb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:16 +0200 Subject: x86/cfi: Add boot time hash randomization In order to avoid known hashes (from knowing the boot image), randomize the CFI hashes with a per-boot random seed. Suggested-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20221027092842.765195516@infradead.org --- arch/x86/kernel/alternative.c | 120 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 108 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9d3b58748ca5..aa7f791585c5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -711,6 +711,24 @@ enum cfi_mode { }; static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; +static bool cfi_rand __ro_after_init = true; +static u32 cfi_seed __ro_after_init; + +/* + * Re-hash the CFI hash with a boot-time seed while making sure the result is + * not a valid ENDBR instruction. + */ +static u32 cfi_rehash(u32 hash) +{ + hash ^= cfi_seed; + while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + bool lsb = hash & 1; + hash >>= 1; + if (lsb) + hash ^= 0x80200003; + } + return hash; +} static __init int cfi_parse_cmdline(char *str) { @@ -728,10 +746,13 @@ static __init int cfi_parse_cmdline(char *str) cfi_mode = CFI_DEFAULT; } else if (!strcmp(str, "off")) { cfi_mode = CFI_OFF; + cfi_rand = false; } else if (!strcmp(str, "kcfi")) { cfi_mode = CFI_KCFI; } else if (!strcmp(str, "fineibt")) { cfi_mode = CFI_FINEIBT; + } else if (!strcmp(str, "norand")) { + cfi_rand = false; } else { pr_err("Ignoring unknown cfi option (%s).", str); } @@ -856,7 +877,50 @@ static int cfi_disable_callers(s32 *start, s32 *end) return 0; } +static int cfi_enable_callers(s32 *start, s32 *end) +{ + /* + * Re-enable kCFI, undo what cfi_disable_callers() did. 
+ */ + const u8 mov[] = { 0x41, 0xba }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ + continue; + + text_poke_early(addr, mov, 2); + } + + return 0; +} + /* .cfi_sites */ +static int cfi_rand_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + hash = cfi_rehash(hash); + text_poke_early(addr + 1, &hash, 4); + } + + return 0; +} + static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; @@ -879,6 +943,25 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) } /* .retpoline_sites */ +static int cfi_rand_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + hash = -cfi_rehash(hash); + text_poke_early(addr + 2, &hash, 4); + } + } + + return 0; +} + static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; @@ -915,31 +998,44 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_mode = CFI_FINEIBT; } - switch (cfi_mode) { - case CFI_OFF: - ret = cfi_disable_callers(start_retpoline, end_retpoline); + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. + */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (cfi_rand) { + if (builtin) + cfi_seed = get_random_u32(); + + ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; + ret = cfi_rand_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + } + + switch (cfi_mode) { + case CFI_OFF: if (builtin) pr_info("Disabling CFI\n"); return; case CFI_KCFI: + ret = cfi_enable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + if (builtin) pr_info("Using kCFI\n"); return; case CFI_FINEIBT: - /* - * Rewrite the callers to not use the __cfi_ stubs, such that we might - * rewrite them. This disables all CFI. If this succeeds but any of the - * later stages fails, we're without CFI. - */ - ret = cfi_disable_callers(start_retpoline, end_retpoline); - if (ret) - goto err; - ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; -- cgit v1.2.3 From 19526717f768bf2f89ca01bd2a595728ebe57540 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Nov 2022 22:31:19 +0100 Subject: objtool: Optimize elf_dirty_reloc_sym() When moving a symbol in the symtab its index changes and any reloc referring that symtol-table-index will need to be rewritten too. In order to facilitate this, objtool simply marks the whole reloc section 'changed' which will cause the whole section to be re-generated. However, finding the relocs that use any given symbol is implemented rather crudely -- a fully iteration of all sections and their relocs. Given that some builds have over 20k sections (kallsyms etc..) iterating all that for *each* symbol moved takes a bit of time. Instead have each symbol keep a list of relocs that reference it. This *vastly* improves build times for certain configs. 
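In outline the change is (sketch; the 'after' loop is what the diff below installs):

	/* before: scan every reloc of every section, per moved symbol */
	list_for_each_entry(sec, &elf->sections, list)
		list_for_each_entry(reloc, &sec->reloc_list, list)
			if (reloc->sym == sym)
				sec->changed = true;

	/* after: each symbol carries its own list of referencing relocs */
	list_for_each_entry(reloc, &sym->reloc_list, sym_reloc_entry)
		reloc->sec->changed = true;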
Reported-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y2LlRA7x+8UsE1xf@hirez.programming.kicks-ass.net --- tools/objtool/elf.c | 27 ++++++++++----------------- tools/objtool/include/objtool/elf.h | 2 ++ 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 3d636d12d679..8cd7f018002c 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -356,6 +356,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) struct rb_node *pnode; struct symbol *iter; + INIT_LIST_HEAD(&sym->reloc_list); INIT_LIST_HEAD(&sym->pv_target); sym->alias = sym; @@ -557,6 +558,7 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, reloc->sym = sym; reloc->addend = addend; + list_add_tail(&reloc->sym_reloc_entry, &sym->reloc_list); list_add_tail(&reloc->list, &sec->reloc->reloc_list); elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc)); @@ -573,21 +575,10 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, */ static void elf_dirty_reloc_sym(struct elf *elf, struct symbol *sym) { - struct section *sec; - - list_for_each_entry(sec, &elf->sections, list) { - struct reloc *reloc; - - if (sec->changed) - continue; + struct reloc *reloc; - list_for_each_entry(reloc, &sec->reloc_list, list) { - if (reloc->sym == sym) { - sec->changed = true; - break; - } - } - } + list_for_each_entry(reloc, &sym->reloc_list, sym_reloc_entry) + reloc->sec->changed = true; } /* @@ -902,11 +893,12 @@ static int read_rela_reloc(struct section *sec, int i, struct reloc *reloc, unsi static int read_relocs(struct elf *elf) { + unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; struct section *sec; struct reloc *reloc; - int i; unsigned int symndx; - unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; + struct symbol *sym; + int i; if (!elf_alloc_hash(reloc, elf->text_size / 16)) return -1; @@ -947,13 +939,14 @@ static int read_relocs(struct elf *elf) reloc->sec = sec; reloc->idx = i; - reloc->sym = find_symbol_by_index(elf, symndx); + reloc->sym = sym = find_symbol_by_index(elf, symndx); if (!reloc->sym) { WARN("can't find reloc entry symbol %d for %s", symndx, sec->name); return -1; } + list_add_tail(&reloc->sym_reloc_entry, &sym->reloc_list); list_add_tail(&reloc->list, &sec->reloc_list); elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc)); diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index b6974e3173aa..bca719b2104b 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -62,6 +62,7 @@ struct symbol { u8 fentry : 1; u8 profiling_func : 1; struct list_head pv_target; + struct list_head reloc_list; }; struct reloc { @@ -73,6 +74,7 @@ struct reloc { }; struct section *sec; struct symbol *sym; + struct list_head sym_reloc_entry; unsigned long offset; unsigned int type; s64 addend; -- cgit v1.2.3 From 023f2340f053537cce170c31c430b0886c6f07ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Nov 2022 20:57:51 +0100 Subject: objtool: Fix weak hole vs prefix symbol Boris (and the robot) reported that objtool grew a new complaint about unreachable instructions. Upon inspection it was immediately clear the __weak zombie instructions struck again. For the unweary, the linker will simply remove the symbol for overriden __weak symbols but leave the instructions in place, creating unreachable instructions -- and objtool likes to report these. 
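A minimal shape of the problem, for illustration (file and function names hypothetical):

	/* lib/foo.c */
	void __weak foo(void) { /* default implementation */ }

	/* arch/x86/kernel/foo.c */
	void foo(void) { /* strong override, wins at link time */ }

The linker resolves every reference to the strong foo() and drops the weak symbol, but the weak body's instruction bytes stay behind in .text as a nameless, unreachable island.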
Commit 4adb23686795 ("objtool: Ignore extra-symbol code") was supposed to have dealt with that, but the new commit 9f2899fe36a6 ("objtool: Add option to generate prefix symbols") subtly broke that logic by creating unvisited symbols. Fixes: 9f2899fe36a6 ("objtool: Add option to generate prefix symbols") Reported-by: Borislav Petkov Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) --- tools/objtool/check.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 55066c493570..4f1a7384426b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4053,8 +4053,28 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func, offset = func->offset - prev->offset; if (offset >= opts.prefix) { - if (offset == opts.prefix) + if (offset == opts.prefix) { + /* + * Since the sec->symbol_list is ordered by + * offset (see elf_add_symbol()) the added + * symbol will not be seen by the iteration in + * validate_section(). + * + * Hence the lack of list_for_each_entry_safe() + * there. + * + * The direct consequence is that prefix symbols + * don't get visited (because pointless), except + * for the logic in ignore_unreachable_insn() + * that needs the terminating insn to be visited + * otherwise it will report the hole. + * + * Hence mark the first instruction of the + * prefix symbol as visited. + */ + prev->visited |= VISITED_BRANCH; elf_create_prefix_symbol(file->elf, func, opts.prefix); + } break; } insn = prev; -- cgit v1.2.3 From b32fd8a60f5d855758208c2b5b49cba8087f03c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Nov 2022 21:17:03 +0100 Subject: x86,pm: Force out-of-line memcpy() GCC fancies inlining memcpy(), and because it cannot prove the destination is page-aligned (it is) it ends up generating atrocious code like: 19e: 48 8b 15 00 00 00 00 mov 0x0(%rip),%rdx # 1a5 1a1: R_X86_64_PC32 core_restore_code-0x4 1a5: 48 8d 78 08 lea 0x8(%rax),%rdi 1a9: 48 89 c1 mov %rax,%rcx 1ac: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1af: R_X86_64_32S core_restore_code 1b3: 48 83 e7 f8 and $0xfffffffffffffff8,%rdi 1b7: 48 29 f9 sub %rdi,%rcx 1ba: 48 89 10 mov %rdx,(%rax) 1bd: 48 8b 15 00 00 00 00 mov 0x0(%rip),%rdx # 1c4 1c0: R_X86_64_PC32 core_restore_code+0xff4 1c4: 48 29 ce sub %rcx,%rsi 1c7: 81 c1 00 10 00 00 add $0x1000,%ecx 1cd: 48 89 90 f8 0f 00 00 mov %rdx,0xff8(%rax) 1d4: c1 e9 03 shr $0x3,%ecx 1d7: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) Notably the alignment code generates a text reference to core_restore_code+0xff8, for which objtool raises the objection: vmlinux.o: warning: objtool: relocate_restore_code+0x3d: relocation to !ENDBR: next_arg+0x18 Applying some __assume_aligned(PAGE_SIZE) improves code-gen to: 19e: 48 89 c7 mov %rax,%rdi 1a1: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1a4: R_X86_64_32S core_restore_code 1a8: b9 00 02 00 00 mov $0x200,%ecx 1ad: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) That resolves the problem; however, none of this is important code and a much simpler solution still is to force a memcpy() call: 1a1: ba 00 10 00 00 mov $0x1000,%edx 1a6: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1a9: R_X86_64_32S core_restore_code 1ad: e8 00 00 00 00 call 1b2 1ae: R_X86_64_PLT32 __memcpy-0x4 Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index e94e0050a583..6f955eb1e163 100644 --- a/arch/x86/power/hibernate.c +++
b/arch/x86/power/hibernate.c @@ -159,7 +159,7 @@ int relocate_restore_code(void) if (!relocated_restore_code) return -ENOMEM; - memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); + __memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3_pa()) + -- cgit v1.2.3 From 4fd5f70ce14da230c6a29648c3d51a48ee0b4bfd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 1 Nov 2022 10:25:07 -0700 Subject: x86/Kconfig: Enable kernel IBT by default The kernel IBT defense strongly mitigates the common "first step" of ROP attacks, by eliminating arbitrary stack pivots (that appear either at the end of a function or in immediate values), which cannot be reached if indirect calls must be to marked function entry addresses. IBT is also required to be enabled to gain the FineIBT feature when built with Kernel Control Flow Integrity. Additionally, given that this feature is runtime-enabled via CPUID, it clearly should be built in by default; it will only be enabled if the CPU supports it. The build takes 2 seconds longer, which seems a small price to pay for gaining this coverage by default. Suggested-by: Sami Tolvanen Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221101172503.gonna.094-kees@kernel.org --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 479ee63898f5..aaf1f0f46161 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1856,7 +1856,7 @@ config CC_HAS_IBT config X86_KERNEL_IBT prompt "Indirect Branch Tracking" - bool + def_bool y depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f depends on !LD_IS_LLD || LLD_VERSION >= 140000 -- cgit v1.2.3 From b1599915f09157e98f59556e1b2eafe473603347 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 6 Nov 2022 09:55:56 +0100 Subject: x86/cpufeatures: Move X86_FEATURE_CALL_DEPTH from bit 18 to bit 19 of word 11, to leave space for WIP X86_FEATURE_SGX_EDECCSSA bit Reallocate a soft-cpufeatures bit allocated for call-depth tracking code, which clashes with this recent KVM/SGX patch being worked on: KVM/VMX: Allow exposing EDECCSSA user leaf function to KVM guest Instead of reallocating cpufeatures bits in evil merges, make the allocation explicit.
Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index aefd0816a333..864c9b0dda68 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -304,7 +304,8 @@ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ -#define X86_FEATURE_CALL_DEPTH (11*32+18) /* "" Call depth tracking for RSB stuffing */ + /* Hole left for X86_FEATURE_SGX_EDECCSSA */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ -- cgit v1.2.3 From 2d08a893b87cf9b2f9dbb3afaff60ca4530d55a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Nov 2022 20:17:07 +0000 Subject: x86/debug: Include percpu.h in debugreg.h to get DECLARE_PER_CPU() et al MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include percpu.h to pick up the definition of DECLARE_PER_CPU() and friends instead of relying on the parent to provide the #include. E.g. swapping the order of includes in arch/x86/kvm/vmx/nested.c (simulating KVM code movement being done for other purposes) results in build errors: In file included from arch/x86/kvm/vmx/nested.c:3: arch/x86/include/asm/debugreg.h:9:32: error: unknown type name 'cpu_dr7' 9 | DECLARE_PER_CPU(unsigned long, cpu_dr7); | ^~~~~~~ Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221110201707.1976032-1-seanjc@google.com --- arch/x86/include/asm/debugreg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index cfdf307ddc01..b049d950612f 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -2,8 +2,8 @@ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H - #include +#include #include DECLARE_PER_CPU(unsigned long, cpu_dr7); -- cgit v1.2.3 From 5736b1b70170e15d66ec02e500db917ef42ade83 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 3 Sep 2022 00:37:06 -0700 Subject: x86/paravirt: Remove clobber bitmask from .parainstructions The u16 "clobber" value is not used in .parainstructions since commit 27876f3882fd ("x86/paravirt: Remove clobbers from struct paravirt_patch_site"). Remove the u16 from the section macro, the argument from all macros, and all now-unused CLBR_* macros.
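In sketch form, each entry emitted into .parainstructions shrinks as follows (layout illustrative; cf. struct paravirt_patch_site):

	/* before */
	struct { void *instr; u8 type; u8 len; u16 clobbers; };

	/* after (plus _ASM_ALIGN padding) */
	struct { void *instr; u8 type; u8 len; };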
Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220903073706.3193746-1-keescook@chromium.org --- arch/x86/include/asm/paravirt_types.h | 61 +++++++---------------------------- 1 file changed, 12 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 27c692791b7e..8c1da419260f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -20,37 +20,6 @@ enum paravirt_lazy_mode { #ifdef CONFIG_PARAVIRT -/* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) - -#endif /* X86_64 */ - #ifndef __ASSEMBLY__ #include @@ -297,27 +266,23 @@ extern struct paravirt_patch_template pv_ops; #define paravirt_type(op) \ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ [paravirt_opptr] "m" (pv_ops.op) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - /* * Generate some code, and mark it as patchable by the * apply_paravirt() alternate instruction patcher. */ -#define _paravirt_alt(insn_string, type, clobber) \ +#define _paravirt_alt(insn_string, type) \ "771:\n\t" insn_string "\n" "772:\n" \ ".pushsection .parainstructions,\"a\"\n" \ _ASM_ALIGN "\n" \ _ASM_PTR " 771b\n" \ " .byte " type "\n" \ " .byte 772b-771b\n" \ - " .short " clobber "\n" \ _ASM_ALIGN "\n" \ ".popsection\n" /* Generate patchable code, with the default asm parameters. */ #define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + _paravirt_alt(insn_string, "%c[paravirt_typenum]") /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" @@ -469,20 +434,19 @@ int paravirt_disable_iospace(void); }) -#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ +#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ asm volatile(paravirt_alt(PARAVIRT_CALL) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) -#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \ +#define ____PVOP_ALT_CALL(ret, op, alt, cond, call_clbr, \ extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ @@ -491,45 +455,44 @@ int paravirt_disable_iospace(void); alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) #define __PVOP_CALL(rettype, op, ...) 
\ - ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \ - ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_CALLEESAVE(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, \ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \ - CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_VCALL(op, ...) \ - (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + (void)____PVOP_CALL(, op, PVOP_VCALL_CLOBBERS, \ VEXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_VCALL(op, alt, cond, ...) \ - (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \ + (void)____PVOP_ALT_CALL(, op, alt, cond, \ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_VCALLEESAVE(op, ...) \ - (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \ + (void)____PVOP_CALL(, op.func, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \ - (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \ + (void)____PVOP_ALT_CALL(, op.func, alt, cond, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) -- cgit v1.2.3 From f1a033cc6b9eb6d80322008422df3c87aa5d47a0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 9 Nov 2022 14:44:18 +0100 Subject: x86/paravirt: Use common macro for creating simple asm paravirt functions There are some paravirt assembler functions which share a common pattern. Introduce a macro DEFINE_PARAVIRT_ASM() for creating them. Note that this macro includes explicit alignment of the generated functions, leading to __raw_callee_save___kvm_vcpu_is_preempted(), _paravirt_nop() and paravirt_ret0() being aligned at 4 byte boundaries now. The explicit _paravirt_nop() prototype in paravirt.c isn't needed, as it is included in paravirt_types.h already. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srivatsa S. Bhat (VMware) Link: https://lkml.kernel.org/r/20221109134418.6516-1-jgross@suse.com --- arch/x86/include/asm/paravirt.h | 12 ++++++++ arch/x86/include/asm/qspinlock_paravirt.h | 47 +++++++++++++------------------ arch/x86/kernel/kvm.c | 19 ++++--------- arch/x86/kernel/paravirt.c | 23 ++------------- 4 files changed, 40 insertions(+), 61 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2851bc2339d5..73e9522db7c1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -731,6 +731,18 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 +#define DEFINE_PARAVIRT_ASM(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", .
- " #func "\n\t" \ + ".popsection") + extern void default_banner(void); #else /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index d861127731f4..42b17cf10b10 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -14,8 +14,6 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define __pv_queued_spin_unlock __pv_queued_spin_unlock -#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" -#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" /* * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock @@ -37,32 +35,27 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * rsi = lockval (second argument) * rdx = internal variable (set to 0) */ -asm (".pushsection .spinlock.text, \"ax\";" - ".globl " PV_UNLOCK ";" - ".type " PV_UNLOCK ", @function;" - ASM_FUNC_ALIGN - PV_UNLOCK ": " - ASM_ENDBR - FRAME_BEGIN - "push %rdx;" - "mov $0x1,%eax;" - "xor %edx,%edx;" - LOCK_PREFIX "cmpxchg %dl,(%rdi);" - "cmp $0x1,%al;" - "jne .slowpath;" - "pop %rdx;" +#define PV_UNLOCK_ASM \ + FRAME_BEGIN \ + "push %rdx\n\t" \ + "mov $0x1,%eax\n\t" \ + "xor %edx,%edx\n\t" \ + LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ + "cmp $0x1,%al\n\t" \ + "jne .slowpath\n\t" \ + "pop %rdx\n\t" \ + FRAME_END \ + ASM_RET \ + ".slowpath:\n\t" \ + "push %rsi\n\t" \ + "movzbl %al,%esi\n\t" \ + "call __raw_callee_save___pv_queued_spin_unlock_slowpath\n\t" \ + "pop %rsi\n\t" \ + "pop %rdx\n\t" \ FRAME_END - ASM_RET - ".slowpath: " - "push %rsi;" - "movzbl %al,%esi;" - "call " PV_UNLOCK_SLOWPATH ";" - "pop %rsi;" - "pop %rdx;" - FRAME_END - ASM_RET - ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" - ".popsection"); + +DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 95fb85bea111..4d053cb2c48a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -798,20 +798,13 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and * restoring to/from the stack. */ -asm( -".pushsection .text;" -".global __raw_callee_save___kvm_vcpu_is_preempted;" -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" -ASM_FUNC_ALIGN -"__raw_callee_save___kvm_vcpu_is_preempted:" -ASM_ENDBR -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" -"setne %al;" -ASM_RET -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" -".popsection"); +#define PV_VCPU_PREEMPTED_ASM \ + "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \ + "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ + "setne %al\n\t" +DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, + PV_VCPU_PREEMPTED_ASM, .text); #endif static void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e244c49b52d7..327757afb027 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -37,29 +37,10 @@ * nop stub, which must not clobber anything *including the stack* to * avoid confusing the entry prologues. 
*/ -extern void _paravirt_nop(void); -asm (".pushsection .entry.text, \"ax\"\n" - ".global _paravirt_nop\n" - ASM_FUNC_ALIGN - "_paravirt_nop:\n\t" - ASM_ENDBR - ASM_RET - ".size _paravirt_nop, . - _paravirt_nop\n\t" - ".type _paravirt_nop, @function\n\t" - ".popsection"); +DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text); /* stub always returning 0. */ -asm (".pushsection .entry.text, \"ax\"\n" - ".global paravirt_ret0\n" - ASM_FUNC_ALIGN - "paravirt_ret0:\n\t" - ASM_ENDBR - "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" - ASM_RET - ".size paravirt_ret0, . - paravirt_ret0\n\t" - ".type paravirt_ret0, @function\n\t" - ".popsection"); - +DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { -- cgit v1.2.3
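For reference, usage of the new macro in sketch form (hypothetical stub name; the real conversions are in the hunks above, and passing "" as the body yields a bare ENDBR+RET stub like _paravirt_nop):

	/* Instruction body as an asm string: */
	#define PV_RET0_ASM \
		"xor %eax,%eax"

	/* Emits an aligned, ENDBR-prefixed, RET-terminated function in .entry.text: */
	DEFINE_PARAVIRT_ASM(pv_ret0_stub, PV_RET0_ASM, .entry.text);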