From 93efbde2c331004d8053f04b4bf0ca3e630b474a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 20 Nov 2019 22:12:38 -0800
Subject: x86/traps: Disentangle the 32-bit and 64-bit doublefault code

The 64-bit doublefault handler is much nicer than the 32-bit one.  As a
first step toward unifying them, make the 64-bit handler self-contained.

This should have no functional effect except in the odd case of an
x86_64 kernel built with CONFIG_DOUBLEFAULT=n, in which case it will
change the logging a bit.

This also gets rid of CONFIG_DOUBLEFAULT configurability on 64-bit
kernels.  It didn't do anything useful -- CONFIG_DOUBLEFAULT=n didn't
actually disable doublefault handling on x86_64.

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Linus Torvalds
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/processor.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e51afbb0cbfb..f6c630097d9f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -997,7 +997,6 @@ bool xen_set_default_idle(void);
 #endif
 
 void stop_this_cpu(void *dummy);
-void df_debug(struct pt_regs *regs, long error_code);
 void microcode_check(void);
 
 enum l1tf_mitigations {
-- cgit v1.2.3

From dc4e0021b00b5a4ecba56fae509217776592b0aa Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 26 Nov 2019 18:27:16 +0100
Subject: x86/doublefault/32: Move #DF stack and TSS to cpu_entry_area

There are three problems with the current layout of the doublefault
stack and TSS.  First, the TSS is only cacheline-aligned, which is not
enough -- if the hardware portion of the TSS (struct x86_hw_tss)
crosses a page boundary, horrible things happen [0].  Second, the stack
and TSS are global, so simultaneous double faults on different CPUs
will cause massive corruption.  Third, the whole mechanism won't work
if user CR3 is loaded, resulting in a triple fault [1].

Let the doublefault stack and TSS share a page (which prevents the TSS
from spanning a page boundary), make it percpu, and move it into
cpu_entry_area.  Teach the stack dump code about the doublefault stack.

[0] Real hardware will read past the end of the page onto the next
    *physical* page if a task switch happens.  Virtual machines may
    have any number of bugs, and I would consider it reasonable for a
    VM to summarily kill the guest if it tries to task-switch to a
    page-spanning TSS.

[1] Real hardware triple faults.  At least some VMs seem to hang.
    I'm not sure what's going on.
Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Linus Torvalds
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/cpu_entry_area.h   | 12 +++++++
 arch/x86/include/asm/doublefault.h      | 13 ++++++++
 arch/x86/include/asm/pgtable_32_types.h |  7 ++--
 arch/x86/include/asm/processor.h        |  1 -
 arch/x86/kernel/cpu/common.c            | 12 ++-----
 arch/x86/kernel/doublefault_32.c        | 58 ++++++++++++++++++++++-----------
 arch/x86/kernel/dumpstack_32.c          | 30 +++++++++++++++++
 arch/x86/mm/cpu_entry_area.c            | 14 +++++++-
 8 files changed, 113 insertions(+), 34 deletions(-)
 create mode 100644 arch/x86/include/asm/doublefault.h

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index ea866c7bf31d..804734058c77 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -65,6 +65,13 @@ enum exception_stack_ordering {
 
 #endif
 
+#ifdef CONFIG_X86_32
+struct doublefault_stack {
+	unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)];
+	struct x86_hw_tss tss;
+} __aligned(PAGE_SIZE);
+#endif
+
 /*
  * cpu_entry_area is a percpu region that contains things needed by the CPU
  * and early entry/exit code.  Real types aren't used for all fields here
@@ -86,6 +93,11 @@ struct cpu_entry_area {
 #endif
 
 	struct entry_stack_page entry_stack_page;
 
+#ifdef CONFIG_X86_32
+	char guard_doublefault_stack[PAGE_SIZE];
+	struct doublefault_stack doublefault_stack;
+#endif
+
 	/*
	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
	 * we need task switches to work, and task switches write to the TSS.
diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h
new file mode 100644
index 000000000000..af9a14ac8962
--- /dev/null
+++ b/arch/x86/include/asm/doublefault.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DOUBLEFAULT_H
+#define _ASM_X86_DOUBLEFAULT_H
+
+#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
+extern void doublefault_init_cpu_tss(void);
+#else
+static inline void doublefault_init_cpu_tss(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_DOUBLEFAULT_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 19f5807260c3..0416d42e5bdd 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -41,10 +41,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
 #endif
 
 /*
- * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
- * to avoid include recursion hell
+ * This is an upper bound on sizeof(struct cpu_entry_area) / PAGE_SIZE.
+ * Define this here and validate with BUILD_BUG_ON() in cpu_entry_area.c
+ * to avoid include recursion hell.
  */
-#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 41)
+#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 43)
 
 /* The +1 is for the readonly IDT page: */
 #define CPU_ENTRY_AREA_BASE	\
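
As an aside on the layout trick above: because the stack array is sized
as PAGE_SIZE minus sizeof(struct x86_hw_tss), the stack and the hardware
TSS together fill exactly one page, so a page-aligned instance can never
have its TSS cross a page boundary.  A minimal stand-alone sketch of the
same arithmetic (hypothetical names; hw_tss is a stand-in for struct
x86_hw_tss, whose 104-byte size is a multiple of the word size on both
32- and 64-bit):

  #include <stdint.h>

  #define PAGE_SIZE 4096

  struct hw_tss { uint32_t fields[26]; };	/* stand-in: 104 bytes, like x86_hw_tss */

  struct df_page {
	unsigned long stack[(PAGE_SIZE - sizeof(struct hw_tss)) / sizeof(unsigned long)];
	struct hw_tss tss;
  } __attribute__((aligned(PAGE_SIZE)));

  /* Exactly one page: a page-aligned instance cannot straddle a page boundary. */
  _Static_assert(sizeof(struct df_page) == PAGE_SIZE, "stack + TSS must fill one page");

This is the same invariant the patch enforces at build time with
BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE).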
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f6c630097d9f..0340aad3f2fc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -166,7 +166,6 @@ enum cpuid_regs_idx {
 
 extern struct cpuinfo_x86 boot_cpu_data;
 extern struct cpuinfo_x86 new_cpu_data;
 
-extern struct x86_hw_tss doublefault_tss;
 extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
 extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index baa2fed8deb6..2e4d90294fe6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss)
 	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
 }
 
-static inline void gdt_setup_doublefault_tss(int cpu) { }
-
 #else /* CONFIG_X86_64 */
 
 static inline void setup_getcpu(int cpu) { }
@@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu)
 
 static inline void tss_setup_ist(struct tss_struct *tss) { }
 
-static inline void gdt_setup_doublefault_tss(int cpu)
-{
-#ifdef CONFIG_DOUBLEFAULT
-	/* Set up the doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-}
 #endif /* !CONFIG_X86_64 */
 
 static inline void tss_setup_io_bitmap(struct tss_struct *tss)
@@ -1923,7 +1915,7 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	gdt_setup_doublefault_tss(cpu);
+	doublefault_init_cpu_tss();
 
 	fpu__init_cpu();
 
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 61c707ca8a09..4eecfe4825ed 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -10,10 +10,6 @@
 #include
 #include
 
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
-
 #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
 
 static void doublefault_fn(void)
@@ -21,6 +17,8 @@ static void doublefault_fn(void)
 	struct desc_ptr gdt_desc = {0, 0};
 	unsigned long gdt, tss;
 
+	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+
 	native_store_gdt(&gdt_desc);
 	gdt = gdt_desc.address;
 
@@ -48,24 +46,46 @@ static void doublefault_fn(void)
 		cpu_relax();
 }
 
-struct x86_hw_tss doublefault_tss __cacheline_aligned = {
-	.sp0		= STACK_START,
-	.ss0		= __KERNEL_DS,
-	.ldt		= 0,
+DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
+	.tss = {
+		/*
+		 * No sp0 or ss0 -- we never run CPL != 0 with this TSS
+		 * active.  sp is filled in later.
+		 */
+		.ldt		= 0,
 	.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
 
-	.ip		= (unsigned long) doublefault_fn,
-	/* 0x2 bit is always set */
-	.flags		= X86_EFLAGS_SF | 0x2,
-	.sp		= STACK_START,
-	.es		= __USER_DS,
-	.cs		= __KERNEL_CS,
-	.ss		= __KERNEL_DS,
-	.ds		= __USER_DS,
-	.fs		= __KERNEL_PERCPU,
+		.ip		= (unsigned long) doublefault_fn,
+		/* 0x2 bit is always set */
+		.flags		= X86_EFLAGS_SF | 0x2,
+		.es		= __USER_DS,
+		.cs		= __KERNEL_CS,
+		.ss		= __KERNEL_DS,
+		.ds		= __USER_DS,
+		.fs		= __KERNEL_PERCPU,
 #ifndef CONFIG_X86_32_LAZY_GS
-	.gs		= __KERNEL_STACK_CANARY,
+		.gs		= __KERNEL_STACK_CANARY,
 #endif
-	.__cr3		= __pa_nodebug(swapper_pg_dir),
+		.__cr3		= __pa_nodebug(swapper_pg_dir),
+	},
 };
+
+void doublefault_init_cpu_tss(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+	/*
+	 * The linker isn't smart enough to initialize percpu variables that
+	 * point to other places in percpu space.
+	 */
+	this_cpu_write(doublefault_stack.tss.sp,
+		       (unsigned long)&cea->doublefault_stack.stack +
+		       sizeof(doublefault_stack.stack));
+
+	/* Set up doublefault TSS pointer in the GDT */
+	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
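
The comment about the linker above deserves a concrete illustration: the
initial #DF stack pointer must point at the top of the stack array inside
this CPU's own copy of the area, and that address is only known at
runtime, which is why the patch writes tss.sp from
doublefault_init_cpu_tss() rather than from a static initializer.  A
rough user-space analogue of the arithmetic (hypothetical names, not
kernel API):

  #include <stdio.h>

  struct df_page {
	unsigned long stack[499];	/* sized as in the sketch above (64-bit) */
	unsigned int tss[26];
  };

  int main(void)
  {
	static struct df_page page;

	/* Top of stack = base of the array + its size, i.e. the bottom of the TSS. */
	unsigned long sp = (unsigned long)page.stack + sizeof(page.stack);

	printf("initial sp = %#lx\n", sp);
	return 0;
  }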
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 64a59d726639..8e3a8fedfa4d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_ENTRY)
 		return "ENTRY_TRAMPOLINE";
 
+	if (type == STACK_TYPE_EXCEPTION)
+		return "#DF";
+
 	return NULL;
 }
 
@@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
 	return true;
 }
 
+static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
+{
+#ifdef CONFIG_DOUBLEFAULT
+	struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
+	struct doublefault_stack *ss = &cea->doublefault_stack;
+
+	void *begin = ss->stack;
+	void *end = begin + sizeof(ss->stack);
+
+	if ((void *)stack < begin || (void *)stack >= end)
+		return false;
+
+	info->type	= STACK_TYPE_EXCEPTION;
+	info->begin	= begin;
+	info->end	= end;
+	info->next_sp	= (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
+
+	return true;
+#else
+	return false;
+#endif
+}
+
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info, unsigned long *visit_mask)
 {
@@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (in_softirq_stack(stack, info))
 		goto recursion_check;
 
+	if (in_doublefault_stack(stack, info))
+		goto recursion_check;
+
 	goto unknown;
 
 recursion_check:
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 82ead8e27888..56f9189bbadb 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
 DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
 #endif
 
+#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
+DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
+#endif
+
 struct cpu_entry_area *get_cpu_entry_area(int cpu)
 {
 	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
@@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
 		cea_map_stack(MCE);
 }
 #else
-static inline void percpu_setup_exception_stacks(unsigned int cpu) {}
+static inline void percpu_setup_exception_stacks(unsigned int cpu)
+{
+#ifdef CONFIG_DOUBLEFAULT
+	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+	cea_map_percpu_pages(&cea->doublefault_stack,
+			     &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL);
+#endif
+}
 #endif
 
 /* Setup the fixmap mappings only once per-processor */
-- cgit v1.2.3
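
The in_doublefault_stack() hook added above is an interval check in the
same style as the other in_*_stack() helpers.  A self-contained sketch of
the pattern (hypothetical helper name, not the kernel function):

  #include <stdbool.h>
  #include <stddef.h>

  /* Does @addr point into the stack buffer [begin, begin + size)? */
  static bool in_stack_range(const void *addr, const void *begin, size_t size)
  {
	const void *end = (const char *)begin + size;

	return addr >= begin && addr < end;
  }

On a hit, the kernel helper additionally records begin/end and the next
stack's starting sp so the unwinder can walk across the stack transition.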
From 7d8d8cfdee9a7bd6f9682f253fa98efdd8048a9e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 20 Nov 2019 23:06:41 -0800
Subject: x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit

The old x86_32 doublefault_fn() was crufty, and it did not even try to
recover.  do_double_fault() is much nicer.  Rewrite the 32-bit double
fault code to sanitize CPU state and call do_double_fault().  This is
mostly an exercise in i386 archaeology.

With this patch applied, 32-bit double faults get a real stack trace,
just like 64-bit double faults.

[ mingo: merged the patch to a later kernel base. ]

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Linus Torvalds
Signed-off-by: Ingo Molnar
---
 arch/x86/entry/entry_32.S        |  42 +++++++++++++++
 arch/x86/include/asm/traps.h     |   3 ++
 arch/x86/kernel/doublefault_32.c | 107 +++++++++++++++++++++++++++------------
 arch/x86/kernel/traps.c          |  19 ++++++-
 4 files changed, 138 insertions(+), 33 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 5832b11f01bb..632432bb723d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1537,6 +1537,48 @@ SYM_CODE_START(debug)
 	jmp	common_exception
 SYM_CODE_END(debug)
 
+#ifdef CONFIG_DOUBLEFAULT
+SYM_CODE_START(double_fault)
+1:
+	/*
+	 * This is a task gate handler, not an interrupt gate handler.
+	 * The error code is on the stack, but the stack is otherwise
+	 * empty.  Interrupts are off.  Our state is sane with the following
+	 * exceptions:
+	 *
+	 *  - CR0.TS is set.  "TS" literally means "task switched".
+	 *  - EFLAGS.NT is set because we're a "nested task".
+	 *  - The doublefault TSS has back_link set and has been marked busy.
+	 *  - TR points to the doublefault TSS and the normal TSS is busy.
+	 *  - CR3 is the normal kernel PGD.  This would be delightful, except
+	 *    that the CPU didn't bother to save the old CR3 anywhere.  This
+	 *    would make it very awkward to return back to the context we came
+	 *    from.
+	 *
+	 * The rest of EFLAGS is sanitized for us, so we don't need to
+	 * worry about AC or DF.
+	 *
+	 * Don't even bother popping the error code.  It's always zero,
+	 * and ignoring it makes us a bit more robust against buggy
+	 * hypervisor task gate implementations.
+	 *
+	 * We will manually undo the task switch instead of doing a
+	 * task-switching IRET.
+	 */
+
+	clts				/* clear CR0.TS */
+	pushl	$X86_EFLAGS_FIXED
+	popfl				/* clear EFLAGS.NT */
+
+	call	doublefault_shim
+
+	/* We don't support returning, so we have no IRET here. */
+1:
+	hlt
+	jmp	1b
+SYM_CODE_END(double_fault)
+#endif
+
 /*
  * NMI is doubly nasty.  It can happen on the first instruction of
  * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b25e633033c3..ffa0dc8a535e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code);
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code);
 dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code);
 dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code);
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2);
+#endif
 dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code);
 dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
 dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 4eecfe4825ed..3793646f0fb5 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -9,42 +9,83 @@
 #include
 #include
 #include
+#include
 
+extern void double_fault(void);
 #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
 
-static void doublefault_fn(void)
-{
-	struct desc_ptr gdt_desc = {0, 0};
-	unsigned long gdt, tss;
+#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
 
-	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+static void set_df_gdt_entry(unsigned int cpu);
 
-	native_store_gdt(&gdt_desc);
-	gdt = gdt_desc.address;
+/*
+ * Called by double_fault with CR0.TS and EFLAGS.NT cleared.  The CPU thinks
+ * we're running the doublefault task.  Cannot return.
+ */
+asmlinkage notrace void __noreturn doublefault_shim(void)
+{
+	unsigned long cr2;
+	struct pt_regs regs;
 
-	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
 
-	if (ptr_ok(gdt)) {
-		gdt += GDT_ENTRY_TSS << 3;
-		tss = get_desc_base((struct desc_struct *)gdt);
-		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+	cr2 = native_read_cr2();
 
-		if (ptr_ok(tss)) {
-			struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+	/* Reset back to the normal kernel task. */
+	force_reload_TR();
+	set_df_gdt_entry(smp_processor_id());
 
-			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
-			       t->ip, t->sp);
+	trace_hardirqs_off();
 
-			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
-			       t->ax, t->bx, t->cx, t->dx);
-			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-			       t->si, t->di);
-		}
-	}
+	/*
+	 * Fill in pt_regs.  A downside of doing this in C is that the unwinder
+	 * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
+	 * won't successfully unwind to the source of the double fault.
+	 * The main dump from do_double_fault() is fine, though, since it
+	 * uses these regs directly.
+	 *
+	 * If anyone ever cares, this could be moved to asm.
+	 */
+	regs.ss		= TSS(ss);
+	regs.__ssh	= 0;
+	regs.sp		= TSS(sp);
+	regs.flags	= TSS(flags);
+	regs.cs		= TSS(cs);
+	/* We won't go through the entry asm, so we can leave __csh as 0. */
+	regs.__csh	= 0;
+	regs.ip		= TSS(ip);
+	regs.orig_ax	= 0;
+	regs.gs		= TSS(gs);
+	regs.__gsh	= 0;
+	regs.fs		= TSS(fs);
+	regs.__fsh	= 0;
+	regs.es		= TSS(es);
+	regs.__esh	= 0;
+	regs.ds		= TSS(ds);
+	regs.__dsh	= 0;
+	regs.ax		= TSS(ax);
+	regs.bp		= TSS(bp);
+	regs.di		= TSS(di);
+	regs.si		= TSS(si);
+	regs.dx		= TSS(dx);
+	regs.cx		= TSS(cx);
+	regs.bx		= TSS(bx);
+
+	do_double_fault(&regs, 0, cr2);
 
-	for (;;)
-		cpu_relax();
+	/*
+	 * x86_32 does not save the original CR3 anywhere on a task switch.
+	 * This means that, even if we wanted to return, we would need to find
+	 * some way to reconstruct CR3.  We could make a credible guess based
+	 * on cpu_tlbstate, but that would be racy and would not account for
+	 * PTI.
+	 *
+	 * Instead, don't bother.  We can return through
+	 * rewind_stack_do_exit() instead.
+	 */
+	panic("cannot return from double fault\n");
 }
+NOKPROBE_SYMBOL(doublefault_shim);
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 	.tss = {
@@ -55,9 +96,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 		.ldt		= 0,
 	.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
 
-		.ip		= (unsigned long) doublefault_fn,
-		/* 0x2 bit is always set */
-		.flags		= X86_EFLAGS_SF | 0x2,
+		.ip		= (unsigned long) double_fault,
+		.flags		= X86_EFLAGS_FIXED,
 		.es		= __USER_DS,
 		.cs		= __KERNEL_CS,
 		.ss		= __KERNEL_DS,
@@ -71,6 +111,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 	},
 };
 
+static void set_df_gdt_entry(unsigned int cpu)
+{
+	/* Set up doublefault TSS pointer in the GDT */
+	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
+
 void doublefault_init_cpu_tss(void)
 {
 	unsigned int cpu = smp_processor_id();
@@ -84,8 +132,5 @@ void doublefault_init_cpu_tss(void)
 		       (unsigned long)&cea->doublefault_stack.stack +
 		       sizeof(doublefault_stack.stack));
 
-	/* Set up doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
-		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
-
+	set_df_gdt_entry(cpu);
 }
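
The register fill-in in doublefault_shim() above is deliberately
mechanical.  As a condensed illustration only (hypothetical helper; the
real code reads each field through the TSS() percpu macro rather than
through a pointer, and zeroes the __csh/__ssh/... pad fields explicitly),
the pattern boils down to:

  /* Hypothetical condensation of the shim's pt_regs synthesis. */
  static void fill_regs_from_tss(struct pt_regs *regs, const struct x86_hw_tss *t)
  {
	memset(regs, 0, sizeof(*regs));		/* zeroes __csh, __ssh, orig_ax, ... */

	regs->ss = t->ss;	regs->sp = t->sp;
	regs->flags = t->flags;
	regs->cs = t->cs;	regs->ip = t->ip;
	regs->gs = t->gs;	regs->fs = t->fs;
	regs->es = t->es;	regs->ds = t->ds;
	regs->ax = t->ax;	regs->bp = t->bp;
	regs->di = t->di;	regs->si = t->si;
	regs->dx = t->dx;	regs->cx = t->cx;	regs->bx = t->bx;
  }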
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 76381b04dc93..a9b16c3a933d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
 }
 #endif
 
-#ifdef CONFIG_X86_64
-/* Runs on IST stack */
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+/*
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
+ *
+ * On x86_64, this is more or less a normal kernel entry.  Notwithstanding the
+ * SDM's warnings about double faults being unrecoverable, returning works as
+ * expected.  Presumably what the SDM actually means is that the CPU may get
+ * the register state wrong on entry, so returning could be a bad idea.
+ *
+ * Various CPU engineers have promised that double faults due to an IRET fault
+ * while the stack is read-only are, in fact, recoverable.
+ *
+ * On x86_32, this is entered through a task gate, and regs are synthesized
+ * from the TSS.  Returning is, in principle, okay, but changes to regs will
+ * be lost.  If, for some reason, we need to return to a context with modified
+ * regs, the shim code could be adjusted to synchronize the registers.
+ */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
 {
 	static const char str[] = "double fault";
-- cgit v1.2.3

From 59c4bd853abcea95eccc167a7d7fd5f1a5f47b98 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Thu, 28 Nov 2019 09:53:06 +0100
Subject: x86/fpu: Don't cache access to fpu_fpregs_owner_ctx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The state/owner of the FPU is saved to fpu_fpregs_owner_ctx by pointing
to the context that is currently loaded.  It never changed during the
lifetime of a task -- it remained stable/constant.

After deferred loading of the FPU registers until return to userland was
implemented, the content of fpu_fpregs_owner_ctx may change during
preemption and must not be cached.

This went unnoticed for some time and has now been noticed, in
particular since gcc 9 caches that load in copy_fpstate_to_sigframe()
and reuses it in the retry loop:

  copy_fpstate_to_sigframe()
    load fpu_fpregs_owner_ctx and save on stack
    fpregs_lock()
    copy_fpregs_to_sigframe() /* failed */
    fpregs_unlock()
         *** PREEMPTION, another task uses FPU, changes fpu_fpregs_owner_ctx ***

    fault_in_pages_writeable() /* succeed, retry */

    fpregs_lock()
	__fpregs_load_activate()
	  fpregs_state_valid() /* uses fpu_fpregs_owner_ctx from stack */
    copy_fpregs_to_sigframe() /* succeeds, random FPU content */

This is a comparison of the assembly produced by gcc 9, without vs with
this patch:

| # arch/x86/kernel/fpu/signal.c:173:      if (!access_ok(buf, size))
|        cmpq    %rdx, %rax      # tmp183, _4
|        jb      .L190   #,
|-# arch/x86/include/asm/fpu/internal.h:512:       return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
|-#APP
|-# 512 "arch/x86/include/asm/fpu/internal.h" 1
|-       movq %gs:fpu_fpregs_owner_ctx,%rax      #, pfo_ret__
|-# 0 "" 2
|-#NO_APP
|-       movq    %rax, -88(%rbp) # pfo_ret__, %sfp
…
|-# arch/x86/include/asm/fpu/internal.h:512:       return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
|-       movq    -88(%rbp), %rcx # %sfp, pfo_ret__
|-       cmpq    %rcx, -64(%rbp) # pfo_ret__, %sfp
|+# arch/x86/include/asm/fpu/internal.h:512:       return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
|+#APP
|+# 512 "arch/x86/include/asm/fpu/internal.h" 1
|+       movq %gs:fpu_fpregs_owner_ctx(%rip),%rax        # fpu_fpregs_owner_ctx, pfo_ret__
|+# 0 "" 2
|+# arch/x86/include/asm/fpu/internal.h:512:       return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
|+#NO_APP
|+       cmpq    %rax, -64(%rbp) # pfo_ret__, %sfp

Use this_cpu_read() instead of this_cpu_read_stable() to avoid caching
of fpu_fpregs_owner_ctx during preemption points.

The Fixes: tag points to the commit where deferred FPU loading was
added.  Since that commit, the compiler is no longer allowed to move the
load of fpu_fpregs_owner_ctx somewhere else / outside of the locked
section.  A task preemption will change its value and stale content
will be observed.

[ bp: Massage. ]

Debugged-by: Austin Clements
Debugged-by: David Chase
Debugged-by: Ian Lance Taylor
Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace")
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Borislav Petkov
Reviewed-by: Rik van Riel
Tested-by: Borislav Petkov
Cc: Aubrey Li
Cc: Austin Clements
Cc: Barret Rhoden
Cc: Dave Hansen
Cc: David Chase
Cc: "H. Peter Anvin"
Peter Anvin" Cc: ian@airs.com Cc: Ingo Molnar Cc: Josh Bleecher Snyder Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20191128085306.hxfa2o3knqtu4wfn@linutronix.de Link: https://bugzilla.kernel.org/show_bug.cgi?id=205663 --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4c95c365058a..44c48e34d799 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -509,7 +509,7 @@ static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { - return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } /* -- cgit v1.2.3