author | Andy Lutomirski <luto@kernel.org> | 2019-11-20 23:06:41 -0800
committer | Ingo Molnar <mingo@kernel.org> | 2019-11-26 22:00:04 +0100
commit | 7d8d8cfdee9a7bd6f9682f253fa98efdd8048a9e (patch)
tree | e84496231e428d837eba299a378e5dafa31e1849 /arch/x86
parent | dc4e0021b00b5a4ecba56fae509217776592b0aa (diff)
download | linux-7d8d8cfdee9a7bd6f9682f253fa98efdd8048a9e.tar.bz2
x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit
The old x86_32 doublefault_fn() was crufty, and it did not even
try to recover. do_double_fault() is much nicer. Rewrite the
32-bit double fault code to sanitize CPU state and call
do_double_fault(). This is mostly an exercise in i386 archaeology.
With this patch applied, 32-bit double faults get a real stack trace,
just like 64-bit double faults.
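To make the new flow concrete, here is an editor-added, heavily condensed C sketch of the idea (not the patch text itself; the function name df_shim_sketch is made up and only a few representative fields are shown). The real implementation is doublefault_shim() in arch/x86/kernel/doublefault_32.c in the diff below: the hardware task switch leaves the interrupted context in the normal per-CPU TSS, so a C shim can rebuild a struct pt_regs from those saved fields and hand it to the common do_double_fault() path shared with x86_64.

#include <asm/processor.h>
#include <asm/traps.h>

/* Editor's sketch only -- condensed from doublefault_shim() below. */
static noinline void __noreturn df_shim_sketch(void)
{
	struct pt_regs regs = { };
	unsigned long cr2 = native_read_cr2();	/* %cr2 at the time of the fault */

	/* Interrupted context, as saved into the normal TSS by the task switch: */
	regs.ip    = this_cpu_read(cpu_tss_rw.x86_tss.ip);
	regs.sp    = this_cpu_read(cpu_tss_rw.x86_tss.sp);
	regs.flags = this_cpu_read(cpu_tss_rw.x86_tss.flags);
	/* ... segment and general-purpose registers likewise ... */

	do_double_fault(&regs, 0, cr2);		/* common handler, shared with 64-bit */
	panic("cannot return from double fault\n");
}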
[ mingo: merged the patch to a later kernel base. ]
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/entry/entry_32.S | 42
-rw-r--r-- | arch/x86/include/asm/traps.h | 3
-rw-r--r-- | arch/x86/kernel/doublefault_32.c | 107
-rw-r--r-- | arch/x86/kernel/traps.c | 19
4 files changed, 138 insertions, 33 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 5832b11f01bb..632432bb723d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1537,6 +1537,48 @@ SYM_CODE_START(debug)
 	jmp	common_exception
 SYM_CODE_END(debug)
 
+#ifdef CONFIG_DOUBLEFAULT
+SYM_CODE_START(double_fault)
+1:
+	/*
+	 * This is a task gate handler, not an interrupt gate handler.
+	 * The error code is on the stack, but the stack is otherwise
+	 * empty. Interrupts are off. Our state is sane with the following
+	 * exceptions:
+	 *
+	 *  - CR0.TS is set. "TS" literally means "task switched".
+	 *  - EFLAGS.NT is set because we're a "nested task".
+	 *  - The doublefault TSS has back_link set and has been marked busy.
+	 *  - TR points to the doublefault TSS and the normal TSS is busy.
+	 *  - CR3 is the normal kernel PGD. This would be delightful, except
+	 *    that the CPU didn't bother to save the old CR3 anywhere. This
+	 *    would make it very awkward to return back to the context we came
+	 *    from.
+	 *
+	 * The rest of EFLAGS is sanitized for us, so we don't need to
+	 * worry about AC or DF.
+	 *
+	 * Don't even bother popping the error code. It's always zero,
+	 * and ignoring it makes us a bit more robust against buggy
+	 * hypervisor task gate implementations.
+	 *
+	 * We will manually undo the task switch instead of doing a
+	 * task-switching IRET.
+	 */
+
+	clts				/* clear CR0.TS */
+	pushl	$X86_EFLAGS_FIXED
+	popfl				/* clear EFLAGS.NT */
+
+	call	doublefault_shim
+
+	/* We don't support returning, so we have no IRET here. */
+1:
+	hlt
+	jmp	1b
+SYM_CODE_END(double_fault)
+#endif
+
 /*
  * NMI is doubly nasty. It can happen on the first instruction of
  * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b25e633033c3..ffa0dc8a535e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code);
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code);
 dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code);
 dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code);
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2);
+#endif
 dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code);
 dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
 dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 4eecfe4825ed..3793646f0fb5 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -9,42 +9,83 @@
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
+#include <asm/traps.h>
 
+extern void double_fault(void);
 
 #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
 
-static void doublefault_fn(void)
-{
-	struct desc_ptr gdt_desc = {0, 0};
-	unsigned long gdt, tss;
+#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
 
-	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+static void set_df_gdt_entry(unsigned int cpu);
 
-	native_store_gdt(&gdt_desc);
-	gdt = gdt_desc.address;
+/*
+ * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
+ * we're running the doublefault task. Cannot return.
+ */
+asmlinkage notrace void __noreturn doublefault_shim(void)
+{
+	unsigned long cr2;
+	struct pt_regs regs;
 
-	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
 
-	if (ptr_ok(gdt)) {
-		gdt += GDT_ENTRY_TSS << 3;
-		tss = get_desc_base((struct desc_struct *)gdt);
-		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+	cr2 = native_read_cr2();
 
-		if (ptr_ok(tss)) {
-			struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+	/* Reset back to the normal kernel task. */
+	force_reload_TR();
+	set_df_gdt_entry(smp_processor_id());
 
-			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
-			       t->ip, t->sp);
+	trace_hardirqs_off();
 
-			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
-			       t->ax, t->bx, t->cx, t->dx);
-			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-			       t->si, t->di);
-		}
-	}
+	/*
+	 * Fill in pt_regs. A downside of doing this in C is that the unwinder
+	 * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
+	 * won't successfully unwind to the source of the double fault.
+	 * The main dump from do_double_fault() is fine, though, since it
+	 * uses these regs directly.
+	 *
+	 * If anyone ever cares, this could be moved to asm.
+	 */
+	regs.ss = TSS(ss);
+	regs.__ssh = 0;
+	regs.sp = TSS(sp);
+	regs.flags = TSS(flags);
+	regs.cs = TSS(cs);
+	/* We won't go through the entry asm, so we can leave __csh as 0. */
+	regs.__csh = 0;
+	regs.ip = TSS(ip);
+	regs.orig_ax = 0;
+	regs.gs = TSS(gs);
+	regs.__gsh = 0;
+	regs.fs = TSS(fs);
+	regs.__fsh = 0;
+	regs.es = TSS(es);
+	regs.__esh = 0;
+	regs.ds = TSS(ds);
+	regs.__dsh = 0;
+	regs.ax = TSS(ax);
+	regs.bp = TSS(bp);
+	regs.di = TSS(di);
+	regs.si = TSS(si);
+	regs.dx = TSS(dx);
+	regs.cx = TSS(cx);
+	regs.bx = TSS(bx);
+
+	do_double_fault(&regs, 0, cr2);
 
-	for (;;)
-		cpu_relax();
+	/*
+	 * x86_32 does not save the original CR3 anywhere on a task switch.
+	 * This means that, even if we wanted to return, we would need to find
+	 * some way to reconstruct CR3. We could make a credible guess based
+	 * on cpu_tlbstate, but that would be racy and would not account for
+	 * PTI.
+	 *
+	 * Instead, don't bother. We can return through
+	 * rewind_stack_do_exit() instead.
+	 */
+	panic("cannot return from double fault\n");
 }
+NOKPROBE_SYMBOL(doublefault_shim);
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 	.tss = {
@@ -55,9 +96,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 		.ldt = 0,
 		.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
 
-		.ip = (unsigned long) doublefault_fn,
-		/* 0x2 bit is always set */
-		.flags = X86_EFLAGS_SF | 0x2,
+		.ip = (unsigned long) double_fault,
+		.flags = X86_EFLAGS_FIXED,
 		.es = __USER_DS,
 		.cs = __KERNEL_CS,
 		.ss = __KERNEL_DS,
@@ -71,6 +111,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 	},
 };
 
+static void set_df_gdt_entry(unsigned int cpu)
+{
+	/* Set up doublefault TSS pointer in the GDT */
+	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
+
 void doublefault_init_cpu_tss(void)
 {
 	unsigned int cpu = smp_processor_id();
@@ -84,8 +132,5 @@ void doublefault_init_cpu_tss(void)
 		       (unsigned long)&cea->doublefault_stack.stack +
 		       sizeof(doublefault_stack.stack));
 
-	/* Set up doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
-		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
-
+	set_df_gdt_entry(cpu);
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 76381b04dc93..a9b16c3a933d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
 }
 #endif
 
-#ifdef CONFIG_X86_64
-/* Runs on IST stack */
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+/*
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
+ *
+ * On x86_64, this is more or less a normal kernel entry. Notwithstanding the
+ * SDM's warnings about double faults being unrecoverable, returning works as
+ * expected. Presumably what the SDM actually means is that the CPU may get
+ * the register state wrong on entry, so returning could be a bad idea.
+ *
+ * Various CPU engineers have promised that double faults due to an IRET fault
+ * while the stack is read-only are, in fact, recoverable.
+ *
+ * On x86_32, this is entered through a task gate, and regs are synthesized
+ * from the TSS. Returning is, in principle, okay, but changes to regs will
+ * be lost. If, for some reason, we need to return to a context with modified
+ * regs, the shim code could be adjusted to synchronize the registers.
+ */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
 {
 	static const char str[] = "double fault";
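A small detail in the doublefault_32.c hunk above is easy to miss: the doublefault TSS template used to hard-code the always-set EFLAGS bit as X86_EFLAGS_SF | 0x2, and both the new .flags initializer and the pushl $X86_EFLAGS_FIXED / popfl pair in entry_32.S now use the named constant instead. For reference, X86_EFLAGS_FIXED is simply that reserved, always-set bit 1; its definition (quoted from memory from the UAPI processor-flags header, so treat the exact wording and location as an assumption) looks roughly like this:

/* arch/x86/include/uapi/asm/processor-flags.h (quoted from memory) */
#define X86_EFLAGS_FIXED_BIT	1	/* Bit 1 of EFLAGS is reserved and always reads as 1 */
#define X86_EFLAGS_FIXED	_BITUL(X86_EFLAGS_FIXED_BIT)

In the entry code, pushing that value and executing popfl loads an EFLAGS image with every clearable flag (including NT) zeroed, which is what allows the handler to undo the hardware task switch by hand instead of relying on a task-switching IRET.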