From 6950e31b35fdf4588cbbdec1813091bb02cf8871 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 6 Nov 2019 17:43:05 -0800
Subject: x86/efi: Push EFI_MEMMAP check into leaf routines

In preparation for adding another EFI_MEMMAP dependent call that needs
to occur before e820__memblock_setup() fixup the existing efi calls to
check for EFI_MEMMAP internally. This ends up being cleaner than the
alternative of checking EFI_MEMMAP multiple times in setup_arch().

Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/kernel/setup.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 77ea96b794bd..1c4b866bc184 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1122,17 +1122,15 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_bios_regions();
 
-	if (efi_enabled(EFI_MEMMAP)) {
-		efi_fake_memmap();
-		efi_find_mirror();
-		efi_esrt_init();
+	efi_fake_memmap();
+	efi_find_mirror();
+	efi_esrt_init();
 
-		/*
-		 * The EFI specification says that boot service code won't be
-		 * called after ExitBootServices(). This is, in fact, a lie.
-		 */
-		efi_reserve_boot_services();
-	}
+	/*
+	 * The EFI specification says that boot service code won't be
+	 * called after ExitBootServices(). This is, in fact, a lie.
+	 */
+	efi_reserve_boot_services();
 
 	/* preallocate 4k for mptable mpc */
 	e820__memblock_alloc_reserved_mpc_new();
-- 
cgit v1.2.3


From 262b45ae3ab4bf8e2caf1fcfd0d8307897519630 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 6 Nov 2019 17:43:16 -0800
Subject: x86/efi: EFI soft reservation to E820 enumeration

UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the
interpretation of the EFI Memory Types as "reserved for a specific
purpose".

The proposed Linux behavior for specific purpose memory is that it is
reserved for direct-access (device-dax) by default and not available for
any kernel usage, not even as an OOM fallback.  Later, through udev
scripts or another init mechanism, these device-dax claimed ranges can
be reconfigured and hot-added to the available System-RAM with a unique
node identifier. This device-dax management scheme implements "soft" in
the "soft reserved" designation by allowing some or all of the
reservation to be recovered as typical memory. This policy can be
disabled at compile-time with CONFIG_EFI_SOFT_RESERVE=n, or runtime with
efi=nosoftreserve.

This patch introduces 2 new concepts at once given the entanglement
between early boot enumeration relative to memory that can optionally be
reserved from the kernel page allocator by default. The new concepts
are:

- E820_TYPE_SOFT_RESERVED: Upon detecting the EFI_MEMORY_SP
  attribute on EFI_CONVENTIONAL memory, update the E820 map with this
  new type. Only perform this classification if the
  CONFIG_EFI_SOFT_RESERVE=y policy is enabled, otherwise treat it as
  typical ram.

- IORES_DESC_SOFT_RESERVED: Add a new I/O resource descriptor for
  a device driver to search iomem resources for application specific
  memory. Teach the iomem code to identify such ranges as "Soft Reserved".

Note that the comment for do_add_efi_memmap() needed refreshing since it
seemed to imply that the efi map might overflow the e820 table, but that
is not an issue as of commit 7b6e4ba3cb1f "x86/boot/e820: Clean up the
E820_X_MAX definition" that removed the 128 entry limit for
e820__range_add().

A follow-on change integrates parsing of the ACPI HMAT to identify the
node and sub-range boundaries of EFI_MEMORY_SP designated memory. For
now, just identify and reserve memory of this type.

Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reported-by: kbuild test robot <lkp@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/boot/compressed/eboot.c  |  6 ++++-
 arch/x86/boot/compressed/kaslr.c  |  4 ++++
 arch/x86/include/asm/e820/types.h |  8 +++++++
 arch/x86/kernel/e820.c            | 12 ++++++++--
 arch/x86/platform/efi/efi.c       | 49 +++++++++++++++++++++++++++++++++++----
 include/linux/ioport.h            |  1 +
 6 files changed, 73 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 82bc60c8acb2..f2db8c5e4b06 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -554,7 +554,11 @@ setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_s
 		case EFI_BOOT_SERVICES_CODE:
 		case EFI_BOOT_SERVICES_DATA:
 		case EFI_CONVENTIONAL_MEMORY:
-			e820_type = E820_TYPE_RAM;
+			if (efi_soft_reserve_enabled() &&
+			    (d->attribute & EFI_MEMORY_SP))
+				e820_type = E820_TYPE_SOFT_RESERVED;
+			else
+				e820_type = E820_TYPE_RAM;
 			break;
 
 		case EFI_ACPI_MEMORY_NVS:
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 2e53c056ba20..ff6fa81949cd 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -760,6 +760,10 @@ process_efi_entries(unsigned long minimum, unsigned long image_size)
 		if (md->type != EFI_CONVENTIONAL_MEMORY)
 			continue;
 
+		if (efi_soft_reserve_enabled() &&
+		    (md->attribute & EFI_MEMORY_SP))
+			continue;
+
 		if (efi_mirror_found &&
 		    !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
 			continue;
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h
index c3aa4b5e49e2..314f75d886d0 100644
--- a/arch/x86/include/asm/e820/types.h
+++ b/arch/x86/include/asm/e820/types.h
@@ -28,6 +28,14 @@ enum e820_type {
 	 */
 	E820_TYPE_PRAM		= 12,
 
+	/*
+	 * Special-purpose memory is indicated to the system via the
+	 * EFI_MEMORY_SP attribute. Define an e820 translation of this
+	 * memory type for the purpose of reserving this range and
+	 * marking it with the IORES_DESC_SOFT_RESERVED designation.
+	 */
+	E820_TYPE_SOFT_RESERVED	= 0xefffffff,
+
 	/*
 	 * Reserved RAM used by the kernel itself if
 	 * CONFIG_INTEL_TXT=y is enabled, memory of this type
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7da2bcd2b8eb..9976106b57ec 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type)
 	case E820_TYPE_RAM:		/* Fall through: */
 	case E820_TYPE_RESERVED_KERN:	pr_cont("usable");			break;
 	case E820_TYPE_RESERVED:	pr_cont("reserved");			break;
+	case E820_TYPE_SOFT_RESERVED:	pr_cont("soft reserved");		break;
 	case E820_TYPE_ACPI:		pr_cont("ACPI data");			break;
 	case E820_TYPE_NVS:		pr_cont("ACPI NVS");			break;
 	case E820_TYPE_UNUSABLE:	pr_cont("unusable");			break;
@@ -1037,6 +1038,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry)
 	case E820_TYPE_PRAM:		return "Persistent Memory (legacy)";
 	case E820_TYPE_PMEM:		return "Persistent Memory";
 	case E820_TYPE_RESERVED:	return "Reserved";
+	case E820_TYPE_SOFT_RESERVED:	return "Soft Reserved";
 	default:			return "Unknown E820 type";
 	}
 }
@@ -1052,6 +1054,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
 	case E820_TYPE_PRAM:		/* Fall-through: */
 	case E820_TYPE_PMEM:		/* Fall-through: */
 	case E820_TYPE_RESERVED:	/* Fall-through: */
+	case E820_TYPE_SOFT_RESERVED:	/* Fall-through: */
 	default:			return IORESOURCE_MEM;
 	}
 }
@@ -1064,6 +1067,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
 	case E820_TYPE_PMEM:		return IORES_DESC_PERSISTENT_MEMORY;
 	case E820_TYPE_PRAM:		return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
 	case E820_TYPE_RESERVED:	return IORES_DESC_RESERVED;
+	case E820_TYPE_SOFT_RESERVED:	return IORES_DESC_SOFT_RESERVED;
 	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
 	case E820_TYPE_RAM:		/* Fall-through: */
 	case E820_TYPE_UNUSABLE:	/* Fall-through: */
@@ -1078,11 +1082,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res)
 		return true;
 
 	/*
-	 * Treat persistent memory like device memory, i.e. reserve it
-	 * for exclusive use of a driver
+	 * Treat persistent memory and other special memory ranges like
+	 * device memory, i.e. reserve it for exclusive use of a driver
 	 */
 	switch (type) {
 	case E820_TYPE_RESERVED:
+	case E820_TYPE_SOFT_RESERVED:
 	case E820_TYPE_PRAM:
 	case E820_TYPE_PMEM:
 		return false;
@@ -1285,6 +1290,9 @@ void __init e820__memblock_setup(void)
 		if (end != (resource_size_t)end)
 			continue;
 
+		if (entry->type == E820_TYPE_SOFT_RESERVED)
+			memblock_reserve(entry->addr, entry->size);
+
 		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
 			continue;
 
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e6e41b118d68..8609dccea096 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -148,14 +148,18 @@ void __init efi_find_mirror(void)
 
 /*
  * Tell the kernel about the EFI memory map.  This might include
- * more than the max 128 entries that can fit in the e820 legacy
- * (zeropage) memory map.
+ * more than the max 128 entries that can fit in the passed in e820
+ * legacy (zeropage) memory map, but the kernel's e820 table can hold
+ * E820_MAX_ENTRIES.
  */
 
 static void __init do_add_efi_memmap(void)
 {
 	efi_memory_desc_t *md;
 
+	if (!efi_enabled(EFI_MEMMAP))
+		return;
+
 	for_each_efi_memory_desc(md) {
 		unsigned long long start = md->phys_addr;
 		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
@@ -167,7 +171,10 @@ static void __init do_add_efi_memmap(void)
 		case EFI_BOOT_SERVICES_CODE:
 		case EFI_BOOT_SERVICES_DATA:
 		case EFI_CONVENTIONAL_MEMORY:
-			if (md->attribute & EFI_MEMORY_WB)
+			if (efi_soft_reserve_enabled()
+			    && (md->attribute & EFI_MEMORY_SP))
+				e820_type = E820_TYPE_SOFT_RESERVED;
+			else if (md->attribute & EFI_MEMORY_WB)
 				e820_type = E820_TYPE_RAM;
 			else
 				e820_type = E820_TYPE_RESERVED;
@@ -193,11 +200,36 @@ static void __init do_add_efi_memmap(void)
 			e820_type = E820_TYPE_RESERVED;
 			break;
 		}
+
 		e820__range_add(start, size, e820_type);
 	}
 	e820__update_table(e820_table);
 }
 
+/*
+ * Given add_efi_memmap defaults to 0 and there there is no alternative
+ * e820 mechanism for soft-reserved memory, import the full EFI memory
+ * map if soft reservations are present and enabled. Otherwise, the
+ * mechanism to disable the kernel's consideration of EFI_MEMORY_SP is
+ * the efi=nosoftreserve option.
+ */
+static bool do_efi_soft_reserve(void)
+{
+	efi_memory_desc_t *md;
+
+	if (!efi_enabled(EFI_MEMMAP))
+		return false;
+
+	if (!efi_soft_reserve_enabled())
+		return false;
+
+	for_each_efi_memory_desc(md)
+		if (md->type == EFI_CONVENTIONAL_MEMORY &&
+		    (md->attribute & EFI_MEMORY_SP))
+			return true;
+	return false;
+}
+
 int __init efi_memblock_x86_reserve_range(void)
 {
 	struct efi_info *e = &boot_params.efi_info;
@@ -227,7 +259,7 @@ int __init efi_memblock_x86_reserve_range(void)
 	if (rv)
 		return rv;
 
-	if (add_efi_memmap)
+	if (add_efi_memmap || do_efi_soft_reserve())
 		do_add_efi_memmap();
 
 	WARN(efi.memmap.desc_version != 1,
@@ -781,6 +813,15 @@ static bool should_map_region(efi_memory_desc_t *md)
 	if (IS_ENABLED(CONFIG_X86_32))
 		return false;
 
+	/*
+	 * EFI specific purpose memory may be reserved by default
+	 * depending on kernel config and boot options.
+	 */
+	if (md->type == EFI_CONVENTIONAL_MEMORY &&
+	    efi_soft_reserve_enabled() &&
+	    (md->attribute & EFI_MEMORY_SP))
+		return false;
+
 	/*
 	 * Map all of RAM so that we can access arguments in the 1:1
 	 * mapping when making EFI runtime calls.
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 7bddddfc76d6..a9b9170b5dd2 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -134,6 +134,7 @@ enum {
 	IORES_DESC_PERSISTENT_MEMORY_LEGACY	= 5,
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
 	IORES_DESC_RESERVED			= 7,
+	IORES_DESC_SOFT_RESERVED		= 8,
 };
 
 /*
-- 
cgit v1.2.3


From 562955fe6a558b9ef98ad87c470314946338cb2f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 8 Nov 2019 13:11:39 -0500
Subject: ftrace/x86: Add register_ftrace_direct() for custom trampolines

Enable x86 to allow for register_ftrace_direct(), where a custom trampoline
may be called directly from an ftrace mcount/fentry location.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/Kconfig              |  1 +
 arch/x86/include/asm/ftrace.h | 13 +++++++++++++
 arch/x86/kernel/ftrace.c      | 12 ++++++++++++
 arch/x86/kernel/ftrace_64.S   | 34 +++++++++++++++++++++++++++-------
 include/linux/ftrace.h        |  6 ++++++
 5 files changed, 59 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d6e1faa28c58..329d9c729ba3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -158,6 +158,7 @@ config X86
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
+	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	select HAVE_EBPF_JIT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_EISA
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index c38a66661576..c2a7458f912c 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,6 +28,19 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 	return addr;
 }
 
+/*
+ * When a ftrace registered caller is tracing a function that is
+ * also set by a register_ftrace_direct() call, it needs to be
+ * differentiated in the ftrace_caller trampoline. To do this, we
+ * place the direct caller in the ORIG_AX part of pt_regs. This
+ * tells the ftrace_caller that there's a direct caller.
+ */
+static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
+{
+	/* Emulate a call */
+	regs->orig_ax = addr;
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 struct dyn_arch_ftrace {
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 024c3053dbba..fef283f6341d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1042,6 +1042,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
 	if (unlikely(atomic_read(&current->tracing_graph_pause)))
 		return;
 
+	/*
+	 * If the return location is actually pointing directly to
+	 * the start of a direct trampoline (if we trace the trampoline
+	 * it will still be offset by MCOUNT_INSN_SIZE), then the
+	 * return address is actually off by one word, and we
+	 * need to adjust for that.
+	 */
+	if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) {
+		self_addr = *parent;
+		parent++;
+	}
+
 	/*
 	 * Protect against fault, even if it shouldn't
 	 * happen. This tool is too much intrusive to
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 809d54397dba..6ac7ff304886 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -88,6 +88,7 @@ EXPORT_SYMBOL(__fentry__)
 	movq %rdi, RDI(%rsp)
 	movq %r8, R8(%rsp)
 	movq %r9, R9(%rsp)
+	movq $0, ORIG_RAX(%rsp)
 	/*
 	 * Save the original RBP. Even though the mcount ABI does not
 	 * require this, it helps out callers.
@@ -114,7 +115,11 @@ EXPORT_SYMBOL(__fentry__)
 	subq $MCOUNT_INSN_SIZE, %rdi
 	.endm
 
-.macro restore_mcount_regs
+.macro restore_mcount_regs save=0
+
+	/* ftrace_regs_caller or frame pointers require this */
+	movq RBP(%rsp), %rbp
+
 	movq R9(%rsp), %r9
 	movq R8(%rsp), %r8
 	movq RDI(%rsp), %rdi
@@ -123,10 +128,7 @@ EXPORT_SYMBOL(__fentry__)
 	movq RCX(%rsp), %rcx
 	movq RAX(%rsp), %rax
 
-	/* ftrace_regs_caller can modify %rbp */
-	movq RBP(%rsp), %rbp
-
-	addq $MCOUNT_REG_SIZE, %rsp
+	addq $MCOUNT_REG_SIZE-\save, %rsp
 
 	.endm
 
@@ -228,10 +230,28 @@ GLOBAL(ftrace_regs_call)
 	movq R10(%rsp), %r10
 	movq RBX(%rsp), %rbx
 
-	restore_mcount_regs
+	movq ORIG_RAX(%rsp), %rax
+	movq %rax, MCOUNT_REG_SIZE-8(%rsp)
+
+	/* If ORIG_RAX is anything but zero, make this a call to that */
+	movq ORIG_RAX(%rsp), %rax
+	cmpq	$0, %rax
+	je	1f
+
+	/* Swap the flags with orig_rax */
+	movq MCOUNT_REG_SIZE(%rsp), %rdi
+	movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+	movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+	restore_mcount_regs 8
+
+	jmp	2f
+
+1:	restore_mcount_regs
+
 
 	/* Restore flags */
-	popfq
+2:	popfq
 
 	/*
 	 * As this jmp to ftrace_epilogue can be a short jump
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8b37b8105398..2bc7bd6b8387 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -272,6 +272,12 @@ static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long a
  * via ftrace (because there's other callbacks besides the
  * direct call), can inform the architecture's trampoline that this
  * routine has a direct caller, and what the caller is.
+ *
+ * For example, in x86, it returns the direct caller
+ * callback function via the regs->orig_ax parameter.
+ * Then in the ftrace trampoline, if this is set, it makes
+ * the return from the trampoline jump to the direct caller
+ * instead of going back to the function it just traced.
  */
 static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs,
 						 unsigned long addr) { }
-- 
cgit v1.2.3


From a3ad1a7e39689005cb04a4f2adb82f9d55b4724f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 8 Nov 2019 13:12:57 -0500
Subject: ftrace/x86: Add a counter to test function_graph with direct

As testing for direct calls from the function graph tracer adds a little
overhead (which is a lot when tracing every function), add a counter that
can be used to test if function_graph tracer needs to test for a direct
caller or not.

It would have been nicer if we could use a static branch, but the static
branch logic fails when used within the function graph tracer trampoline.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/kernel/ftrace.c | 8 +++++---
 include/linux/ftrace.h   | 2 ++
 kernel/trace/ftrace.c    | 4 ++++
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index fef283f6341d..060a361d9d11 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1049,9 +1049,11 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
 	 * return address is actually off by one word, and we
 	 * need to adjust for that.
 	 */
-	if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) {
-		self_addr = *parent;
-		parent++;
+	if (ftrace_direct_func_count) {
+		if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) {
+			self_addr = *parent;
+			parent++;
+		}
 	}
 
 	/*
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 2bc7bd6b8387..55647e185141 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -247,10 +247,12 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) {
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+extern int ftrace_direct_func_count;
 int register_ftrace_direct(unsigned long ip, unsigned long addr);
 int unregister_ftrace_direct(unsigned long ip, unsigned long addr);
 struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr);
 #else
+# define ftrace_direct_func_count 0
 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr)
 {
 	return -ENODEV;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c4446eabacbe..f9456346ec66 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2364,6 +2364,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
 /* Protected by rcu_tasks for reading, and direct_mutex for writing */
 static struct ftrace_hash *direct_functions = EMPTY_HASH;
 static DEFINE_MUTEX(direct_mutex);
+int ftrace_direct_func_count;
 
 /*
  * Search the direct_functions hash to see if the given instruction pointer
@@ -5056,6 +5057,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 		direct->addr = addr;
 		direct->count = 0;
 		list_add_rcu(&direct->next, &ftrace_direct_funcs);
+		ftrace_direct_func_count++;
 	}
 
 	entry->ip = ip;
@@ -5081,6 +5083,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 			if (free_hash)
 				free_ftrace_hash(free_hash);
 			free_hash = NULL;
+			ftrace_direct_func_count--;
 		}
 	} else {
 		if (!direct->count)
@@ -5141,6 +5144,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
 			list_del_rcu(&direct->next);
 			synchronize_rcu_tasks();
 			kfree(direct);
+			ftrace_direct_func_count--;
 		}
 	}
  out_unlock:
-- 
cgit v1.2.3


From 77ac117b3a82251b109ffc5daf7d1c5392734be3 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 8 Nov 2019 16:51:00 -0600
Subject: ftrace/x86: Tell objtool to ignore nondeterministic ftrace stack
 layout

Objtool complains about the new ftrace direct trampoline code:

  arch/x86/kernel/ftrace_64.o: warning: objtool: ftrace_regs_caller()+0x190: stack state mismatch: cfa1=7+16 cfa2=7+24

Typically, code has a deterministic stack layout, such that at a given
instruction address, the stack frame size is always the same.

That's not the case for the new ftrace_regs_caller() code after it
adjusts the stack for the direct case.  Just plead ignorance and assume
it's always the non-direct path.  Note this creates a tiny window for
ORC to get confused.

Link: http://lkml.kernel.org/r/20191108225100.ea3bhsbdf6oerj6g@treble

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/include/asm/unwind_hints.h |  8 ++++++++
 arch/x86/kernel/ftrace_64.S         | 12 +++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 0bcdb1279361..f5e2eb12cb71 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -86,6 +86,14 @@
 	UNWIND_HINT sp_offset=\sp_offset
 .endm
 
+.macro UNWIND_HINT_SAVE
+	UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE
+.endm
+
+.macro UNWIND_HINT_RESTORE
+	UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE
+.endm
+
 #else /* !__ASSEMBLY__ */
 
 #define UNWIND_HINT(sp_reg, sp_offset, type, end)		\
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 6ac7ff304886..b33abdd0a2db 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -178,6 +178,8 @@ ENTRY(ftrace_regs_caller)
 	/* Save the current flags before any operations that can change them */
 	pushfq
 
+	UNWIND_HINT_SAVE
+
 	/* added 8 bytes to save flags */
 	save_mcount_regs 8
 	/* save_mcount_regs fills in first two parameters */
@@ -250,8 +252,16 @@ GLOBAL(ftrace_regs_call)
 1:	restore_mcount_regs
 
 
+2:
+	/*
+	 * The stack layout is nondetermistic here, depending on which path was
+	 * taken.  This confuses objtool and ORC, rightfully so.  For now,
+	 * pretend the stack always looks like the non-direct case.
+	 */
+	UNWIND_HINT_RESTORE
+
 	/* Restore flags */
-2:	popfq
+	popfq
 
 	/*
 	 * As this jmp to ftrace_epilogue can be a short jump
-- 
cgit v1.2.3


From 68a33b1794665ba8a1d1ef1d3bfcc7c587d380a6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Nov 2019 17:38:58 +0100
Subject: dma-direct: exclude dma_direct_map_resource from the min_low_pfn
 check

The valid memory address check in dma_capable only makes sense when mapping
normal memory, not when using dma_map_resource to map a device resource.
Add a new boolean argument to dma_capable to exclude that check for the
dma_map_resource case.

Fixes: b12d66278dd6 ("dma-direct: check for overflows on 32 bit DMA addresses")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 arch/x86/kernel/amd_gart_64.c | 4 ++--
 drivers/xen/swiotlb-xen.c     | 4 ++--
 include/linux/dma-direct.h    | 5 +++--
 kernel/dma/direct.c           | 4 ++--
 kernel/dma/swiotlb.c          | 2 +-
 5 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index a6ac3712db8b..5cfab41e8509 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -185,13 +185,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
 static inline int
 need_iommu(struct device *dev, unsigned long addr, size_t size)
 {
-	return force_iommu || !dma_capable(dev, addr, size);
+	return force_iommu || !dma_capable(dev, addr, size, true);
 }
 
 static inline int
 nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
 {
-	return !dma_capable(dev, addr, size);
+	return !dma_capable(dev, addr, size, true);
 }
 
 /* Map a single continuous physical area into the IOMMU.
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 3f8b2cdb4acb..b6d27762c6f8 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -375,7 +375,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
 	 * we can safely return the device addr and not worry about bounce
 	 * buffering it.
 	 */
-	if (dma_capable(dev, dev_addr, size) &&
+	if (dma_capable(dev, dev_addr, size, true) &&
 	    !range_straddles_page_boundary(phys, size) &&
 		!xen_arch_need_swiotlb(dev, phys, dev_addr) &&
 		swiotlb_force != SWIOTLB_FORCE)
@@ -397,7 +397,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
 	/*
 	 * Ensure that the address returned is DMA'ble
 	 */
-	if (unlikely(!dma_capable(dev, dev_addr, size))) {
+	if (unlikely(!dma_capable(dev, dev_addr, size, true))) {
 		swiotlb_tbl_unmap_single(dev, map, size, size, dir,
 				attrs | DMA_ATTR_SKIP_CPU_SYNC);
 		return DMA_MAPPING_ERROR;
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index f8959f75e496..99b77dd5f79b 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -49,14 +49,15 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 	return __sme_clr(__dma_to_phys(dev, daddr));
 }
 
-static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size,
+		bool is_ram)
 {
 	dma_addr_t end = addr + size - 1;
 
 	if (!dev->dma_mask)
 		return false;
 
-	if (!IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) &&
+	if (is_ram && !IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) &&
 	    min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn)))
 		return false;
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index a479bd2d1e8b..40f1f0aac4b1 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -363,7 +363,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
 		size_t size)
 {
 	return swiotlb_force != SWIOTLB_FORCE &&
-		dma_capable(dev, dma_addr, size);
+		dma_capable(dev, dma_addr, size, true);
 }
 
 dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
@@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
 {
 	dma_addr_t dma_addr = paddr;
 
-	if (unlikely(!dma_capable(dev, dma_addr, size))) {
+	if (unlikely(!dma_capable(dev, dma_addr, size, false))) {
 		report_addr(dev, dma_addr, size);
 		return DMA_MAPPING_ERROR;
 	}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 673a2cdb2656..9280d6f8271e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -678,7 +678,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
 
 	/* Ensure that the address returned is DMA'ble */
 	*dma_addr = __phys_to_dma(dev, *phys);
-	if (unlikely(!dma_capable(dev, *dma_addr, size))) {
+	if (unlikely(!dma_capable(dev, *dma_addr, size, true))) {
 		swiotlb_tbl_unmap_single(dev, *phys, size, size, dir,
 			attrs | DMA_ATTR_SKIP_CPU_SYNC);
 		return false;
-- 
cgit v1.2.3


From a7ba70f1787f977f970cd116076c6fce4b9e01cc Mon Sep 17 00:00:00 2001
From: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Date: Thu, 21 Nov 2019 10:26:44 +0100
Subject: dma-mapping: treat dev->bus_dma_mask as a DMA limit

Using a mask to represent bus DMA constraints has a set of limitations.
The biggest one being it can only hold a power of two (minus one). The
DMA mapping code is already aware of this and treats dev->bus_dma_mask
as a limit. This quirk is already used by some architectures although
still rare.

With the introduction of the Raspberry Pi 4 we've found a new contender
for the use of bus DMA limits, as its PCIe bus can only address the
lower 3GB of memory (of a total of 4GB). This is impossible to represent
with a mask. To make things worse the device-tree code rounds non power
of two bus DMA limits to the next power of two, which is unacceptable in
this case.

In the light of this, rename dev->bus_dma_mask to dev->bus_dma_limit all
over the tree and treat it as such. Note that dev->bus_dma_limit should
contain the higher accessible DMA address.

Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/mips/pci/fixup-sb1250.c  | 16 ++++++++--------
 arch/powerpc/sysdev/fsl_pci.c |  6 +++---
 arch/x86/kernel/pci-dma.c     |  2 +-
 arch/x86/mm/mem_encrypt.c     |  2 +-
 arch/x86/pci/sta2x11-fixup.c  |  2 +-
 drivers/acpi/arm64/iort.c     | 20 +++++++-------------
 drivers/ata/ahci.c            |  2 +-
 drivers/iommu/dma-iommu.c     |  3 +--
 drivers/of/device.c           |  9 +++++----
 include/linux/device.h        |  6 +++---
 include/linux/dma-direct.h    |  2 +-
 include/linux/dma-mapping.h   |  2 +-
 kernel/dma/direct.c           | 27 +++++++++++++--------------
 13 files changed, 46 insertions(+), 53 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/mips/pci/fixup-sb1250.c b/arch/mips/pci/fixup-sb1250.c
index 8a41b359cf90..40efc990cdce 100644
--- a/arch/mips/pci/fixup-sb1250.c
+++ b/arch/mips/pci/fixup-sb1250.c
@@ -21,22 +21,22 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_PCI,
 
 /*
  * The BCM1250, etc. PCI host bridge does not support DAC on its 32-bit
- * bus, so we set the bus's DMA mask accordingly.  However the HT link
+ * bus, so we set the bus's DMA limit accordingly.  However the HT link
  * down the artificial PCI-HT bridge supports 40-bit addressing and the
  * SP1011 HT-PCI bridge downstream supports both DAC and a 64-bit bus
  * width, so we record the PCI-HT bridge's secondary and subordinate bus
- * numbers and do not set the mask for devices present in the inclusive
+ * numbers and do not set the limit for devices present in the inclusive
  * range of those.
  */
-struct sb1250_bus_dma_mask_exclude {
+struct sb1250_bus_dma_limit_exclude {
 	bool set;
 	unsigned char start;
 	unsigned char end;
 };
 
-static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data)
+static int sb1250_bus_dma_limit(struct pci_dev *dev, void *data)
 {
-	struct sb1250_bus_dma_mask_exclude *exclude = data;
+	struct sb1250_bus_dma_limit_exclude *exclude = data;
 	bool exclude_this;
 	bool ht_bridge;
 
@@ -55,7 +55,7 @@ static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data)
 			exclude->start, exclude->end);
 	} else {
 		dev_dbg(&dev->dev, "disabling DAC for device");
-		dev->dev.bus_dma_mask = DMA_BIT_MASK(32);
+		dev->dev.bus_dma_limit = DMA_BIT_MASK(32);
 	}
 
 	return 0;
@@ -63,9 +63,9 @@ static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data)
 
 static void quirk_sb1250_pci_dac(struct pci_dev *dev)
 {
-	struct sb1250_bus_dma_mask_exclude exclude = { .set = false };
+	struct sb1250_bus_dma_limit_exclude exclude = { .set = false };
 
-	pci_walk_bus(dev->bus, sb1250_bus_dma_mask, &exclude);
+	pci_walk_bus(dev->bus, sb1250_bus_dma_limit, &exclude);
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_PCI,
 			quirk_sb1250_pci_dac);
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index ff0e2b156cb5..617a443d673d 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -115,8 +115,8 @@ static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 
-	pdev->dev.bus_dma_mask =
-		hose->dma_window_base_cur + hose->dma_window_size;
+	pdev->dev.bus_dma_limit =
+		hose->dma_window_base_cur + hose->dma_window_size - 1;
 }
 
 static void setup_swiotlb_ops(struct pci_controller *hose)
@@ -135,7 +135,7 @@ static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask)
 	 * mapping that allows addressing any RAM address from across PCI.
 	 */
 	if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) {
-		dev->bus_dma_mask = 0;
+		dev->bus_dma_limit = 0;
 		dev->archdata.dma_offset = pci64_dma_offset;
 	}
 }
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index fa4352dce491..3a75d665d43c 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -146,7 +146,7 @@ rootfs_initcall(pci_iommu_init);
 
 static int via_no_dac_cb(struct pci_dev *pdev, void *data)
 {
-	pdev->dev.bus_dma_mask = DMA_BIT_MASK(32);
+	pdev->dev.bus_dma_limit = DMA_BIT_MASK(32);
 	return 0;
 }
 
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 9268c12458c8..a03614bd3e1a 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -367,7 +367,7 @@ bool force_dma_unencrypted(struct device *dev)
 	if (sme_active()) {
 		u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
 		u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
-						dev->bus_dma_mask);
+						dev->bus_dma_limit);
 
 		if (dma_dev_mask <= dma_enc_mask)
 			return true;
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 4a631264b809..c313d784efab 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -143,7 +143,7 @@ static void sta2x11_map_ep(struct pci_dev *pdev)
 
 	dev->dma_pfn_offset = PFN_DOWN(-amba_base);
 
-	dev->bus_dma_mask = max_amba_addr;
+	dev->bus_dma_limit = max_amba_addr;
 	pci_set_consistent_dma_mask(pdev, max_amba_addr);
 	pci_set_dma_mask(pdev, max_amba_addr);
 
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 5a7551d060f2..33f71983e001 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1057,8 +1057,8 @@ static int rc_dma_get_range(struct device *dev, u64 *size)
  */
 void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
 {
-	u64 mask, dmaaddr = 0, size = 0, offset = 0;
-	int ret, msb;
+	u64 end, mask, dmaaddr = 0, size = 0, offset = 0;
+	int ret;
 
 	/*
 	 * If @dev is expected to be DMA-capable then the bus code that created
@@ -1085,19 +1085,13 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
 	}
 
 	if (!ret) {
-		msb = fls64(dmaaddr + size - 1);
 		/*
-		 * Round-up to the power-of-two mask or set
-		 * the mask to the whole 64-bit address space
-		 * in case the DMA region covers the full
-		 * memory window.
+		 * Limit coherent and dma mask based on size retrieved from
+		 * firmware.
 		 */
-		mask = msb == 64 ? U64_MAX : (1ULL << msb) - 1;
-		/*
-		 * Limit coherent and dma mask based on size
-		 * retrieved from firmware.
-		 */
-		dev->bus_dma_mask = mask;
+		end = dmaaddr + size - 1;
+		mask = DMA_BIT_MASK(ilog2(end) + 1);
+		dev->bus_dma_limit = end;
 		dev->coherent_dma_mask = mask;
 		*dev->dma_mask = mask;
 	}
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 05c2b32dcc4d..7c6d06ffb586 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -897,7 +897,7 @@ static int ahci_configure_dma_masks(struct pci_dev *pdev, int using_dac)
 	 * value, don't extend it here. This happens on STA2X11, for example.
 	 *
 	 * XXX: manipulating the DMA mask from platform code is completely
-	 * bogus, platform code should use dev->bus_dma_mask instead..
+	 * bogus, platform code should use dev->bus_dma_limit instead..
 	 */
 	if (pdev->dma_mask && pdev->dma_mask < DMA_BIT_MASK(32))
 		return 0;
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 0fa8c1d818b7..646332fbf3d7 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -405,8 +405,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
 	if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
 		iova_len = roundup_pow_of_two(iova_len);
 
-	if (dev->bus_dma_mask)
-		dma_limit &= dev->bus_dma_mask;
+	dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);
 
 	if (domain->geometry.force_aperture)
 		dma_limit = min(dma_limit, domain->geometry.aperture_end);
diff --git a/drivers/of/device.c b/drivers/of/device.c
index da8158392010..e9127db7b067 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -93,7 +93,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma)
 	bool coherent;
 	unsigned long offset;
 	const struct iommu_ops *iommu;
-	u64 mask;
+	u64 mask, end;
 
 	ret = of_dma_get_range(np, &dma_addr, &paddr, &size);
 	if (ret < 0) {
@@ -148,12 +148,13 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma)
 	 * Limit coherent and dma mask based on size and default mask
 	 * set by the driver.
 	 */
-	mask = DMA_BIT_MASK(ilog2(dma_addr + size - 1) + 1);
+	end = dma_addr + size - 1;
+	mask = DMA_BIT_MASK(ilog2(end) + 1);
 	dev->coherent_dma_mask &= mask;
 	*dev->dma_mask &= mask;
-	/* ...but only set bus mask if we found valid dma-ranges earlier */
+	/* ...but only set bus limit if we found valid dma-ranges earlier */
 	if (!ret)
-		dev->bus_dma_mask = mask;
+		dev->bus_dma_limit = end;
 
 	coherent = of_dma_is_coherent(np);
 	dev_dbg(dev, "device is%sdma coherent\n",
diff --git a/include/linux/device.h b/include/linux/device.h
index 297239a08bb7..e396de656f20 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1186,8 +1186,8 @@ struct dev_links_info {
  * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
  * 		hardware supports 64-bit addresses for consistent allocations
  * 		such descriptors.
- * @bus_dma_mask: Mask of an upstream bridge or bus which imposes a smaller DMA
- *		limit than the device itself supports.
+ * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller
+ *		DMA limit than the device itself supports.
  * @dma_pfn_offset: offset of DMA memory range relatively of RAM
  * @dma_parms:	A low level driver may set these to teach IOMMU code about
  * 		segment limitations.
@@ -1270,7 +1270,7 @@ struct device {
 					     not all hardware supports
 					     64 bit addresses for consistent
 					     allocations such descriptors. */
-	u64		bus_dma_mask;	/* upstream dma_mask constraint */
+	u64		bus_dma_limit;	/* upstream dma constraint */
 	unsigned long	dma_pfn_offset;
 
 	struct device_dma_parameters *dma_parms;
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 452f5280cde3..24b8684aa21d 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -63,7 +63,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size,
 	    min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn)))
 		return false;
 
-	return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask);
+	return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
 }
 
 u64 dma_direct_get_required_mask(struct device *dev);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4d450672b7d6..c4d8741264bd 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -697,7 +697,7 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
  */
 static inline bool dma_addressing_limited(struct device *dev)
 {
-	return min_not_zero(dma_get_mask(dev), dev->bus_dma_mask) <
+	return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
 			    dma_get_required_mask(dev);
 }
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 267b23a13b69..6af7ae83c4ad 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -27,10 +27,10 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
 {
 	if (!dev->dma_mask) {
 		dev_err_once(dev, "DMA map on device without dma_mask\n");
-	} else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
+	} else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) {
 		dev_err_once(dev,
-			"overflow %pad+%zu of DMA mask %llx bus mask %llx\n",
-			&dma_addr, size, *dev->dma_mask, dev->bus_dma_mask);
+			"overflow %pad+%zu of DMA mask %llx bus limit %llx\n",
+			&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
 	}
 	WARN_ON_ONCE(1);
 }
@@ -57,15 +57,14 @@ u64 dma_direct_get_required_mask(struct device *dev)
 }
 
 static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
-		u64 *phys_mask)
+		u64 *phys_limit)
 {
-	if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
-		dma_mask = dev->bus_dma_mask;
+	u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
 
 	if (force_dma_unencrypted(dev))
-		*phys_mask = __dma_to_phys(dev, dma_mask);
+		*phys_limit = __dma_to_phys(dev, dma_limit);
 	else
-		*phys_mask = dma_to_phys(dev, dma_mask);
+		*phys_limit = dma_to_phys(dev, dma_limit);
 
 	/*
 	 * Optimistically try the zone that the physical address mask falls
@@ -75,9 +74,9 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
 	 * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding
 	 * zones.
 	 */
-	if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits))
+	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
 		return GFP_DMA;
-	if (*phys_mask <= DMA_BIT_MASK(32))
+	if (*phys_limit <= DMA_BIT_MASK(32))
 		return GFP_DMA32;
 	return 0;
 }
@@ -85,7 +84,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
 static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 {
 	return phys_to_dma_direct(dev, phys) + size - 1 <=
-			min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
+			min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
 }
 
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -94,7 +93,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 	size_t alloc_size = PAGE_ALIGN(size);
 	int node = dev_to_node(dev);
 	struct page *page = NULL;
-	u64 phys_mask;
+	u64 phys_limit;
 
 	if (attrs & DMA_ATTR_NO_WARN)
 		gfp |= __GFP_NOWARN;
@@ -102,7 +101,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 	/* we always manually zero the memory once we are done: */
 	gfp &= ~__GFP_ZERO;
 	gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
-			&phys_mask);
+			&phys_limit);
 	page = dma_alloc_contiguous(dev, alloc_size, gfp);
 	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
 		dma_free_contiguous(dev, page, alloc_size);
@@ -116,7 +115,7 @@ again:
 		page = NULL;
 
 		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
-		    phys_mask < DMA_BIT_MASK(64) &&
+		    phys_limit < DMA_BIT_MASK(64) &&
 		    !(gfp & (GFP_DMA32 | GFP_DMA))) {
 			gfp |= GFP_DMA32;
 			goto again;
-- 
cgit v1.2.3


From 93efbde2c331004d8053f04b4bf0ca3e630b474a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 20 Nov 2019 22:12:38 -0800
Subject: x86/traps: Disentangle the 32-bit and 64-bit doublefault code

The 64-bit doublefault handler is much nicer than the 32-bit one.
As a first step toward unifying them, make the 64-bit handler
self-contained.  This should have no effect no functional effect
except in the odd case of x86_64 with CONFIG_DOUBLEFAULT=n in which
case it will change the logging a bit.

This also gets rid of CONFIG_DOUBLEFAULT configurability on 64-bit
kernels.  It didn't do anything useful -- CONFIG_DOUBLEFAULT=n
didn't actually disable doublefault handling on x86_64.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig.debug           |  2 +-
 arch/x86/include/asm/processor.h |  1 -
 arch/x86/kernel/doublefault.c    | 11 -----------
 arch/x86/kernel/traps.c          | 12 +++---------
 4 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 409c00f74e60..c4eab8ed33a3 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -117,7 +117,7 @@ config DEBUG_WX
 
 config DOUBLEFAULT
 	default y
-	bool "Enable doublefault exception handler" if EXPERT
+	bool "Enable doublefault exception handler" if EXPERT && X86_32
 	---help---
 	  This option allows trapping of rare doublefault exceptions that
 	  would otherwise cause a system to silently reboot. Disabling this
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e51afbb0cbfb..f6c630097d9f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -997,7 +997,6 @@ bool xen_set_default_idle(void);
 #endif
 
 void stop_this_cpu(void *dummy);
-void df_debug(struct pt_regs *regs, long error_code);
 void microcode_check(void);
 
 enum l1tf_mitigations {
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 0d6c657593f8..0b3c616b61a3 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -72,15 +72,4 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = {
 	.__cr3		= __pa_nodebug(swapper_pg_dir),
 };
 
-/* dummy for do_double_fault() call */
-void df_debug(struct pt_regs *regs, long error_code) {}
-
-#else /* !CONFIG_X86_32 */
-
-void df_debug(struct pt_regs *regs, long error_code)
-{
-	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
-	show_regs(regs);
-	panic("Machine halted.");
-}
 #endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c90312146da0..76381b04dc93 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -411,15 +411,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
 		handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
 #endif
 
-#ifdef CONFIG_DOUBLEFAULT
-	df_debug(regs, error_code);
-#endif
-	/*
-	 * This is always a kernel trap and never fixable (and thus must
-	 * never return).
-	 */
-	for (;;)
-		die(str, regs, error_code);
+	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
+	show_regs(regs);
+	panic("Machine halted.");
 }
 #endif
 
-- 
cgit v1.2.3


From e99b6f46ee5c127d39d2f3a2682fdeef10386316 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 21 Nov 2019 09:42:30 -0800
Subject: x86/doublefault/32: Rename doublefault.c to doublefault_32.c

doublefault.c now only contains 32-bit code.  Rename it to
doublefault_32.c.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/Makefile         |  4 ++-
 arch/x86/kernel/doublefault.c    | 75 ----------------------------------------
 arch/x86/kernel/doublefault_32.c | 71 +++++++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 76 deletions(-)
 delete mode 100644 arch/x86/kernel/doublefault.c
 create mode 100644 arch/x86/kernel/doublefault_32.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 32acb970f416..6175e370ee4a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,7 +100,9 @@ obj-$(CONFIG_KEXEC_FILE)	+= kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-y				+= kprobes/
 obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_DOUBLEFAULT)	+= doublefault.o
+ifeq ($(CONFIG_X86_32),y)
+obj-$(CONFIG_DOUBLEFAULT)	+= doublefault_32.o
+endif
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
deleted file mode 100644
index 0b3c616b61a3..000000000000
--- a/arch/x86/kernel/doublefault.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-
-#include <linux/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-
-#ifdef CONFIG_X86_32
-
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
-
-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
-
-static void doublefault_fn(void)
-{
-	struct desc_ptr gdt_desc = {0, 0};
-	unsigned long gdt, tss;
-
-	native_store_gdt(&gdt_desc);
-	gdt = gdt_desc.address;
-
-	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
-
-	if (ptr_ok(gdt)) {
-		gdt += GDT_ENTRY_TSS << 3;
-		tss = get_desc_base((struct desc_struct *)gdt);
-		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
-
-		if (ptr_ok(tss)) {
-			struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
-
-			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
-			       t->ip, t->sp);
-
-			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
-				t->ax, t->bx, t->cx, t->dx);
-			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-				t->si, t->di);
-		}
-	}
-
-	for (;;)
-		cpu_relax();
-}
-
-struct x86_hw_tss doublefault_tss __cacheline_aligned = {
-	.sp0		= STACK_START,
-	.ss0		= __KERNEL_DS,
-	.ldt		= 0,
-	.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
-
-	.ip		= (unsigned long) doublefault_fn,
-	/* 0x2 bit is always set */
-	.flags		= X86_EFLAGS_SF | 0x2,
-	.sp		= STACK_START,
-	.es		= __USER_DS,
-	.cs		= __KERNEL_CS,
-	.ss		= __KERNEL_DS,
-	.ds		= __USER_DS,
-	.fs		= __KERNEL_PERCPU,
-#ifndef CONFIG_X86_32_LAZY_GS
-	.gs		= __KERNEL_STACK_CANARY,
-#endif
-
-	.__cr3		= __pa_nodebug(swapper_pg_dir),
-};
-
-#endif
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
new file mode 100644
index 000000000000..61c707ca8a09
--- /dev/null
+++ b/arch/x86/kernel/doublefault_32.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+
+#define DOUBLEFAULT_STACKSIZE (1024)
+static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
+#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+
+#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+
+static void doublefault_fn(void)
+{
+	struct desc_ptr gdt_desc = {0, 0};
+	unsigned long gdt, tss;
+
+	native_store_gdt(&gdt_desc);
+	gdt = gdt_desc.address;
+
+	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+
+	if (ptr_ok(gdt)) {
+		gdt += GDT_ENTRY_TSS << 3;
+		tss = get_desc_base((struct desc_struct *)gdt);
+		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+
+		if (ptr_ok(tss)) {
+			struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+
+			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
+			       t->ip, t->sp);
+
+			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
+				t->ax, t->bx, t->cx, t->dx);
+			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
+				t->si, t->di);
+		}
+	}
+
+	for (;;)
+		cpu_relax();
+}
+
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+	.sp0		= STACK_START,
+	.ss0		= __KERNEL_DS,
+	.ldt		= 0,
+	.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
+
+	.ip		= (unsigned long) doublefault_fn,
+	/* 0x2 bit is always set */
+	.flags		= X86_EFLAGS_SF | 0x2,
+	.sp		= STACK_START,
+	.es		= __USER_DS,
+	.cs		= __KERNEL_CS,
+	.ss		= __KERNEL_DS,
+	.ds		= __USER_DS,
+	.fs		= __KERNEL_PERCPU,
+#ifndef CONFIG_X86_32_LAZY_GS
+	.gs		= __KERNEL_STACK_CANARY,
+#endif
+
+	.__cr3		= __pa_nodebug(swapper_pg_dir),
+};
-- 
cgit v1.2.3


From dc4e0021b00b5a4ecba56fae509217776592b0aa Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 26 Nov 2019 18:27:16 +0100
Subject: x86/doublefault/32: Move #DF stack and TSS to cpu_entry_area

There are three problems with the current layout of the doublefault
stack and TSS.  First, the TSS is only cacheline-aligned, which is
not enough -- if the hardware portion of the TSS (struct x86_hw_tss)
crosses a page boundary, horrible things happen [0].  Second, the
stack and TSS are global, so simultaneous double faults on different
CPUs will cause massive corruption.  Third, the whole mechanism
won't work if user CR3 is loaded, resulting in a triple fault [1].

Let the doublefault stack and TSS share a page (which prevents the
TSS from spanning a page boundary), make it percpu, and move it into
cpu_entry_area.  Teach the stack dump code about the doublefault
stack.

[0] Real hardware will read past the end of the page onto the next
    *physical* page if a task switch happens.  Virtual machines may
    have any number of bugs, and I would consider it reasonable for
    a VM to summarily kill the guest if it tries to task-switch to
    a page-spanning TSS.

[1] Real hardware triple faults.  At least some VMs seem to hang.
    I'm not sure what's going on.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpu_entry_area.h   | 12 +++++++
 arch/x86/include/asm/doublefault.h      | 13 ++++++++
 arch/x86/include/asm/pgtable_32_types.h |  7 ++--
 arch/x86/include/asm/processor.h        |  1 -
 arch/x86/kernel/cpu/common.c            | 12 ++-----
 arch/x86/kernel/doublefault_32.c        | 58 ++++++++++++++++++++++-----------
 arch/x86/kernel/dumpstack_32.c          | 30 +++++++++++++++++
 arch/x86/mm/cpu_entry_area.c            | 14 +++++++-
 8 files changed, 113 insertions(+), 34 deletions(-)
 create mode 100644 arch/x86/include/asm/doublefault.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index ea866c7bf31d..804734058c77 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -65,6 +65,13 @@ enum exception_stack_ordering {
 
 #endif
 
+#ifdef CONFIG_X86_32
+struct doublefault_stack {
+	unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)];
+	struct x86_hw_tss tss;
+} __aligned(PAGE_SIZE);
+#endif
+
 /*
  * cpu_entry_area is a percpu region that contains things needed by the CPU
  * and early entry/exit code.  Real types aren't used for all fields here
@@ -86,6 +93,11 @@ struct cpu_entry_area {
 #endif
 	struct entry_stack_page entry_stack_page;
 
+#ifdef CONFIG_X86_32
+	char guard_doublefault_stack[PAGE_SIZE];
+	struct doublefault_stack doublefault_stack;
+#endif
+
 	/*
 	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
 	 * we need task switches to work, and task switches write to the TSS.
diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h
new file mode 100644
index 000000000000..af9a14ac8962
--- /dev/null
+++ b/arch/x86/include/asm/doublefault.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DOUBLEFAULT_H
+#define _ASM_X86_DOUBLEFAULT_H
+
+#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
+extern void doublefault_init_cpu_tss(void);
+#else
+static inline void doublefault_init_cpu_tss(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_DOUBLEFAULT_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 19f5807260c3..0416d42e5bdd 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -41,10 +41,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
 #endif
 
 /*
- * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
- * to avoid include recursion hell
+ * This is an upper bound on sizeof(struct cpu_entry_area) / PAGE_SIZE.
+ * Define this here and validate with BUILD_BUG_ON() in cpu_entry_area.c
+ * to avoid include recursion hell.
  */
-#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 41)
+#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 43)
 
 /* The +1 is for the readonly IDT page: */
 #define CPU_ENTRY_AREA_BASE	\
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f6c630097d9f..0340aad3f2fc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -166,7 +166,6 @@ enum cpuid_regs_idx {
 extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
-extern struct x86_hw_tss	doublefault_tss;
 extern __u32			cpu_caps_cleared[NCAPINTS + NBUGINTS];
 extern __u32			cpu_caps_set[NCAPINTS + NBUGINTS];
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index baa2fed8deb6..2e4d90294fe6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -24,6 +24,7 @@
 #include <asm/stackprotector.h>
 #include <asm/perf_event.h>
 #include <asm/mmu_context.h>
+#include <asm/doublefault.h>
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
@@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss)
 	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
 }
 
-static inline void gdt_setup_doublefault_tss(int cpu) { }
-
 #else /* CONFIG_X86_64 */
 
 static inline void setup_getcpu(int cpu) { }
@@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu)
 
 static inline void tss_setup_ist(struct tss_struct *tss) { }
 
-static inline void gdt_setup_doublefault_tss(int cpu)
-{
-#ifdef CONFIG_DOUBLEFAULT
-	/* Set up the doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-}
 #endif /* !CONFIG_X86_64 */
 
 static inline void tss_setup_io_bitmap(struct tss_struct *tss)
@@ -1923,7 +1915,7 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	gdt_setup_doublefault_tss(cpu);
+	doublefault_init_cpu_tss();
 
 	fpu__init_cpu();
 
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 61c707ca8a09..4eecfe4825ed 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -10,10 +10,6 @@
 #include <asm/processor.h>
 #include <asm/desc.h>
 
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
-
 #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
 
 static void doublefault_fn(void)
@@ -21,6 +17,8 @@ static void doublefault_fn(void)
 	struct desc_ptr gdt_desc = {0, 0};
 	unsigned long gdt, tss;
 
+	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+
 	native_store_gdt(&gdt_desc);
 	gdt = gdt_desc.address;
 
@@ -48,24 +46,46 @@ static void doublefault_fn(void)
 		cpu_relax();
 }
 
-struct x86_hw_tss doublefault_tss __cacheline_aligned = {
-	.sp0		= STACK_START,
-	.ss0		= __KERNEL_DS,
-	.ldt		= 0,
+DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
+	.tss = {
+                /*
+                 * No sp0 or ss0 -- we never run CPL != 0 with this TSS
+                 * active.  sp is filled in later.
+                 */
+		.ldt		= 0,
 	.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
 
-	.ip		= (unsigned long) doublefault_fn,
-	/* 0x2 bit is always set */
-	.flags		= X86_EFLAGS_SF | 0x2,
-	.sp		= STACK_START,
-	.es		= __USER_DS,
-	.cs		= __KERNEL_CS,
-	.ss		= __KERNEL_DS,
-	.ds		= __USER_DS,
-	.fs		= __KERNEL_PERCPU,
+		.ip		= (unsigned long) doublefault_fn,
+		/* 0x2 bit is always set */
+		.flags		= X86_EFLAGS_SF | 0x2,
+		.es		= __USER_DS,
+		.cs		= __KERNEL_CS,
+		.ss		= __KERNEL_DS,
+		.ds		= __USER_DS,
+		.fs		= __KERNEL_PERCPU,
 #ifndef CONFIG_X86_32_LAZY_GS
-	.gs		= __KERNEL_STACK_CANARY,
+		.gs		= __KERNEL_STACK_CANARY,
 #endif
 
-	.__cr3		= __pa_nodebug(swapper_pg_dir),
+		.__cr3		= __pa_nodebug(swapper_pg_dir),
+	},
 };
+
+void doublefault_init_cpu_tss(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+	/*
+	 * The linker isn't smart enough to initialize percpu variables that
+	 * point to other places in percpu space.
+	 */
+        this_cpu_write(doublefault_stack.tss.sp,
+                       (unsigned long)&cea->doublefault_stack.stack +
+                       sizeof(doublefault_stack.stack));
+
+	/* Set up doublefault TSS pointer in the GDT */
+	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 64a59d726639..8e3a8fedfa4d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_ENTRY)
 		return "ENTRY_TRAMPOLINE";
 
+	if (type == STACK_TYPE_EXCEPTION)
+		return "#DF";
+
 	return NULL;
 }
 
@@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
 	return true;
 }
 
+static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
+{
+#ifdef CONFIG_DOUBLEFAULT
+	struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
+	struct doublefault_stack *ss = &cea->doublefault_stack;
+
+	void *begin = ss->stack;
+	void *end = begin + sizeof(ss->stack);
+
+	if ((void *)stack < begin || (void *)stack >= end)
+		return false;
+
+	info->type	= STACK_TYPE_EXCEPTION;
+	info->begin	= begin;
+	info->end	= end;
+	info->next_sp	= (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
+
+	return true;
+#else
+	return false;
+#endif
+}
+
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info, unsigned long *visit_mask)
 {
@@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (in_softirq_stack(stack, info))
 		goto recursion_check;
 
+	if (in_doublefault_stack(stack, info))
+		goto recursion_check;
+
 	goto unknown;
 
 recursion_check:
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 82ead8e27888..56f9189bbadb 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
 DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
 #endif
 
+#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
+DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
+#endif
+
 struct cpu_entry_area *get_cpu_entry_area(int cpu)
 {
 	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
@@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
 	cea_map_stack(MCE);
 }
 #else
-static inline void percpu_setup_exception_stacks(unsigned int cpu) {}
+static inline void percpu_setup_exception_stacks(unsigned int cpu)
+{
+#ifdef CONFIG_DOUBLEFAULT
+	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+	cea_map_percpu_pages(&cea->doublefault_stack,
+			     &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL);
+#endif
+}
 #endif
 
 /* Setup the fixmap mappings only once per-processor */
-- 
cgit v1.2.3


From 7d8d8cfdee9a7bd6f9682f253fa98efdd8048a9e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 20 Nov 2019 23:06:41 -0800
Subject: x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with
 64-bit

The old x86_32 doublefault_fn() was old and crufty, and it did not
even try to recover.  do_double_fault() is much nicer.  Rewrite the
32-bit double fault code to sanitize CPU state and call
do_double_fault().  This is mostly an exercise i386 archaeology.

With this patch applied, 32-bit double faults get a real stack trace,
just like 64-bit double faults.

[ mingo: merged the patch to a later kernel base. ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S        |  42 +++++++++++++++
 arch/x86/include/asm/traps.h     |   3 ++
 arch/x86/kernel/doublefault_32.c | 107 +++++++++++++++++++++++++++------------
 arch/x86/kernel/traps.c          |  19 ++++++-
 4 files changed, 138 insertions(+), 33 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 5832b11f01bb..632432bb723d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1537,6 +1537,48 @@ SYM_CODE_START(debug)
 	jmp	common_exception
 SYM_CODE_END(debug)
 
+#ifdef CONFIG_DOUBLEFAULT
+SYM_CODE_START(double_fault)
+1:
+	/*
+	 * This is a task gate handler, not an interrupt gate handler.
+	 * The error code is on the stack, but the stack is otherwise
+	 * empty.  Interrupts are off.  Our state is sane with the following
+	 * exceptions:
+	 *
+	 *  - CR0.TS is set.  "TS" literally means "task switched".
+	 *  - EFLAGS.NT is set because we're a "nested task".
+	 *  - The doublefault TSS has back_link set and has been marked busy.
+	 *  - TR points to the doublefault TSS and the normal TSS is busy.
+	 *  - CR3 is the normal kernel PGD.  This would be delightful, except
+	 *    that the CPU didn't bother to save the old CR3 anywhere.  This
+	 *    would make it very awkward to return back to the context we came
+	 *    from.
+	 *
+	 * The rest of EFLAGS is sanitized for us, so we don't need to
+	 * worry about AC or DF.
+	 *
+	 * Don't even bother popping the error code.  It's always zero,
+	 * and ignoring it makes us a bit more robust against buggy
+	 * hypervisor task gate implementations.
+	 *
+	 * We will manually undo the task switch instead of doing a
+	 * task-switching IRET.
+	 */
+
+	clts				/* clear CR0.TS */
+	pushl	$X86_EFLAGS_FIXED
+	popfl				/* clear EFLAGS.NT */
+
+	call	doublefault_shim
+
+	/* We don't support returning, so we have no IRET here. */
+1:
+	hlt
+	jmp 1b
+SYM_CODE_END(double_fault)
+#endif
+
 /*
  * NMI is doubly nasty.  It can happen on the first instruction of
  * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b25e633033c3..ffa0dc8a535e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code);
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code);
 dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code);
 dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code);
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2);
+#endif
 dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code);
 dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
 dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 4eecfe4825ed..3793646f0fb5 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -9,42 +9,83 @@
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
+#include <asm/traps.h>
 
+extern void double_fault(void);
 #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
 
-static void doublefault_fn(void)
-{
-	struct desc_ptr gdt_desc = {0, 0};
-	unsigned long gdt, tss;
+#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
 
-	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+static void set_df_gdt_entry(unsigned int cpu);
 
-	native_store_gdt(&gdt_desc);
-	gdt = gdt_desc.address;
+/*
+ * Called by double_fault with CR0.TS and EFLAGS.NT cleared.  The CPU thinks
+ * we're running the doublefault task.  Cannot return.
+ */
+asmlinkage notrace void __noreturn doublefault_shim(void)
+{
+	unsigned long cr2;
+	struct pt_regs regs;
 
-	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
 
-	if (ptr_ok(gdt)) {
-		gdt += GDT_ENTRY_TSS << 3;
-		tss = get_desc_base((struct desc_struct *)gdt);
-		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+	cr2 = native_read_cr2();
 
-		if (ptr_ok(tss)) {
-			struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+	/* Reset back to the normal kernel task. */
+	force_reload_TR();
+	set_df_gdt_entry(smp_processor_id());
 
-			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
-			       t->ip, t->sp);
+	trace_hardirqs_off();
 
-			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
-				t->ax, t->bx, t->cx, t->dx);
-			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-				t->si, t->di);
-		}
-	}
+	/*
+	 * Fill in pt_regs.  A downside of doing this in C is that the unwinder
+	 * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
+	 * won't successfully unwind to the source of the double fault.
+	 * The main dump from do_double_fault() is fine, though, since it
+	 * uses these regs directly.
+	 *
+	 * If anyone ever cares, this could be moved to asm.
+	 */
+	regs.ss		= TSS(ss);
+	regs.__ssh	= 0;
+	regs.sp		= TSS(sp);
+	regs.flags	= TSS(flags);
+	regs.cs		= TSS(cs);
+	/* We won't go through the entry asm, so we can leave __csh as 0. */
+	regs.__csh	= 0;
+	regs.ip		= TSS(ip);
+	regs.orig_ax	= 0;
+	regs.gs		= TSS(gs);
+	regs.__gsh	= 0;
+	regs.fs		= TSS(fs);
+	regs.__fsh	= 0;
+	regs.es		= TSS(es);
+	regs.__esh	= 0;
+	regs.ds		= TSS(ds);
+	regs.__dsh	= 0;
+	regs.ax		= TSS(ax);
+	regs.bp		= TSS(bp);
+	regs.di		= TSS(di);
+	regs.si		= TSS(si);
+	regs.dx		= TSS(dx);
+	regs.cx		= TSS(cx);
+	regs.bx		= TSS(bx);
+
+	do_double_fault(&regs, 0, cr2);
 
-	for (;;)
-		cpu_relax();
+	/*
+	 * x86_32 does not save the original CR3 anywhere on a task switch.
+	 * This means that, even if we wanted to return, we would need to find
+	 * some way to reconstruct CR3.  We could make a credible guess based
+	 * on cpu_tlbstate, but that would be racy and would not account for
+	 * PTI.
+	 *
+	 * Instead, don't bother.  We can return through
+	 * rewind_stack_do_exit() instead.
+	 */
+	panic("cannot return from double fault\n");
 }
+NOKPROBE_SYMBOL(doublefault_shim);
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 	.tss = {
@@ -55,9 +96,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 		.ldt		= 0,
 	.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
 
-		.ip		= (unsigned long) doublefault_fn,
-		/* 0x2 bit is always set */
-		.flags		= X86_EFLAGS_SF | 0x2,
+		.ip		= (unsigned long) double_fault,
+		.flags		= X86_EFLAGS_FIXED,
 		.es		= __USER_DS,
 		.cs		= __KERNEL_CS,
 		.ss		= __KERNEL_DS,
@@ -71,6 +111,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
 	},
 };
 
+static void set_df_gdt_entry(unsigned int cpu)
+{
+	/* Set up doublefault TSS pointer in the GDT */
+	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
+
 void doublefault_init_cpu_tss(void)
 {
 	unsigned int cpu = smp_processor_id();
@@ -84,8 +132,5 @@ void doublefault_init_cpu_tss(void)
                        (unsigned long)&cea->doublefault_stack.stack +
                        sizeof(doublefault_stack.stack));
 
-	/* Set up doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
-		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
-
+	set_df_gdt_entry(cpu);
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 76381b04dc93..a9b16c3a933d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
 }
 #endif
 
-#ifdef CONFIG_X86_64
-/* Runs on IST stack */
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+/*
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
+ *
+ * On x86_64, this is more or less a normal kernel entry.  Notwithstanding the
+ * SDM's warnings about double faults being unrecoverable, returning works as
+ * expected.  Presumably what the SDM actually means is that the CPU may get
+ * the register state wrong on entry, so returning could be a bad idea.
+ *
+ * Various CPU engineers have promised that double faults due to an IRET fault
+ * while the stack is read-only are, in fact, recoverable.
+ *
+ * On x86_32, this is entered through a task gate, and regs are synthesized
+ * from the TSS.  Returning is, in principle, okay, but changes to regs will
+ * be lost.  If, for some reason, we need to return to a context with modified
+ * regs, the shim code could be adjusted to synchronize the registers.
+ */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
 {
 	static const char str[] = "double fault";
-- 
cgit v1.2.3


From 0337b7ebfcb8efb4ea0a9f2b2f284217a1c0e62d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 25 Nov 2019 22:37:44 -0800
Subject: x86/traps: die() instead of panicking on a double fault

A double fault has a decent chance of being recoverable by killing
the offending thread.  Use die() so that we at least try to recover.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a9b16c3a933d..05da6b5b167b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -427,7 +427,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
 #endif
 
 	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
-	show_regs(regs);
+	die("double fault", regs, error_code);
 	panic("Machine halted.");
 }
 #endif
-- 
cgit v1.2.3


From 8e05f1b4f27d07a0f93e7c6fd28525a5d082b85c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 15 Jul 2019 10:08:48 -0700
Subject: x86/ptrace: Remove set_segment_reg() implementations for current

seg_segment_reg() should be unreachable with task == current.
Rather than confusingly trying to make it work, just explicitly
disable this case.

(regset->get is used for current in the coredump code, but the ->set
 interface is only used for ptrace, and you can't ptrace yourself.)

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/ptrace.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 066e5b01a7e0..3b3b16932589 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -182,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
 static int set_segment_reg(struct task_struct *task,
 			   unsigned long offset, u16 value)
 {
+	if (WARN_ON_ONCE(task == current))
+		return -EIO;
+
 	/*
 	 * The value argument was already truncated to 16 bits.
 	 */
@@ -209,10 +212,7 @@ static int set_segment_reg(struct task_struct *task,
 		break;
 
 	case offsetof(struct user_regs_struct, gs):
-		if (task == current)
-			set_user_gs(task_pt_regs(task), value);
-		else
-			task_user_gs(task) = value;
+		task_user_gs(task) = value;
 	}
 
 	return 0;
@@ -272,6 +272,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
 static int set_segment_reg(struct task_struct *task,
 			   unsigned long offset, u16 value)
 {
+	if (WARN_ON_ONCE(task == current))
+		return -EIO;
+
 	/*
 	 * The value argument was already truncated to 16 bits.
 	 */
@@ -281,23 +284,15 @@ static int set_segment_reg(struct task_struct *task,
 	switch (offset) {
 	case offsetof(struct user_regs_struct,fs):
 		task->thread.fsindex = value;
-		if (task == current)
-			loadsegment(fs, task->thread.fsindex);
 		break;
 	case offsetof(struct user_regs_struct,gs):
 		task->thread.gsindex = value;
-		if (task == current)
-			load_gs_index(task->thread.gsindex);
 		break;
 	case offsetof(struct user_regs_struct,ds):
 		task->thread.ds = value;
-		if (task == current)
-			loadsegment(ds, task->thread.ds);
 		break;
 	case offsetof(struct user_regs_struct,es):
 		task->thread.es = value;
-		if (task == current)
-			loadsegment(es, task->thread.es);
 		break;
 
 		/*
-- 
cgit v1.2.3


From 56f2ab41b652251f336a0f471b1033afeaedd161 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 17 Jul 2019 06:44:16 -0700
Subject: x86/ptrace: Document FSBASE and GSBASE ABI oddities

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/ptrace.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3b3b16932589..f0e1ddbc2fd7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -281,6 +281,20 @@ static int set_segment_reg(struct task_struct *task,
 	if (invalid_selector(value))
 		return -EIO;
 
+	/*
+	 * This function has some ABI oddities.
+	 *
+	 * A 32-bit ptracer probably expects that writing FS or GS will change
+	 * FSBASE or GSBASE respectively.  In the absence of FSGSBASE support,
+	 * this code indeed has that effect.  When FSGSBASE is added, this
+	 * will require a special case.
+	 *
+	 * For existing 64-bit ptracers, writing FS or GS *also* currently
+	 * changes the base if the selector is nonzero the next time the task
+	 * is run.  This behavior may not be needed, and trying to preserve it
+	 * when FSGSBASE is added would be complicated at best.
+	 */
+
 	switch (offset) {
 	case offsetof(struct user_regs_struct,fs):
 		task->thread.fsindex = value;
@@ -370,6 +384,9 @@ static int putreg(struct task_struct *child,
 		 * When changing the FS base, use do_arch_prctl_64()
 		 * to set the index to zero and to set the base
 		 * as requested.
+		 *
+		 * NB: This behavior is nonsensical and likely needs to
+		 * change when FSGSBASE support is added.
 		 */
 		if (child->thread.fsbase != value)
 			return do_arch_prctl_64(child, ARCH_SET_FS, value);
-- 
cgit v1.2.3


From 5a43b87b3c62ad149ba6e9d0d3e5c0e5da02a5ca Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Thu, 28 Nov 2019 07:08:24 -0800
Subject: x86/mce/therm_throt: Mask out read-only and reserved MSR bits

While writing to MSR IA32_THERM_STATUS/IA32_PKG_THERM_STATUS, avoid
writing 1 to read only and reserved fields because updating some fields
generates exception.

 [ bp: Vertically align for better readability. ]

Fixes: f6656208f04e ("x86/mce/therm_throt: Optimize notifications of thermal throttle")
Reported-by: Dominik Brodowski <linux@dominikbrodowski.net>
Tested-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20191128150824.22413-1-srinivas.pandruvada@linux.intel.com
---
 arch/x86/kernel/cpu/mce/therm_throt.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index d01e0da0163a..b38010b541d6 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -195,17 +195,24 @@ static const struct attribute_group thermal_attr_group = {
 #define THERM_THROT_POLL_INTERVAL	HZ
 #define THERM_STATUS_PROCHOT_LOG	BIT(1)
 
+#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
+#define THERM_STATUS_CLEAR_PKG_MASK  (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
+
 static void clear_therm_status_log(int level)
 {
 	int msr;
-	u64 msr_val;
+	u64 mask, msr_val;
 
-	if (level == CORE_LEVEL)
-		msr = MSR_IA32_THERM_STATUS;
-	else
-		msr = MSR_IA32_PACKAGE_THERM_STATUS;
+	if (level == CORE_LEVEL) {
+		msr  = MSR_IA32_THERM_STATUS;
+		mask = THERM_STATUS_CLEAR_CORE_MASK;
+	} else {
+		msr  = MSR_IA32_PACKAGE_THERM_STATUS;
+		mask = THERM_STATUS_CLEAR_PKG_MASK;
+	}
 
 	rdmsrl(msr, msr_val);
+	msr_val &= mask;
 	wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
 }
 
-- 
cgit v1.2.3


From 7b0b8cfd261c569177d64d6e9b1800fbe412fd65 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 30 Nov 2019 16:00:53 +0100
Subject: x86/ioperm: Save an indentation level in tss_update_io_bitmap()

... for better readability.

No functional changes.

[ Minor edit. ]

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process.c | 52 +++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index bd2a11ca5dd6..61e93a318983 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -377,37 +377,37 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
 void tss_update_io_bitmap(void)
 {
 	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+	struct thread_struct *t = &current->thread;
 	u16 *base = &tss->x86_tss.io_bitmap_base;
 
-	if (test_thread_flag(TIF_IO_BITMAP)) {
-		struct thread_struct *t = &current->thread;
-
-		if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
-			*base = IO_BITMAP_OFFSET_VALID_ALL;
-		} else {
-			struct io_bitmap *iobm = t->io_bitmap;
-			/*
-			 * Only copy bitmap data when the sequence number
-			 * differs. The update time is accounted to the
-			 * incoming task.
-			 */
-			if (tss->io_bitmap.prev_sequence != iobm->sequence)
-				tss_copy_io_bitmap(tss, iobm);
-
-			/* Enable the bitmap */
-			*base = IO_BITMAP_OFFSET_VALID_MAP;
-		}
+	if (!test_thread_flag(TIF_IO_BITMAP)) {
+		tss_invalidate_io_bitmap(tss);
+		return;
+	}
+
+	if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
+		*base = IO_BITMAP_OFFSET_VALID_ALL;
+	} else {
+		struct io_bitmap *iobm = t->io_bitmap;
+
 		/*
-		 * Make sure that the TSS limit is covering the io bitmap.
-		 * It might have been cut down by a VMEXIT to 0x67 which
-		 * would cause a subsequent I/O access from user space to
-		 * trigger a #GP because tbe bitmap is outside the TSS
-		 * limit.
+		 * Only copy bitmap data when the sequence number differs. The
+		 * update time is accounted to the incoming task.
 		 */
-		refresh_tss_limit();
-	} else {
-		tss_invalidate_io_bitmap(tss);
+		if (tss->io_bitmap.prev_sequence != iobm->sequence)
+			tss_copy_io_bitmap(tss, iobm);
+
+		/* Enable the bitmap */
+		*base = IO_BITMAP_OFFSET_VALID_MAP;
 	}
+
+	/*
+	 * Make sure that the TSS limit is covering the IO bitmap. It might have
+	 * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
+	 * access from user space to trigger a #GP because tbe bitmap is outside
+	 * the TSS limit.
+	 */
+	refresh_tss_limit();
 }
 #else /* CONFIG_X86_IOPL_IOPERM */
 static inline void switch_to_bitmap(unsigned long tifp) { }
-- 
cgit v1.2.3