summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-05-06 15:56:41 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-05-06 15:56:41 -0700
commit8f147727030bf9e81331ab9b8f42d4611bb6a3d9 (patch)
treed3f1e2410174bb8c479590a8f1c7e204e3a48eaf /arch/x86/kernel
parent53f8b081c184328b82c8a7b5e70b8243b3cea8bd (diff)
parent2c4645439e8f2f6e7c37f158feae6f6a82baa910 (diff)
downloadlinux-8f147727030bf9e81331ab9b8f42d4611bb6a3d9.tar.bz2
Merge branch 'x86-irq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 irq updates from Ingo Molnar: "Here are the main changes in this tree: - Introduce x86-64 IRQ/exception/debug stack guard pages to detect stack overflows immediately and deterministically. - Clean up over a decade worth of cruft accumulated. The outcome of this should be more clear-cut faults/crashes when any of the low level x86 CPU stacks overflow, instead of silent memory corruption and sporadic failures much later on" * 'x86-irq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits) x86/irq: Fix outdated comments x86/irq/64: Remove stack overflow debug code x86/irq/64: Remap the IRQ stack with guard pages x86/irq/64: Split the IRQ stack into its own pages x86/irq/64: Init hardirq_stack_ptr during CPU hotplug x86/irq/32: Handle irq stack allocation failure proper x86/irq/32: Invoke irq_ctx_init() from init_IRQ() x86/irq/64: Rename irq_stack_ptr to hardirq_stack_ptr x86/irq/32: Rename hard/softirq_stack to hard/softirq_stack_ptr x86/irq/32: Make irq stack a character array x86/irq/32: Define IRQ_STACK_SIZE x86/dumpstack/64: Speedup in_exception_stack() x86/exceptions: Split debug IST stack x86/exceptions: Enable IST guard pages x86/exceptions: Disconnect IST index and stack order x86/cpu: Remove orig_ist array x86/cpu: Prepare TSS.IST setup for guard pages x86/dumpstack/64: Use cpu_entry_area instead of orig_ist x86/irq/64: Use cpu entry area instead of orig_ist x86/traps: Use cpu_entry_area instead of orig_ist ...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/cpu/common.c60
-rw-r--r--arch/x86/kernel/dumpstack_32.c8
-rw-r--r--arch/x86/kernel/dumpstack_64.c99
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/idt.c19
-rw-r--r--arch/x86/kernel/irq_32.c41
-rw-r--r--arch/x86/kernel/irq_64.c89
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/nmi.c20
-rw-r--r--arch/x86/kernel/setup_percpu.c5
-rw-r--r--arch/x86/kernel/smpboot.c15
-rw-r--r--arch/x86/kernel/vmlinux.lds.S7
13 files changed, 199 insertions, 174 deletions
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index ddced33184b5..d3d075226c0a 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -68,10 +68,12 @@ int main(void)
#undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+ DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) -
+ offsetof(struct cea_exception_stacks, DB1_stack));
BLANK();
#ifdef CONFIG_STACKPROTECTOR
- DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+ DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary));
BLANK();
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 95a5faf3a6a0..37f7d438a6ef 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -507,19 +507,6 @@ void load_percpu_segment(int cpu)
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
-#ifdef CONFIG_X86_64
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
-};
-#endif
-
/* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu)
{
@@ -1511,9 +1498,9 @@ static __init int setup_clearcpuid(char *arg)
__setup("clearcpuid=", setup_clearcpuid);
#ifdef CONFIG_X86_64
-DEFINE_PER_CPU_FIRST(union irq_stack_union,
- irq_stack_union) __aligned(PAGE_SIZE) __visible;
-EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union);
+DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
+ fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
+EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
/*
* The following percpu variables are hot. Align current_task to
@@ -1523,9 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(char *, irq_stack_ptr) =
- init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
-
+DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
@@ -1562,23 +1547,7 @@ void syscall_init(void)
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
-static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
DEFINE_PER_CPU(int, debug_stack_usage);
-
-int is_debug_stack(unsigned long addr)
-{
- return __this_cpu_read(debug_stack_usage) ||
- (addr <= __this_cpu_read(debug_stack_addr) &&
- addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ));
-}
-NOKPROBE_SYMBOL(is_debug_stack);
-
DEFINE_PER_CPU(u32, debug_idt_ctr);
void debug_stack_set_zero(void)
@@ -1690,17 +1659,14 @@ static void setup_getcpu(int cpu)
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
* 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init for 64 bit
*/
#ifdef CONFIG_X86_64
void cpu_init(void)
{
- struct orig_ist *oist;
+ int cpu = raw_smp_processor_id();
struct task_struct *me;
struct tss_struct *t;
- unsigned long v;
- int cpu = raw_smp_processor_id();
int i;
wait_for_master_cpu(cpu);
@@ -1715,7 +1681,6 @@ void cpu_init(void)
load_ucode_ap();
t = &per_cpu(cpu_tss_rw, cpu);
- oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
@@ -1753,16 +1718,11 @@ void cpu_init(void)
/*
* set up and load the per-CPU TSS
*/
- if (!oist->ist[0]) {
- char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
-
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- estacks += exception_stack_sizes[v];
- oist->ist[v] = t->x86_tss.ist[v] =
- (unsigned long)estacks;
- if (v == DEBUG_STACK-1)
- per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
- }
+ if (!t->x86_tss.ist[0]) {
+ t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+ t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+ t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+ t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
}
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index cd53f3030e40..64a59d726639 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -34,14 +34,14 @@ const char *stack_type_name(enum stack_type type)
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack);
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack <= begin || stack > end)
+ if (stack < begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
@@ -59,14 +59,14 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack);
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack <= begin || stack > end)
+ if (stack < begin || stack > end)
return false;
info->type = STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5cdb9e84da57..753b8cfe8b8a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,23 +16,21 @@
#include <linux/bug.h>
#include <linux/nmi.h>
+#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
-static char *exception_stack_names[N_EXCEPTION_STACKS] = {
- [ DOUBLEFAULT_STACK-1 ] = "#DF",
- [ NMI_STACK-1 ] = "NMI",
- [ DEBUG_STACK-1 ] = "#DB",
- [ MCE_STACK-1 ] = "#MC",
-};
-
-static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
+static const char * const exception_stack_names[] = {
+ [ ESTACK_DF ] = "#DF",
+ [ ESTACK_NMI ] = "NMI",
+ [ ESTACK_DB2 ] = "#DB2",
+ [ ESTACK_DB1 ] = "#DB1",
+ [ ESTACK_DB ] = "#DB",
+ [ ESTACK_MCE ] = "#MC",
};
const char *stack_type_name(enum stack_type type)
{
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
if (type == STACK_TYPE_IRQ)
return "IRQ";
@@ -52,43 +50,84 @@ const char *stack_type_name(enum stack_type type)
return NULL;
}
+/**
+ * struct estack_pages - Page descriptor for exception stacks
+ * @offs: Offset from the start of the exception stack area
+ * @size: Size of the exception stack
+ * @type: Type to store in the stack_info struct
+ */
+struct estack_pages {
+ u32 offs;
+ u16 size;
+ u16 type;
+};
+
+#define EPAGERANGE(st) \
+ [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \
+ PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \
+ .offs = CEA_ESTACK_OFFS(st), \
+ .size = CEA_ESTACK_SIZE(st), \
+ .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, }
+
+/*
+ * Array of exception stack page descriptors. If the stack is larger than
+ * PAGE_SIZE, all pages covering a particular stack will have the same
+ * info. The guard pages including the not mapped DB2 stack are zeroed
+ * out.
+ */
+static const
+struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
+ EPAGERANGE(DF),
+ EPAGERANGE(NMI),
+ EPAGERANGE(DB1),
+ EPAGERANGE(DB),
+ EPAGERANGE(MCE),
+};
+
static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin, *end;
+ unsigned long begin, end, stk = (unsigned long)stack;
+ const struct estack_pages *ep;
struct pt_regs *regs;
- unsigned k;
+ unsigned int k;
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k];
- begin = end - (exception_stack_sizes[k] / sizeof(long));
- regs = (struct pt_regs *)end - 1;
-
- if (stack <= begin || stack >= end)
- continue;
+ begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
+ end = begin + sizeof(struct cea_exception_stacks);
+ /* Bail if @stack is outside the exception stack area. */
+ if (stk < begin || stk >= end)
+ return false;
- info->type = STACK_TYPE_EXCEPTION + k;
- info->begin = begin;
- info->end = end;
- info->next_sp = (unsigned long *)regs->sp;
+ /* Calc page offset from start of exception stacks */
+ k = (stk - begin) >> PAGE_SHIFT;
+ /* Lookup the page descriptor */
+ ep = &estack_pages[k];
+ /* Guard page? */
+ if (!ep->size)
+ return false;
- return true;
- }
+ begin += (unsigned long)ep->offs;
+ end = begin + (unsigned long)ep->size;
+ regs = (struct pt_regs *)end - 1;
- return false;
+ info->type = ep->type;
+ info->begin = (unsigned long *)begin;
+ info->end = (unsigned long *)end;
+ info->next_sp = (unsigned long *)regs->sp;
+ return true;
}
static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+ unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
/*
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack <= begin || stack > end)
+ if (stack < begin || stack >= end)
return false;
info->type = STACK_TYPE_IRQ;
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d1dbe8e4eb82..bcd206c8ac90 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -265,7 +265,7 @@ ENDPROC(start_cpu0)
GLOBAL(initial_code)
.quad x86_64_start_kernel
GLOBAL(initial_gs)
- .quad INIT_PER_CPU_VAR(irq_stack_union)
+ .quad INIT_PER_CPU_VAR(fixed_percpu_data)
GLOBAL(initial_stack)
/*
* The SIZEOF_PTREGS gap is a convention which helps the in-kernel
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 01adea278a71..6d8917875f44 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -41,13 +41,12 @@ struct idt_data {
#define SYSG(_vector, _addr) \
G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
-/* Interrupt gate with interrupt stack */
+/*
+ * Interrupt gate with interrupt stack. The _ist index is the index in
+ * the tss.ist[] array, but for the descriptor it needs to start at 1.
+ */
#define ISTG(_vector, _addr, _ist) \
- G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS)
-
-/* System interrupt gate with interrupt stack */
-#define SISTG(_vector, _addr, _ist) \
- G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+ G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
/* Task gate */
#define TSKG(_vector, _gdt) \
@@ -184,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
* cpu_init() when the TSS has been initialized.
*/
static const __initconst struct idt_data ist_idts[] = {
- ISTG(X86_TRAP_DB, debug, DEBUG_STACK),
- ISTG(X86_TRAP_NMI, nmi, NMI_STACK),
- ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK),
+ ISTG(X86_TRAP_DB, debug, IST_INDEX_DB),
+ ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI),
+ ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF),
#ifdef CONFIG_X86_MCE
- ISTG(X86_TRAP_MC, &machine_check, MCE_STACK),
+ ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE),
#endif
};
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 95600a99ae93..fc34816c6f04 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; }
static inline void print_stack_overflow(void) { }
#endif
-DEFINE_PER_CPU(struct irq_stack *, hardirq_stack);
-DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
+DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
+DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
static void call_on_stack(void *func, void *stack)
{
@@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
u32 *isp, *prev_esp, arg1;
curstk = (struct irq_stack *) current_stack();
- irqstk = __this_cpu_read(hardirq_stack);
+ irqstk = __this_cpu_read(hardirq_stack_ptr);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -107,27 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
}
/*
- * allocate per-cpu stacks for hardirq and for softirq processing
+ * Allocate per-cpu stacks for hardirq and softirq processing
*/
-void irq_ctx_init(int cpu)
+int irq_init_percpu_irqstack(unsigned int cpu)
{
- struct irq_stack *irqstk;
-
- if (per_cpu(hardirq_stack, cpu))
- return;
+ int node = cpu_to_node(cpu);
+ struct page *ph, *ps;
- irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREADINFO_GFP,
- THREAD_SIZE_ORDER));
- per_cpu(hardirq_stack, cpu) = irqstk;
+ if (per_cpu(hardirq_stack_ptr, cpu))
+ return 0;
- irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREADINFO_GFP,
- THREAD_SIZE_ORDER));
- per_cpu(softirq_stack, cpu) = irqstk;
+ ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
+ if (!ph)
+ return -ENOMEM;
+ ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
+ if (!ps) {
+ __free_pages(ph, THREAD_SIZE_ORDER);
+ return -ENOMEM;
+ }
- printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
- cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
+ per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
+ per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
+ return 0;
}
void do_softirq_own_stack(void)
@@ -135,7 +136,7 @@ void do_softirq_own_stack(void)
struct irq_stack *irqstk;
u32 *isp, *prev_esp;
- irqstk = __this_cpu_read(softirq_stack);
+ irqstk = __this_cpu_read(softirq_stack_ptr);
/* build the stack frame on the softirq stack */
isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 0469cd078db1..6bf6517a05bb 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,63 +18,64 @@
#include <linux/uaccess.h>
#include <linux/smp.h>
#include <linux/sched/task_stack.h>
+
+#include <asm/cpu_entry_area.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
-int sysctl_panic_on_stackoverflow;
+DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible;
+DECLARE_INIT_PER_CPU(irq_stack_backing_store);
-/*
- * Probabilistic stack overflow check:
- *
- * Only check the stack in process context, because everything else
- * runs on the big interrupt stacks. Checking reliably is too expensive,
- * so we just check from interrupts.
- */
-static inline void stack_overflow_check(struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
-#define STACK_TOP_MARGIN 128
- struct orig_ist *oist;
- u64 irq_stack_top, irq_stack_bottom;
- u64 estack_top, estack_bottom;
- u64 curbase = (u64)task_stack_page(current);
+ if (IS_ERR_OR_NULL(desc))
+ return false;
- if (user_mode(regs))
- return;
+ generic_handle_irq_desc(desc);
+ return true;
+}
- if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
- regs->sp <= curbase + THREAD_SIZE)
- return;
+#ifdef CONFIG_VMAP_STACK
+/*
+ * VMAP the backing store with guard pages
+ */
+static int map_irq_stack(unsigned int cpu)
+{
+ char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu);
+ struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE];
+ void *va;
+ int i;
- irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) +
- STACK_TOP_MARGIN;
- irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr);
- if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
- return;
+ for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) {
+ phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT));
- oist = this_cpu_ptr(&orig_ist);
- estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
- estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
- if (regs->sp >= estack_top && regs->sp <= estack_bottom)
- return;
+ pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
+ }
- WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
- current->comm, curbase, regs->sp,
- irq_stack_top, irq_stack_bottom,
- estack_top, estack_bottom, (void *)regs->ip);
+ va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+ if (!va)
+ return -ENOMEM;
- if (sysctl_panic_on_stackoverflow)
- panic("low stack detected by irq handler - check messages\n");
-#endif
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
+ return 0;
}
-
-bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
+#else
+/*
+ * If VMAP stacks are disabled due to KASAN, just use the per cpu
+ * backing store without guard pages.
+ */
+static int map_irq_stack(unsigned int cpu)
{
- stack_overflow_check(regs);
+ void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
- if (IS_ERR_OR_NULL(desc))
- return false;
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
+ return 0;
+}
+#endif
- generic_handle_irq_desc(desc);
- return true;
+int irq_init_percpu_irqstack(unsigned int cpu)
+{
+ if (per_cpu(hardirq_stack_ptr, cpu))
+ return 0;
+ return map_irq_stack(cpu);
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a0693b71cfc1..16919a9671fa 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -91,6 +91,8 @@ void __init init_IRQ(void)
for (i = 0; i < nr_legacy_irqs(); i++)
per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
+ BUG_ON(irq_init_percpu_irqstack(smp_processor_id()));
+
x86_init.irqs.intr_init();
}
@@ -104,6 +106,4 @@ void __init native_init_IRQ(void)
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
-
- irq_ctx_init(smp_processor_id());
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 18bc9b51ac9b..3755d0310026 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -21,13 +21,14 @@
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/atomic.h>
#include <linux/sched/clock.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
-#include <linux/atomic.h>
+#include <asm/cpu_entry_area.h>
#include <asm/traps.h>
#include <asm/mach_traps.h>
#include <asm/nmi.h>
@@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2);
* switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
+
+static bool notrace is_debug_stack(unsigned long addr)
+{
+ struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
+ unsigned long top = CEA_ESTACK_TOP(cs, DB);
+ unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
+
+ if (__this_cpu_read(debug_stack_usage))
+ return true;
+ /*
+ * Note, this covers the guard page between DB and DB1 as well to
+ * avoid two checks. But by all means @addr can never point into
+ * the guard page.
+ */
+ return addr >= bot && addr < top;
+}
+NOKPROBE_SYMBOL(is_debug_stack);
#endif
dotraplinkage notrace void
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 4bf46575568a..86663874ef04 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void)
per_cpu(x86_cpu_to_logical_apicid, cpu) =
early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
#endif
-#ifdef CONFIG_X86_64
- per_cpu(irq_stack_ptr, cpu) =
- per_cpu(irq_stack_union.irq_stack, cpu) +
- IRQ_STACK_SIZE;
-#endif
#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
early_per_cpu_map(x86_cpu_to_node_map, cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ce1a67b70168..c92b21f9e9dc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -935,20 +935,27 @@ out:
return boot_error;
}
-void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+int common_cpu_up(unsigned int cpu, struct task_struct *idle)
{
+ int ret;
+
/* Just in case we booted with a single CPU. */
alternatives_enable_smp();
per_cpu(current_task, cpu) = idle;
+ /* Initialize the interrupt stack(s) */
+ ret = irq_init_percpu_irqstack(cpu);
+ if (ret)
+ return ret;
+
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
- irq_ctx_init(cpu);
per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#else
initial_gs = per_cpu_offset(cpu);
#endif
+ return 0;
}
/*
@@ -1106,7 +1113,9 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
/* the FPU context is blank, nobody can own it */
per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
- common_cpu_up(cpu, tidle);
+ err = common_cpu_up(cpu, tidle);
+ if (err)
+ return err;
err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
if (err) {
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index a5127b2c195f..4d1517022a14 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -403,7 +403,8 @@ SECTIONS
*/
#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
+INIT_PER_CPU(fixed_percpu_data);
+INIT_PER_CPU(irq_stack_backing_store);
/*
* Build-time check on the image size:
@@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union);
"kernel image bigger than KERNEL_IMAGE_SIZE");
#ifdef CONFIG_SMP
-. = ASSERT((irq_stack_union == 0),
- "irq_stack_union is not at start of per-cpu area");
+. = ASSERT((fixed_percpu_data == 0),
+ "fixed_percpu_data is not at start of per-cpu area");
#endif
#endif /* CONFIG_X86_32 */