diff options
author | Ingo Molnar <mingo@kernel.org> | 2019-12-10 10:11:00 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2019-12-10 10:11:00 +0100 |
commit | 2040cf9f59037aa8aec749363e69ead165b67b43 (patch) | |
tree | e9c15448e841cc493bc80b9f658d7955623e86dd /arch/x86/kernel | |
parent | f66c0447cca1281116224d474cdb37d6a18e4b5b (diff) | |
parent | e42617b825f8073569da76dc4510bfa019b1c35a (diff) | |
download | linux-2040cf9f59037aa8aec749363e69ead165b67b43.tar.bz2 |
Merge tag 'v5.5-rc1' into core/kprobes, to resolve conflicts
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/Makefile | 4 | ||||
-rw-r--r-- | arch/x86/kernel/amd_gart_64.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 12 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/therm_throt.c | 17 | ||||
-rw-r--r-- | arch/x86/kernel/doublefault.c | 86 | ||||
-rw-r--r-- | arch/x86/kernel/doublefault_32.c | 136 | ||||
-rw-r--r-- | arch/x86/kernel/dumpstack_32.c | 30 | ||||
-rw-r--r-- | arch/x86/kernel/e820.c | 12 | ||||
-rw-r--r-- | arch/x86/kernel/ftrace.c | 14 | ||||
-rw-r--r-- | arch/x86/kernel/ftrace_64.S | 42 | ||||
-rw-r--r-- | arch/x86/kernel/pci-dma.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/process.c | 52 | ||||
-rw-r--r-- | arch/x86/kernel/ptrace.c | 36 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 18 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 31 |
15 files changed, 324 insertions, 172 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 32acb970f416..6175e370ee4a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,7 +100,9 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_DOUBLEFAULT) += doublefault.o +ifeq ($(CONFIG_X86_32),y) +obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o +endif obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 4bbccb9d16dc..4e5f50236048 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -185,13 +185,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || !dma_capable(dev, addr, size); + return force_iommu || !dma_capable(dev, addr, size, true); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !dma_capable(dev, addr, size); + return !dma_capable(dev, addr, size, true); } /* Map a single continuous physical area into the IOMMU. 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index baa2fed8deb6..2e4d90294fe6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -24,6 +24,7 @@ #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> +#include <asm/doublefault.h> #include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> @@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss) tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); } -static inline void gdt_setup_doublefault_tss(int cpu) { } - #else /* CONFIG_X86_64 */ static inline void setup_getcpu(int cpu) { } @@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu) static inline void tss_setup_ist(struct tss_struct *tss) { } -static inline void gdt_setup_doublefault_tss(int cpu) -{ -#ifdef CONFIG_DOUBLEFAULT - /* Set up the doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); -#endif -} #endif /* !CONFIG_X86_64 */ static inline void tss_setup_io_bitmap(struct tss_struct *tss) @@ -1923,7 +1915,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - gdt_setup_doublefault_tss(cpu); + doublefault_init_cpu_tss(); fpu__init_cpu(); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index d01e0da0163a..b38010b541d6 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -195,17 +195,24 @@ static const struct attribute_group thermal_attr_group = { #define THERM_THROT_POLL_INTERVAL HZ #define THERM_STATUS_PROCHOT_LOG BIT(1) +#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) +#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) + static void clear_therm_status_log(int level) { int msr; - u64 msr_val; + u64 mask, msr_val; - if (level == CORE_LEVEL) - msr = 
MSR_IA32_THERM_STATUS; - else - msr = MSR_IA32_PACKAGE_THERM_STATUS; + if (level == CORE_LEVEL) { + msr = MSR_IA32_THERM_STATUS; + mask = THERM_STATUS_CLEAR_CORE_MASK; + } else { + msr = MSR_IA32_PACKAGE_THERM_STATUS; + mask = THERM_STATUS_CLEAR_PKG_MASK; + } rdmsrl(msr, msr_val); + msr_val &= mask; wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); } diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c deleted file mode 100644 index 0d6c657593f8..000000000000 --- a/arch/x86/kernel/doublefault.c +++ /dev/null @@ -1,86 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/sched/debug.h> -#include <linux/init_task.h> -#include <linux/fs.h> - -#include <linux/uaccess.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> - -#ifdef CONFIG_X86_32 - -#define DOUBLEFAULT_STACKSIZE (1024) -static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; -#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) - -#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) - -static void doublefault_fn(void) -{ - struct desc_ptr gdt_desc = {0, 0}; - unsigned long gdt, tss; - - native_store_gdt(&gdt_desc); - gdt = gdt_desc.address; - - printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); - - if (ptr_ok(gdt)) { - gdt += GDT_ENTRY_TSS << 3; - tss = get_desc_base((struct desc_struct *)gdt); - printk(KERN_EMERG "double fault, tss at %08lx\n", tss); - - if (ptr_ok(tss)) { - struct x86_hw_tss *t = (struct x86_hw_tss *)tss; - - printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", - t->ip, t->sp); - - printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", - t->ax, t->bx, t->cx, t->dx); - printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", - t->si, t->di); - } - } - - for (;;) - cpu_relax(); -} - -struct x86_hw_tss doublefault_tss __cacheline_aligned = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - 
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, -#ifndef CONFIG_X86_32_LAZY_GS - .gs = __KERNEL_STACK_CANARY, -#endif - - .__cr3 = __pa_nodebug(swapper_pg_dir), -}; - -/* dummy for do_double_fault() call */ -void df_debug(struct pt_regs *regs, long error_code) {} - -#else /* !CONFIG_X86_32 */ - -void df_debug(struct pt_regs *regs, long error_code) -{ - pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); - show_regs(regs); - panic("Machine halted."); -} -#endif diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c new file mode 100644 index 000000000000..3793646f0fb5 --- /dev/null +++ b/arch/x86/kernel/doublefault_32.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/init_task.h> +#include <linux/fs.h> + +#include <linux/uaccess.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> +#include <asm/traps.h> + +extern void double_fault(void); +#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) + +#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) + +static void set_df_gdt_entry(unsigned int cpu); + +/* + * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks + * we're running the doublefault task. Cannot return. + */ +asmlinkage notrace void __noreturn doublefault_shim(void) +{ + unsigned long cr2; + struct pt_regs regs; + + BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); + + cr2 = native_read_cr2(); + + /* Reset back to the normal kernel task. */ + force_reload_TR(); + set_df_gdt_entry(smp_processor_id()); + + trace_hardirqs_off(); + + /* + * Fill in pt_regs. 
A downside of doing this in C is that the unwinder + * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump + * won't successfully unwind to the source of the double fault. + * The main dump from do_double_fault() is fine, though, since it + * uses these regs directly. + * + * If anyone ever cares, this could be moved to asm. + */ + regs.ss = TSS(ss); + regs.__ssh = 0; + regs.sp = TSS(sp); + regs.flags = TSS(flags); + regs.cs = TSS(cs); + /* We won't go through the entry asm, so we can leave __csh as 0. */ + regs.__csh = 0; + regs.ip = TSS(ip); + regs.orig_ax = 0; + regs.gs = TSS(gs); + regs.__gsh = 0; + regs.fs = TSS(fs); + regs.__fsh = 0; + regs.es = TSS(es); + regs.__esh = 0; + regs.ds = TSS(ds); + regs.__dsh = 0; + regs.ax = TSS(ax); + regs.bp = TSS(bp); + regs.di = TSS(di); + regs.si = TSS(si); + regs.dx = TSS(dx); + regs.cx = TSS(cx); + regs.bx = TSS(bx); + + do_double_fault(&regs, 0, cr2); + + /* + * x86_32 does not save the original CR3 anywhere on a task switch. + * This means that, even if we wanted to return, we would need to find + * some way to reconstruct CR3. We could make a credible guess based + * on cpu_tlbstate, but that would be racy and would not account for + * PTI. + * + * Instead, don't bother. We can return through + * rewind_stack_do_exit() instead. + */ + panic("cannot return from double fault\n"); +} +NOKPROBE_SYMBOL(doublefault_shim); + +DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { + .tss = { + /* + * No sp0 or ss0 -- we never run CPL != 0 with this TSS + * active. sp is filled in later. 
+ */ + .ldt = 0, + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, + + .ip = (unsigned long) double_fault, + .flags = X86_EFLAGS_FIXED, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, +#ifndef CONFIG_X86_32_LAZY_GS + .gs = __KERNEL_STACK_CANARY, +#endif + + .__cr3 = __pa_nodebug(swapper_pg_dir), + }, +}; + +static void set_df_gdt_entry(unsigned int cpu) +{ + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, + &get_cpu_entry_area(cpu)->doublefault_stack.tss); + +} + +void doublefault_init_cpu_tss(void) +{ + unsigned int cpu = smp_processor_id(); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + /* + * The linker isn't smart enough to initialize percpu variables that + * point to other places in percpu space. + */ + this_cpu_write(doublefault_stack.tss.sp, + (unsigned long)&cea->doublefault_stack.stack + + sizeof(doublefault_stack.stack)); + + set_df_gdt_entry(cpu); +} diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 64a59d726639..8e3a8fedfa4d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_ENTRY) return "ENTRY_TRAMPOLINE"; + if (type == STACK_TYPE_EXCEPTION) + return "#DF"; + return NULL; } @@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) return true; } +static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) +{ +#ifdef CONFIG_DOUBLEFAULT + struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); + struct doublefault_stack *ss = &cea->doublefault_stack; + + void *begin = ss->stack; + void *end = begin + sizeof(ss->stack); + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_EXCEPTION; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long 
*)this_cpu_read(cpu_tss_rw.x86_tss.sp); + + return true; +#else + return false; +#endif +} + + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { @@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_softirq_stack(stack, info)) goto recursion_check; + if (in_doublefault_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0bfe9a685b3b..c5399e80c59c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type) case E820_TYPE_RAM: /* Fall through: */ case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; @@ -1048,6 +1049,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; case E820_TYPE_PMEM: return "Persistent Memory"; case E820_TYPE_RESERVED: return "Reserved"; + case E820_TYPE_SOFT_RESERVED: return "Soft Reserved"; default: return "Unknown E820 type"; } } @@ -1063,6 +1065,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) case E820_TYPE_PRAM: /* Fall-through: */ case E820_TYPE_PMEM: /* Fall-through: */ case E820_TYPE_RESERVED: /* Fall-through: */ + case E820_TYPE_SOFT_RESERVED: /* Fall-through: */ default: return IORESOURCE_MEM; } } @@ -1075,6 +1078,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; + case 
E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ @@ -1089,11 +1093,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) return true; /* - * Treat persistent memory like device memory, i.e. reserve it - * for exclusive use of a driver + * Treat persistent memory and other special memory ranges like + * device memory, i.e. reserve it for exclusive use of a driver */ switch (type) { case E820_TYPE_RESERVED: + case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; @@ -1296,6 +1301,9 @@ void __init e820__memblock_setup(void) if (end != (resource_size_t)end) continue; + if (entry->type == E820_TYPE_SOFT_RESERVED) + memblock_reserve(entry->addr, entry->size); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 108ee96f8b66..07c32d5447e4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -607,6 +607,20 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, return; /* + * If the return location is actually pointing directly to + * the start of a direct trampoline (if we trace the trampoline + * it will still be offset by MCOUNT_INSN_SIZE), then the + * return address is actually off by one word, and we + * need to adjust for that. + */ + if (ftrace_direct_func_count) { + if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { + self_addr = *parent; + parent++; + } + } + + /* * Protect against fault, even if it shouldn't * happen. This tool is too much intrusive to * ignore such a protection. 
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 6e8961ca3605..369e61faacfe 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -85,6 +85,7 @@ movq %rdi, RDI(%rsp) movq %r8, R8(%rsp) movq %r9, R9(%rsp) + movq $0, ORIG_RAX(%rsp) /* * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. @@ -111,7 +112,11 @@ subq $MCOUNT_INSN_SIZE, %rdi .endm -.macro restore_mcount_regs +.macro restore_mcount_regs save=0 + + /* ftrace_regs_caller or frame pointers require this */ + movq RBP(%rsp), %rbp + movq R9(%rsp), %r9 movq R8(%rsp), %r8 movq RDI(%rsp), %rdi @@ -120,10 +125,7 @@ movq RCX(%rsp), %rcx movq RAX(%rsp), %rax - /* ftrace_regs_caller can modify %rbp */ - movq RBP(%rsp), %rbp - - addq $MCOUNT_REG_SIZE, %rsp + addq $MCOUNT_REG_SIZE-\save, %rsp .endm @@ -174,6 +176,8 @@ SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq + UNWIND_HINT_SAVE + /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ @@ -226,7 +230,33 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq R10(%rsp), %r10 movq RBX(%rsp), %rbx - restore_mcount_regs + movq ORIG_RAX(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE-8(%rsp) + + /* If ORIG_RAX is anything but zero, make this a call to that */ + movq ORIG_RAX(%rsp), %rax + cmpq $0, %rax + je 1f + + /* Swap the flags with orig_rax */ + movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs 8 + + jmp 2f + +1: restore_mcount_regs + + +2: + /* + * The stack layout is nondetermistic here, depending on which path was + * taken. This confuses objtool and ORC, rightfully so. For now, + * pretend the stack always looks like the non-direct case. 
+ */ + UNWIND_HINT_RESTORE /* Restore flags */ popfq diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 57de2ebff7e2..5dcedad21dff 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -140,7 +140,7 @@ rootfs_initcall(pci_iommu_init); static int via_no_dac_cb(struct pci_dev *pdev, void *data) { - pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); + pdev->dev.bus_dma_limit = DMA_BIT_MASK(32); return 0; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bd2a11ca5dd6..61e93a318983 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -377,37 +377,37 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) void tss_update_io_bitmap(void) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + struct thread_struct *t = &current->thread; u16 *base = &tss->x86_tss.io_bitmap_base; - if (test_thread_flag(TIF_IO_BITMAP)) { - struct thread_struct *t = &current->thread; - - if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { - *base = IO_BITMAP_OFFSET_VALID_ALL; - } else { - struct io_bitmap *iobm = t->io_bitmap; - /* - * Only copy bitmap data when the sequence number - * differs. The update time is accounted to the - * incoming task. - */ - if (tss->io_bitmap.prev_sequence != iobm->sequence) - tss_copy_io_bitmap(tss, iobm); - - /* Enable the bitmap */ - *base = IO_BITMAP_OFFSET_VALID_MAP; - } + if (!test_thread_flag(TIF_IO_BITMAP)) { + tss_invalidate_io_bitmap(tss); + return; + } + + if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { + *base = IO_BITMAP_OFFSET_VALID_ALL; + } else { + struct io_bitmap *iobm = t->io_bitmap; + /* - * Make sure that the TSS limit is covering the io bitmap. - * It might have been cut down by a VMEXIT to 0x67 which - * would cause a subsequent I/O access from user space to - * trigger a #GP because tbe bitmap is outside the TSS - * limit. + * Only copy bitmap data when the sequence number differs. 
The + * update time is accounted to the incoming task. */ - refresh_tss_limit(); - } else { - tss_invalidate_io_bitmap(tss); + if (tss->io_bitmap.prev_sequence != iobm->sequence) + tss_copy_io_bitmap(tss, iobm); + + /* Enable the bitmap */ + *base = IO_BITMAP_OFFSET_VALID_MAP; } + + /* + * Make sure that the TSS limit is covering the IO bitmap. It might have + * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O + * access from user space to trigger a #GP because tbe bitmap is outside + * the TSS limit. + */ + refresh_tss_limit(); } #else /* CONFIG_X86_IOPL_IOPERM */ static inline void switch_to_bitmap(unsigned long tifp) { } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 066e5b01a7e0..f0e1ddbc2fd7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -182,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { + if (WARN_ON_ONCE(task == current)) + return -EIO; + /* * The value argument was already truncated to 16 bits. */ @@ -209,10 +212,7 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - if (task == current) - set_user_gs(task_pt_regs(task), value); - else - task_user_gs(task) = value; + task_user_gs(task) = value; } return 0; @@ -272,32 +272,41 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { + if (WARN_ON_ONCE(task == current)) + return -EIO; + /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; + /* + * This function has some ABI oddities. + * + * A 32-bit ptracer probably expects that writing FS or GS will change + * FSBASE or GSBASE respectively. In the absence of FSGSBASE support, + * this code indeed has that effect. 
When FSGSBASE is added, this + * will require a special case. + * + * For existing 64-bit ptracers, writing FS or GS *also* currently + * changes the base if the selector is nonzero the next time the task + * is run. This behavior may not be needed, and trying to preserve it + * when FSGSBASE is added would be complicated at best. + */ + switch (offset) { case offsetof(struct user_regs_struct,fs): task->thread.fsindex = value; - if (task == current) - loadsegment(fs, task->thread.fsindex); break; case offsetof(struct user_regs_struct,gs): task->thread.gsindex = value; - if (task == current) - load_gs_index(task->thread.gsindex); break; case offsetof(struct user_regs_struct,ds): task->thread.ds = value; - if (task == current) - loadsegment(ds, task->thread.ds); break; case offsetof(struct user_regs_struct,es): task->thread.es = value; - if (task == current) - loadsegment(es, task->thread.es); break; /* @@ -375,6 +384,9 @@ static int putreg(struct task_struct *child, * When changing the FS base, use do_arch_prctl_64() * to set the index to zero and to set the base * as requested. + * + * NB: This behavior is nonsensical and likely needs to + * change when FSGSBASE support is added. */ if (child->thread.fsbase != value) return do_arch_prctl_64(child, ARCH_SET_FS, value); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d398afd206b8..cedfe2077a69 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1138,17 +1138,15 @@ void __init setup_arch(char **cmdline_p) reserve_bios_regions(); - if (efi_enabled(EFI_MEMMAP)) { - efi_fake_memmap(); - efi_find_mirror(); - efi_esrt_init(); + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be - * called after ExitBootServices(). This is, in fact, a lie. - */ - efi_reserve_boot_services(); - } + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. 
+ */ + efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ e820__memblock_alloc_reserved_mpc_new(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3451a004e162..f19de6f45d48 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif -#ifdef CONFIG_X86_64 -/* Runs on IST stack */ +#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) +/* + * Runs on an IST stack for x86_64 and on a special task stack for x86_32. + * + * On x86_64, this is more or less a normal kernel entry. Notwithstanding the + * SDM's warnings about double faults being unrecoverable, returning works as + * expected. Presumably what the SDM actually means is that the CPU may get + * the register state wrong on entry, so returning could be a bad idea. + * + * Various CPU engineers have promised that double faults due to an IRET fault + * while the stack is read-only are, in fact, recoverable. + * + * On x86_32, this is entered through a task gate, and regs are synthesized + * from the TSS. Returning is, in principle, okay, but changes to regs will + * be lost. If, for some reason, we need to return to a context with modified + * regs, the shim code could be adjusted to synchronize the registers. + */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) { static const char str[] = "double fault"; @@ -411,15 +426,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); #endif -#ifdef CONFIG_DOUBLEFAULT - df_debug(regs, error_code); -#endif - /* - * This is always a kernel trap and never fixable (and thus must - * never return). - */ - for (;;) - die(str, regs, error_code); + pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); + die("double fault", regs, error_code); + panic("Machine halted."); } #endif |