From 025768a966a3dde8455de46d1f121a51bacb6a77 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 May 2021 14:07:53 -0700 Subject: x86/cpu: Use alternative to generate the TASK_SIZE_MAX constant We used to generate this constant with static jumps, which certainly works, but generates some quite unreadable and horrid code, and extra jumps. It's actually much simpler to just use our alternative_asm() infrastructure to generate a simple alternative constant, making the generated code much more obvious (and straight-line rather than "jump around to load the right constant"). Acked-by: Borislav Petkov Signed-off-by: Linus Torvalds Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: Ingo Molnar --- arch/x86/include/asm/page_64.h | 33 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/page_64_types.h | 23 +++-------------------- 2 files changed, 36 insertions(+), 20 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 939b1cff4a7b..ca840fec7776 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -56,6 +56,39 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_5LEVEL +/* + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] + */ +static inline unsigned long task_size_max(void) +{ + unsigned long ret; + + alternative_io("movq %[small],%0","movq %[large],%0", + X86_FEATURE_LA57, + "=r" (ret), + [small] "i" ((1ul << 47)-PAGE_SIZE), + [large] "i" ((1ul << 56)-PAGE_SIZE)); + + return ret; +} +#endif /* CONFIG_X86_5LEVEL */ + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 64297eabad63..a8d4ad856568 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -55,30 +55,13 @@ #ifdef CONFIG_X86_5LEVEL #define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47) +/* See task_size_max() in */ #else #define __VIRTUAL_MASK_SHIFT 47 +#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) #endif -/* - * User space process size. This is the first address outside the user range. - * There are a few constraints that determine this: - * - * On Intel CPUs, if a SYSCALL instruction is at the highest canonical - * address, then that syscall will enter the kernel with a - * non-canonical return address, and SYSRET will explode dangerously. - * We avoid this particular problem by preventing anything - * from being mapped at the maximum canonical address. - * - * On AMD CPUs in the Ryzen family, there's a nasty bug in which the - * CPUs malfunction if they execute code from the highest canonical page. 
- * They'll speculate right off the end of the canonical space, and - * bad things happen. This is worked around in the same way as the - * Intel problem. - * - * With page table isolation enabled, we map the LDT in ... [stay tuned] - */ -#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) - +#define TASK_SIZE_MAX task_size_max() #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm -- cgit v1.2.3 From 790d1ce71de9199bf9fd37c4743aec4a09489a51 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 22 Apr 2021 21:58:40 +0300 Subject: x86: Delete UD0, UD1 traces Both instructions aren't used by kernel. Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YIHHYNKbiSf5N7+o@localhost.localdomain --- arch/x86/include/asm/bug.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 297fa12e7e27..84b87538a15d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -7,18 +7,9 @@ /* * Despite that some emulators terminate on UD2, we use it for WARN(). - * - * Since various instruction decoders/specs disagree on the encoding of - * UD0/UD1. */ - -#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */ -#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */ #define ASM_UD2 ".byte 0x0f, 0x0b" - -#define INSN_UD0 0xff0f #define INSN_UD2 0x0b0f - #define LEN_UD2 2 #ifdef CONFIG_GENERIC_BUG -- cgit v1.2.3 From fc48a6d1faadbf08b7a840d58a5a6eb85bd1a79a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 15:56:32 -0700 Subject: x86/cpu: Remove write_tsc() and write_rdtscp_aux() wrappers Drop write_tsc() and write_rdtscp_aux(); the former has no users, and the latter has only a single user and is slightly misleading since the only in-kernel consumer of MSR_TSC_AUX is RDPID, not RDTSCP. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210504225632.1532621-3-seanjc@google.com --- arch/x86/include/asm/msr.h | 4 ---- arch/x86/kernel/cpu/common.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index e16cccdd0420..a3f87f1015d3 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -324,10 +324,6 @@ static inline int wrmsrl_safe(u32 msr, u64 val) return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); } -#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high)) - -#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0) - struct msr *msrs_alloc(void); void msrs_free(struct msr *msrs); int msr_set_bit(u32 msr, u8 bit); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490bed07fe35..a1b756c49a93 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1852,7 +1852,7 @@ static inline void setup_getcpu(int cpu) struct desc_struct d = { }; if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) - write_rdtscp_aux(cpudata); + wrmsr(MSR_TSC_AUX, cpudata, 0); /* Store CPU and node number in limit. 
*/ d.limit0 = cpudata; -- cgit v1.2.3 From a217a6593cec8b315d4c2f344bae33660b39b703 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 4 May 2021 21:50:14 +0200 Subject: KVM/VMX: Invoke NMI non-IST entry instead of IST entry In VMX, the host NMI handler needs to be invoked after NMI VM-Exit. Before commit 1a5488ef0dcf6 ("KVM: VMX: Invoke NMI handler via indirect call instead of INTn"), this was done by INTn ("int $2"). But INTn microcode is relatively expensive, so the commit reworked NMI VM-Exit handling to invoke the kernel handler by function call. But this missed a detail. The NMI entry point for direct invocation is fetched from the IDT table and called on the kernel stack. But on 64-bit the NMI entry installed in the IDT expects to be invoked on the IST stack. It relies on the "NMI executing" variable on the IST stack to work correctly, which is at a fixed position in the IST stack. When the entry point is unexpectedly called on the kernel stack, the RSP-addressed "NMI executing" variable is obviously also on the kernel stack and is "uninitialized" and can cause the NMI entry code to run in the wrong way. Provide a non-ist entry point for VMX which shares the C-function with the regular NMI entry and invoke the new asm entry point instead. On 32-bit this just maps to the regular NMI entry point as 32-bit has no ISTs and is not affected. [ tglx: Made it independent for backporting, massaged changelog ] Fixes: 1a5488ef0dcf6 ("KVM: VMX: Invoke NMI handler via indirect call instead of INTn") Signed-off-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Tested-by: Lai Jiangshan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87r1imi8i1.ffs@nanos.tec.linutronix.de --- arch/x86/include/asm/idtentry.h | 15 +++++++++++++++ arch/x86/kernel/nmi.c | 10 ++++++++++ arch/x86/kvm/vmx/vmx.c | 16 +++++++++------- 3 files changed, 34 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index e35e342673c7..73d45b0dfff2 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -588,6 +588,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); #endif /* NMI */ + +#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +/* + * Special NOIST entry point for VMX which invokes this on the kernel + * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI + * 'executing' marker. + * + * On 32bit this just uses the regular NMI entry point because 32-bit does + * not have ISTs. 
+ */ +DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist); +#else +#define asm_exc_nmi_noist asm_exc_nmi +#endif + DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); #ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index bf250a339655..2ef961cf4cfc 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -524,6 +524,16 @@ nmi_restart: mds_user_clear_cpu_buffers(); } +#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +DEFINE_IDTENTRY_RAW(exc_nmi_noist) +{ + exc_nmi(regs); +} +#endif +#if IS_MODULE(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); +#endif + void stop_nmi(void) { ignore_nmis++; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cbe0cdade38a..b21d751407b5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -6415,18 +6416,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) void vmx_do_interrupt_nmi_irqoff(unsigned long entry); -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, + unsigned long entry) { - unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; - gate_desc *desc = (gate_desc *)host_idt_base + vector; - kvm_before_interrupt(vcpu); - vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); + vmx_do_interrupt_nmi_irqoff(entry); kvm_after_interrupt(vcpu); } static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { + const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ @@ -6437,18 +6437,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ else if (is_nmi(intr_info)) - handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info); + handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { u32 intr_info = vmx_get_intr_info(vcpu); + unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; + gate_desc *desc = (gate_desc *)host_idt_base + vector; if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - handle_interrupt_nmi_irqoff(vcpu, intr_info); + handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); } static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) -- cgit v1.2.3
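
For reference on the first patch in this series: the LA57 alternative in task_size_max() simply selects one of two compile-time constants, (1 << 47) - PAGE_SIZE or (1 << 56) - PAGE_SIZE. The standalone C sketch below is an illustration only, not kernel code; it just prints both values so the two limits the alternative patches in can be seen concretely. The 4 KiB PAGE_SIZE used here is an assumption for the example, as is the EXAMPLE_PAGE_SIZE name.

	#include <stdio.h>

	/*
	 * Illustration only: compute the two TASK_SIZE_MAX values that the
	 * X86_FEATURE_LA57 alternative in task_size_max() chooses between
	 * at boot. A 4 KiB page size is assumed for this example.
	 */
	#define EXAMPLE_PAGE_SIZE (1ULL << 12)

	int main(void)
	{
		/* 4-level paging (no LA57): 47-bit user address space */
		unsigned long long four_level = (1ULL << 47) - EXAMPLE_PAGE_SIZE;
		/* 5-level paging (LA57): 56-bit user address space */
		unsigned long long five_level = (1ULL << 56) - EXAMPLE_PAGE_SIZE;

		printf("TASK_SIZE_MAX (4-level paging): %#llx\n", four_level);
		printf("TASK_SIZE_MAX (5-level paging): %#llx\n", five_level);
		return 0;
	}

Both results end one page short of the respective canonical boundary, which is exactly the constraint the comment in page_64.h describes: nothing may be mapped at the maximum canonical address.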