From e504e74cc3a2c092b05577ce3e8e013fae7d94e6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 5 Feb 2021 08:24:02 -0600 Subject: x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2 KASAN reserves "redzone" areas between stack frames in order to detect stack overruns. A read or write to such an area triggers a KASAN "stack-out-of-bounds" BUG. Normally, the ORC unwinder stays in-bounds and doesn't access the redzone. But sometimes it can't find ORC metadata for a given instruction. This can happen for code which is missing ORC metadata, or for generated code. In such cases, the unwinder attempts to fall back to frame pointers, as a best-effort type thing. This fallback often works, but when it doesn't, the unwinder can get confused and go off into the weeds into the KASAN redzone, triggering the aforementioned KASAN BUG. But in this case, the unwinder's confusion is actually harmless and working as designed. It already has checks in place to prevent off-stack accesses, but those checks get short-circuited by the KASAN BUG. And a BUG is a lot more disruptive than a harmless unwinder warning. Disable the KASAN checks by using READ_ONCE_NOCHECK() for all stack accesses. This finishes the job started by commit 881125bfe65b ("x86/unwind: Disable KASAN checking in the ORC unwinder"), which only partially fixed the issue. Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Reported-by: Ivan Babrou Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Steven Rostedt (VMware) Tested-by: Ivan Babrou Cc: stable@kernel.org Link: https://lkml.kernel.org/r/9583327904ebbbeda399eca9c56d6c7085ac20fe.1612534649.git.jpoimboe@redhat.com --- arch/x86/kernel/unwind_orc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 2a1d47f47eee..1bcc14c870ab 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off, return false; if (state->full_regs) { - *val = ((unsigned long *)state->regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); return true; } if (state->prev_regs) { - *val = ((unsigned long *)state->prev_regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); return true; } -- cgit v1.2.3 From b59cc97674c947861783ca92b9a6e7d043adba96 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 5 Feb 2021 08:24:03 -0600 Subject: x86/unwind/orc: Silence warnings caused by missing ORC data The ORC unwinder attempts to fall back to frame pointers when ORC data is missing for a given instruction. It sets state->error, but then tries to keep going as a best-effort type of thing. That may result in further warnings if the unwinder gets lost. Until we have some way to register generated code with the unwinder, missing ORC will be expected, and occasionally going off the rails will also be expected. So don't warn about it. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Tested-by: Ivan Babrou Link: https://lkml.kernel.org/r/06d02c4bbb220bd31668db579278b0352538efbb.1612534649.git.jpoimboe@redhat.com --- arch/x86/kernel/unwind_orc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 1bcc14c870ab..a1202536fc57 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -13,7 +13,7 @@ #define orc_warn_current(args...) \ ({ \ - if (state->task == current) \ + if (state->task == current && !state->error) \ orc_warn(args); \ }) -- cgit v1.2.3 From 78a81d88f60ba773cbe890205e1ee67f00502948 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 3 Mar 2021 15:17:12 +0100 Subject: x86/sev-es: Introduce ip_within_syscall_gap() helper Introduce a helper to check whether an exception came from the syscall gap and use it in the SEV-ES code. Extend the check to also cover the compatibility SYSCALL entry path. Fixes: 315562c9af3d5 ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler") Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org # 5.10+ Link: https://lkml.kernel.org/r/20210303141716.29223-2-joro@8bytes.org --- arch/x86/entry/entry_64_compat.S | 2 ++ arch/x86/include/asm/proto.h | 1 + arch/x86/include/asm/ptrace.h | 15 +++++++++++++++ arch/x86/kernel/traps.c | 3 +-- 4 files changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 541fdaf64045..0051cf5c792d 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat) /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 2c35f1c01a2d..b6a9d51d1d79 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(void); void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); +void entry_SYSCALL_compat_safe_stack(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d8324a236696..409f661481e1 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -94,6 +94,8 @@ struct pt_regs { #include #endif +#include + struct cpuinfo_x86; struct task_struct; @@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp + +static inline bool ip_within_syscall_gap(struct pt_regs *regs) +{ + bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + +#ifdef CONFIG_IA32_EMULATION + ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && + regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); +#endif + + return ret; +} #endif static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7f5aec758f0e..ac1874a2a70e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ - if (regs->ip >= (unsigned long)entry_SYSCALL_64 && - regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { + if (ip_within_syscall_gap(regs)) { sp = this_cpu_read(cpu_current_top_of_stack); goto sync; } -- cgit v1.2.3 From 545ac14c16b5dbd909d5a90ddf5b5a629a40fa94 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 3 Mar 2021 15:17:13 +0100 Subject: x86/sev-es: Check regs->sp is trusted before adjusting #VC IST stack The code in the NMI handler to adjust the #VC handler IST stack is needed in case an NMI hits when the #VC handler is still using its IST stack. But the check for this condition also needs to look if the regs->sp value is trusted, meaning it was not set by user-space. Extend the check to not use regs->sp when the NMI interrupted user-space code or the SYSCALL gap. Fixes: 315562c9af3d5 ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler") Reported-by: Andy Lutomirski Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org # 5.10+ Link: https://lkml.kernel.org/r/20210303141716.29223-3-joro@8bytes.org --- arch/x86/kernel/sev-es.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 84c1821819af..301f20f6d4dd 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int cpu) cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); } -static __always_inline bool on_vc_stack(unsigned long sp) +static __always_inline bool on_vc_stack(struct pt_regs *regs) { + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); } @@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct pt_regs *regs) old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); /* Make room on the IST stack */ - if (on_vc_stack(regs->sp)) + if (on_vc_stack(regs)) new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); else new_ist = old_ist - sizeof(old_ist); -- cgit v1.2.3 From 62441a1fb53263bda349b6e5997c3cc5c120d89e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 3 Mar 2021 15:17:15 +0100 Subject: x86/sev-es: Correctly track IRQ states in runtime #VC handler Call irqentry_nmi_enter()/irqentry_nmi_exit() in the #VC handler to correctly track the IRQ state during its execution. Fixes: 0786138c78e79 ("x86/sev-es: Add a Runtime #VC Exception Handler") Reported-by: Andy Lutomirski Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org # v5.10+ Link: https://lkml.kernel.org/r/20210303141716.29223-5-joro@8bytes.org --- arch/x86/kernel/sev-es.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 301f20f6d4dd..c3fd8fa79838 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -1258,13 +1258,12 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) { struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; enum es_result result; struct ghcb *ghcb; - lockdep_assert_irqs_disabled(); - /* * Handle #DB before calling into !noinstr code to avoid recursive #DB. */ @@ -1273,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) return; } + irq_state = irqentry_nmi_enter(regs); + lockdep_assert_irqs_disabled(); instrumentation_begin(); /* @@ -1335,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) out: instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); return; -- cgit v1.2.3 From bffe30dd9f1f3b2608a87ac909a224d6be472485 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 3 Mar 2021 15:17:16 +0100 Subject: x86/sev-es: Use __copy_from_user_inatomic() The #VC handler must run in atomic context and cannot sleep. This is a problem when it tries to fetch instruction bytes from user-space via copy_from_user(). Introduce a insn_fetch_from_user_inatomic() helper which uses __copy_from_user_inatomic() to safely copy the instruction bytes to kernel memory in the #VC handler. Fixes: 5e3427a7bc432 ("x86/sev-es: Handle instruction fetches from user-space") Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org # v5.10+ Link: https://lkml.kernel.org/r/20210303141716.29223-6-joro@8bytes.org --- arch/x86/include/asm/insn-eval.h | 2 ++ arch/x86/kernel/sev-es.c | 2 +- arch/x86/lib/insn-eval.c | 66 +++++++++++++++++++++++++++++++--------- 3 files changed, 55 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index a0f839aa144d..98b4dae5e8bc 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); +int insn_fetch_from_user_inatomic(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); bool insn_decode(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index c3fd8fa79838..04a780abb512 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -258,7 +258,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) int res; if (user_mode(ctxt->regs)) { - res = insn_fetch_from_user(ctxt->regs, buffer); + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); if (!res) { ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4229950a5d78..bb0b3fe1e0a0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } +static unsigned long insn_get_effective_ip(struct pt_regs *regs) +{ + unsigned long seg_base = 0; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) { + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + if (seg_base == -1L) + return 0; + } + + return seg_base + regs->ip; +} + /** * insn_fetch_from_user() - Copy instruction bytes from user-space memory * @regs: Structure with register values as seen when entering kernel mode @@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) */ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { - unsigned long seg_base = 0; + unsigned long ip; int not_copied; - /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) { - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - if (seg_base == -1L) - return 0; - } + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; + + not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); + return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory + * while in atomic code + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. This function must be used in atomic context. + * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. + */ +int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ + unsigned long ip; + int not_copied; + + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - MAX_INSN_SIZE); + not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); return MAX_INSN_SIZE - not_copied; } -- cgit v1.2.3 From d7eb79c6290c7ae4561418544072e0a3266e7384 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 24 Feb 2021 09:37:29 +0800 Subject: KVM: kvmclock: Fix vCPUs > 64 can't be online/hotpluged # lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 88 On-line CPU(s) list: 0-63 Off-line CPU(s) list: 64-87 # cat /proc/cmdline BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 no-kvmclock-vsyscall # echo 1 > /sys/devices/system/cpu/cpu76/online -bash: echo: write error: Cannot allocate memory The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall and sometimes you end up with vsyscall disabled if the host does something strange. This patch fixes it by allocating this dynamically memory unconditionally even if vsyscall is disabled. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Cc: Brijesh Singh Cc: stable@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li Message-Id: <1614130683-24137-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa593743acf6..1fc0962c89c0 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; + kvmclock_init_mem(); - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 0; + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); -- cgit v1.2.3