From 98b0bf02738004829d7e26d6cb47b2e469aaba86 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 14 Aug 2020 21:21:05 +0800 Subject: selftests: kvm: Use a shorter encoding to clear RAX If debug_regs.c is built with newer binutils, the resulting binary is "optimized" by the assembler: asm volatile("ss_start: " "xor %%rax,%%rax\n\t" "cpuid\n\t" "movl $0x1a0,%%ecx\n\t" "rdmsr\n\t" : : : "rax", "ecx"); is translated to : 000000000040194e : 40194e: 31 c0 xor %eax,%eax <----- rax->eax? 401950: 0f a2 cpuid 401952: b9 a0 01 00 00 mov $0x1a0,%ecx 401957: 0f 32 rdmsr As you can see rax is replaced with eax in target binary code. This causes a difference is the length of xor instruction (2 Byte vs 3 Byte), and makes the hard-coded instruction length check fail: /* Instruction lengths starting at ss_start */ int ss_size[4] = { 3, /* xor */ <-------- 2 or 3? 2, /* cpuid */ 5, /* mov */ 2, /* rdmsr */ }; Encode the shorter version directly and, while at it, fix the "clobbers" of the asm. Cc: stable@vger.kernel.org Signed-off-by: Yang Weijiang Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/debug_regs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index 8162c58a1234..b8d14f9db5f9 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -40,11 +40,11 @@ static void guest_code(void) /* Single step test, covers 2 basic instructions and 2 emulated */ asm volatile("ss_start: " - "xor %%rax,%%rax\n\t" + "xor %%eax,%%eax\n\t" "cpuid\n\t" "movl $0x1a0,%%ecx\n\t" "rdmsr\n\t" - : : : "rax", "ecx"); + : : : "eax", "ebx", "ecx", "edx"); /* DR6.BD test */ asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax"); -- cgit v1.2.3 From 19cf4b7eefc3040192bccb10c322e4c831bb0eb0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 17 Aug 2020 13:53:04 -0400 Subject: KVM: x86: fix access code passed to gva_to_gpa The PK bit of the error code is computed dynamically in permission_fault and therefore need not be passed to gva_to_gpa: only the access bits (fetch, user, write) need to be passed down. Not doing so causes a splat in the pku test: WARNING: CPU: 25 PID: 5465 at arch/x86/kvm/mmu.h:197 paging64_walk_addr_generic+0x594/0x750 [kvm] Hardware name: Intel Corporation WilsonCity/WilsonCity, BIOS WLYDCRB1.SYS.0014.D62.2001092233 01/09/2020 RIP: 0010:paging64_walk_addr_generic+0x594/0x750 [kvm] Code: <0f> 0b e9 db fe ff ff 44 8b 43 04 4c 89 6c 24 30 8b 13 41 39 d0 89 RSP: 0018:ff53778fc623fb60 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ff53778fc623fbf0 RCX: 0000000000000007 RDX: 0000000000000001 RSI: 0000000000000002 RDI: ff4501efba818000 RBP: 0000000000000020 R08: 0000000000000005 R09: 00000000004000e7 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000007 R13: ff4501efba818388 R14: 10000000004000e7 R15: 0000000000000000 FS: 00007f2dcf31a700(0000) GS:ff4501f1c8040000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000001dea475005 CR4: 0000000000763ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: paging64_gva_to_gpa+0x3f/0xb0 [kvm] kvm_fixup_and_inject_pf_error+0x48/0xa0 [kvm] handle_exception_nmi+0x4fc/0x5b0 [kvm_intel] kvm_arch_vcpu_ioctl_run+0x911/0x1c10 [kvm] kvm_vcpu_ioctl+0x23e/0x5d0 [kvm] ksys_ioctl+0x92/0xb0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x3e/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace d17eb998aee991da ]--- Reported-by: Sean Christopherson Fixes: 897861479c064 ("KVM: x86: Add helper functions for illegal GPA checking and page fault injection") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2db369a64f29..fcfd79a02e85 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10743,9 +10743,11 @@ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { struct x86_exception fault; + u32 access = error_code & + (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) { + vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed -- cgit v1.2.3 From 427890aff8558eb4326e723835e0eae0e6fe3102 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 17 Aug 2020 11:16:55 -0700 Subject: kvm: x86: Toggling CR4.SMAP does not load PDPTEs in PAE mode See the SDM, volume 3, section 4.4.1: If PAE paging would be in use following an execution of MOV to CR0 or MOV to CR4 (see Section 4.1.1) and the instruction is modifying any of CR0.CD, CR0.NW, CR0.PG, CR4.PAE, CR4.PGE, CR4.PSE, or CR4.SMEP; then the PDPTEs are loaded from the address in CR3. Fixes: 0be0226f07d14 ("KVM: MMU: fix SMAP virtualization") Cc: Xiao Guangrong Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Message-Id: <20200817181655.3716509-2-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fcfd79a02e85..ecc3470b279e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -975,7 +975,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; + X86_CR4_SMEP | X86_CR4_PKE; if (kvm_valid_cr4(vcpu, cr4)) return 1; -- cgit v1.2.3 From cb957adb4ea422bd758568df5b2478ea3bb34f35 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 17 Aug 2020 11:16:54 -0700 Subject: kvm: x86: Toggling CR4.PKE does not load PDPTEs in PAE mode See the SDM, volume 3, section 4.4.1: If PAE paging would be in use following an execution of MOV to CR0 or MOV to CR4 (see Section 4.1.1) and the instruction is modifying any of CR0.CD, CR0.NW, CR0.PG, CR4.PAE, CR4.PGE, CR4.PSE, or CR4.SMEP; then the PDPTEs are loaded from the address in CR3. Fixes: b9baba8614890 ("KVM, pkeys: expose CPUID/CR4 to guest") Cc: Huaitong Han Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Message-Id: <20200817181655.3716509-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ecc3470b279e..539ea1cd6020 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -975,7 +975,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_PKE; + X86_CR4_SMEP; if (kvm_valid_cr4(vcpu, cr4)) return 1; -- cgit v1.2.3 From fdfe7cbd58806522e799e2a50a15aee7f2cbb7b6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 11 Aug 2020 11:27:24 +0100 Subject: KVM: Pass MMU notifier range flags to kvm_unmap_hva_range() The 'flags' field of 'struct mmu_notifier_range' is used to indicate whether invalidate_range_{start,end}() are permitted to block. In the case of kvm_mmu_notifier_invalidate_range_start(), this field is not forwarded on to the architecture-specific implementation of kvm_unmap_hva_range() and therefore the backend cannot sensibly decide whether or not to block. Add an extra 'flags' parameter to kvm_unmap_hva_range() so that architectures are aware as to whether or not they are permitted to block. Cc: Cc: Marc Zyngier Cc: Suzuki K Poulose Cc: James Morse Signed-off-by: Will Deacon Message-Id: <20200811102725.7121-2-will@kernel.org> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/mmu.c | 2 +- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/mmu.c | 3 ++- arch/powerpc/include/asm/kvm_host.h | 3 ++- arch/powerpc/kvm/book3s.c | 3 ++- arch/powerpc/kvm/e500_mmu_host.c | 3 ++- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu/mmu.c | 3 ++- virt/kvm/kvm_main.c | 3 ++- 10 files changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 65568b23868a..e52c927aade5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -473,7 +473,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, unsigned flags); int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0121ef2c7c8d..dc351802ff18 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2213,7 +2213,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat } int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, unsigned flags) { if (!kvm->arch.mmu.pgd) return 0; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index d35eaed1668f..825d337a505a 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -969,7 +969,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, unsigned flags); int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 87fa8d8a1031..28c366d307e7 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -486,7 +486,8 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, return 1; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, + unsigned flags) { handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e020d269416d..10ded83414de 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -58,7 +58,8 @@ #define KVM_ARCH_WANT_MMU_NOTIFIER extern int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + unsigned flags); extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 41fedec69ac3..49db50d1db04 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -834,7 +834,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change); } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, + unsigned flags) { return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); } diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index d6c1069e9954..ed0c9c43d0cf 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -734,7 +734,8 @@ static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) return 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, + unsigned flags) { /* kvm_unmap_hva flushes everything anyways */ kvm_unmap_hva(kvm, start); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5ab3af7275d8..5303dbc5c9bc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1596,7 +1596,8 @@ asmlinkage void kvm_spurious_fault(void); _ASM_EXTABLE(666b, 667b) #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, + unsigned flags); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4e03841f053d..a5d0207e7189 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1916,7 +1916,8 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, + unsigned flags) { return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2c2c0254c2d8..4eaa4e46c7d0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -482,7 +482,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * count is also read inside the mmu_lock critical section. */ kvm->mmu_notifier_count++; - need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end); + need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, + range->flags); need_tlb_flush |= kvm->tlbs_dirty; /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) -- cgit v1.2.3 From b5331379bc62611d1026173a09c73573384201d9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 11 Aug 2020 11:27:25 +0100 Subject: KVM: arm64: Only reschedule if MMU_NOTIFIER_RANGE_BLOCKABLE is not set When an MMU notifier call results in unmapping a range that spans multiple PGDs, we end up calling into cond_resched_lock() when crossing a PGD boundary, since this avoids running into RCU stalls during VM teardown. Unfortunately, if the VM is destroyed as a result of OOM, then blocking is not permitted and the call to the scheduler triggers the following BUG(): | BUG: sleeping function called from invalid context at arch/arm64/kvm/mmu.c:394 | in_atomic(): 1, irqs_disabled(): 0, non_block: 1, pid: 36, name: oom_reaper | INFO: lockdep is turned off. | CPU: 3 PID: 36 Comm: oom_reaper Not tainted 5.8.0 #1 | Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 | Call trace: | dump_backtrace+0x0/0x284 | show_stack+0x1c/0x28 | dump_stack+0xf0/0x1a4 | ___might_sleep+0x2bc/0x2cc | unmap_stage2_range+0x160/0x1ac | kvm_unmap_hva_range+0x1a0/0x1c8 | kvm_mmu_notifier_invalidate_range_start+0x8c/0xf8 | __mmu_notifier_invalidate_range_start+0x218/0x31c | mmu_notifier_invalidate_range_start_nonblock+0x78/0xb0 | __oom_reap_task_mm+0x128/0x268 | oom_reap_task+0xac/0x298 | oom_reaper+0x178/0x17c | kthread+0x1e4/0x1fc | ret_from_fork+0x10/0x30 Use the new 'flags' argument to kvm_unmap_hva_range() to ensure that we only reschedule if MMU_NOTIFIER_RANGE_BLOCKABLE is set in the notifier flags. Cc: Fixes: 8b3405e345b5 ("kvm: arm/arm64: Fix locking for kvm_free_stage2_pgd") Cc: Marc Zyngier Cc: Suzuki K Poulose Cc: James Morse Signed-off-by: Will Deacon Message-Id: <20200811102725.7121-3-will@kernel.org> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/mmu.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index dc351802ff18..ba00bcc0c884 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -343,7 +343,8 @@ static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, * destroying the VM), otherwise another faulting VCPU may come in and mess * with things behind our backs. */ -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, + bool may_block) { struct kvm *kvm = mmu->kvm; pgd_t *pgd; @@ -369,11 +370,16 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si * If the range is too large, release the kvm->mmu_lock * to prevent starvation and lockup detector warnings. */ - if (next != end) + if (may_block && next != end) cond_resched_lock(&kvm->mmu_lock); } while (pgd++, addr = next, addr != end); } +static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +{ + __unmap_stage2_range(mmu, start, size, true); +} + static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr, phys_addr_t end) { @@ -2208,7 +2214,10 @@ static int handle_hva_to_gpa(struct kvm *kvm, static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - unmap_stage2_range(&kvm->arch.mmu, gpa, size); + unsigned flags = *(unsigned *)data; + bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE; + + __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block); return 0; } @@ -2219,7 +2228,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, return 0; trace_kvm_unmap_hva_range(start, end); - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); + handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags); return 0; } -- cgit v1.2.3