author    | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-23 11:17:56 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-23 11:17:56 -0700
commit    | f9a705ad1c077ec2872c641f0db9c0d5b4a097bb (patch)
tree      | 7f5d18d74f700be5bcf72ec5f4955f016eac9ab9 /arch
parent    | 9313f8026328d0309d093f6774be4b8f5340c0e5 (diff)
parent    | 29cf0f5007a215b51feb0ae25ca5353480d53ead (diff)
download  | linux-f9a705ad1c077ec2872c641f0db9c0d5b4a097bb.tar.bz2
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
"For x86, there is a new alternative and (in the future) more scalable
implementation of extended page tables that does not need a reverse
map from guest physical addresses to host physical addresses.
For now it is disabled by default because it still lacks a few of the
existing MMU's bells and whistles. However, it is a very solid piece
of work and is already available for people to hammer on.
Other updates:
ARM:
- New page table code for both hypervisor and guest stage-2
- Introduction of a new EL2-private host context
- Allow EL2 to have its own private per-CPU variables
- Support of PMU event filtering
- Complete rework of the Spectre mitigation
PPC:
- Fix for running nested guests with in-kernel IRQ chip
- Fix race condition causing occasional host hard lockup
- Minor cleanups and bugfixes
x86:
- allow trapping unknown MSRs to userspace
- allow userspace to force #GP on specific MSRs
- INVPCID support on AMD
- nested AMD cleanup, on demand allocation of nested SVM state
- hide PV MSRs and hypercalls for features not enabled in CPUID
- new test for MSR_IA32_TSC writes from host and guest
- cleanups: MMU, CPUID, shared MSRs
- LAPIC latency optimizations and bugfixes"
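The ARM "PMU event filtering" item above is exposed to userspace through the new KVM_ARM_VCPU_PMU_V3_FILTER vcpu attribute and struct kvm_pmu_event_filter, both visible in the uapi hunk further down. A minimal, hedged sketch of how a VMM might program it; the event number and the "registering an ALLOW filter first flips the default action to deny" behaviour are assumptions to be checked against Documentation/virt/kvm/devices/vcpu.rst:

```c
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Hypothetical helper: restrict a vcpu's PMU to the ARMv8 CPU_CYCLES event
 * (common event 0x11). Assumes that installing an ALLOW filter first makes
 * the default action for all other events "deny".
 */
static int pmu_allow_cycles_only(int vcpu_fd)
{
	struct kvm_pmu_event_filter filter = {
		.base_event = 0x11,	/* ARMv8 PMU common event: CPU_CYCLES */
		.nevents    = 1,
		.action     = KVM_PMU_EVENT_ALLOW,
	};
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_FILTER,
		.addr  = (uint64_t)(unsigned long)&filter,
	};

	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}
```

The attribute is set per vcpu, like the existing KVM_ARM_VCPU_PMU_V3_IRQ and _INIT attributes, while the bitmap backing the filter is allocated VM-wide (see the pmu_filter field added to struct kvm_arch in the diff below).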
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (232 commits)
kvm: x86/mmu: NX largepage recovery for TDP MMU
kvm: x86/mmu: Don't clear write flooding count for direct roots
kvm: x86/mmu: Support MMIO in the TDP MMU
kvm: x86/mmu: Support write protection for nesting in tdp MMU
kvm: x86/mmu: Support disabling dirty logging for the tdp MMU
kvm: x86/mmu: Support dirty logging for the TDP MMU
kvm: x86/mmu: Support changed pte notifier in tdp MMU
kvm: x86/mmu: Add access tracking for tdp_mmu
kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU
kvm: x86/mmu: Allocate struct kvm_mmu_pages for all pages in TDP MMU
kvm: x86/mmu: Add TDP MMU PF handler
kvm: x86/mmu: Remove disallowed_hugepage_adjust shadow_walk_iterator arg
kvm: x86/mmu: Support zapping SPTEs in the TDP MMU
KVM: Cache as_id in kvm_memory_slot
kvm: x86/mmu: Add functions to handle changed TDP SPTEs
kvm: x86/mmu: Allocate and free TDP MMU roots
kvm: x86/mmu: Init / Uninit the TDP MMU
kvm: x86/mmu: Introduce tdp_iter
KVM: mmu: extract spte.h and spte.c
KVM: mmu: Separate updating a PTE from kvm_set_pte_rmapp
...
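Two of the x86 items in the message above ("allow trapping unknown MSRs to userspace" and "allow userspace to force #GP on specific MSRs") are driven by a new VM-level MSR filter. The sketch below shows the general shape only; the ioctl and field names (KVM_X86_SET_MSR_FILTER, struct kvm_msr_filter, KVM_MSR_FILTER_WRITE, KVM_MSR_FILTER_DEFAULT_ALLOW) are from my recollection of the 5.10 uapi — the x86 hunks are not part of the excerpt shown below — so verify against Documentation/virt/kvm/api.rst:

```c
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Hedged sketch: deny guest writes to one MSR index while keeping the
 * default allow policy for everything else. Whether a denied access
 * injects #GP or exits to userspace depends on the separate userspace
 * MSR handling capability (KVM_CAP_X86_USER_SPACE_MSR), also assumed.
 */
static int deny_writes_to_msr(int vm_fd, uint32_t msr_index)
{
	uint8_t bitmap = 0;		/* assumption: a clear bit denies the access */
	struct kvm_msr_filter filter = {
		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
	};

	filter.ranges[0] = (struct kvm_msr_filter_range){
		.flags  = KVM_MSR_FILTER_WRITE,	/* filter writes only */
		.base   = msr_index,
		.nmsrs  = 1,
		.bitmap = &bitmap,
	};

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}
```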
Diffstat (limited to 'arch')
104 files changed, 7575 insertions, 4791 deletions
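Most of the arm64 churn in this diffstat comes from the new generic page-table code (asm/kvm_pgtable.h and hyp/pgtable.c). The central abstraction is the walker interface declared in that header: a callback plus flags passed to kvm_pgtable_walk(). Below is a minimal sketch of a walker built only from the declarations added in the header hunk further down; the callback body (treating bit 0 of a kvm_pte_t as the valid bit) is an illustrative assumption, not something that header defines:

```c
#include <asm/kvm_pgtable.h>

/* Illustrative visitor: count valid leaf entries in an address range. */
static int count_valid_leaves_cb(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
				 enum kvm_pgtable_walk_flags flag,
				 void * const arg)
{
	u64 *count = arg;

	if (*ptep & BIT(0))	/* assumption: bit 0 is the valid bit, as in the VMSA */
		(*count)++;

	return 0;		/* a negative return would abort the walk with that error */
}

static u64 count_valid_leaves(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 count = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= count_valid_leaves_cb,
		.arg	= &count,
		.flags	= KVM_PGTABLE_WALK_LEAF,
	};

	kvm_pgtable_walk(pgt, addr, size, &walker);
	return count;
}
```

KVM_PGTABLE_WALK_LEAF visits leaf entries including invalid ones (per the kernel-doc in the header), which is why the callback checks validity itself rather than assuming every visited entry is mapped.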
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 54d181177656..ddbe6bf00e33 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -219,6 +219,23 @@ lr .req x30 // link register .endm /* + * @dst: destination register + */ +#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) + .macro this_cpu_offset, dst + mrs \dst, tpidr_el2 + .endm +#else + .macro this_cpu_offset, dst +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + mrs \dst, tpidr_el1 +alternative_else + mrs \dst, tpidr_el2 +alternative_endif + .endm +#endif + + /* * @dst: Result of per_cpu(sym, smp_processor_id()) (can be SP) * @sym: The name of the per-cpu variable * @tmp: scratch register @@ -226,11 +243,7 @@ lr .req x30 // link register .macro adr_this_cpu, dst, sym, tmp adrp \tmp, \sym add \dst, \tmp, #:lo12:\sym -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs \tmp, tpidr_el1 -alternative_else - mrs \tmp, tpidr_el2 -alternative_endif + this_cpu_offset \tmp add \dst, \dst, \tmp .endm @@ -241,11 +254,7 @@ alternative_endif */ .macro ldr_this_cpu dst, sym, tmp adr_l \dst, \sym -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs \tmp, tpidr_el1 -alternative_else - mrs \tmp, tpidr_el2 -alternative_endif + this_cpu_offset \tmp ldr \dst, [\dst, \tmp] .endm diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h new file mode 100644 index 000000000000..daa1a1da539e --- /dev/null +++ b/arch/arm64/include/asm/hyp_image.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + * Written by David Brazdil <dbrazdil@google.com> + */ + +#ifndef __ARM64_HYP_IMAGE_H__ +#define __ARM64_HYP_IMAGE_H__ + +/* + * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_, + * to separate it from the kernel proper. + */ +#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym + +#ifdef LINKER_SCRIPT + +/* + * KVM nVHE ELF section names are prefixed with .hyp, to separate them + * from the kernel proper. + */ +#define HYP_SECTION_NAME(NAME) .hyp##NAME + +/* Defines an ELF hyp section from input section @NAME and its subsections. */ +#define HYP_SECTION(NAME) \ + HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) } + +/* + * Defines a linker script alias of a kernel-proper symbol referenced by + * KVM nVHE hyp code. 
+ */ +#define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym; + +#endif /* LINKER_SCRIPT */ + +#endif /* __ARM64_HYP_IMAGE_H__ */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7f7072f6cb45..54387ccd1ab2 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -7,6 +7,7 @@ #ifndef __ARM_KVM_ASM_H__ #define __ARM_KVM_ASM_H__ +#include <asm/hyp_image.h> #include <asm/virt.h> #define ARM_EXIT_WITH_SERROR_BIT 31 @@ -35,17 +36,34 @@ #define __SMCCC_WORKAROUND_1_SMC_SZ 36 +#define KVM_HOST_SMCCC_ID(id) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + (id)) + +#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name) + +#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0 +#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1 +#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 +#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 +#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 +#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 + #ifndef __ASSEMBLY__ #include <linux/mm.h> -/* - * Translate name of a symbol defined in nVHE hyp to the name seen - * by kernel proper. All nVHE symbols are prefixed by the build system - * to avoid clashes with the VHE variants. - */ -#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym - #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] #define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[] @@ -57,10 +75,53 @@ DECLARE_KVM_VHE_SYM(sym); \ DECLARE_KVM_NVHE_SYM(sym) +#define DECLARE_KVM_VHE_PER_CPU(type, sym) \ + DECLARE_PER_CPU(type, sym) +#define DECLARE_KVM_NVHE_PER_CPU(type, sym) \ + DECLARE_PER_CPU(type, kvm_nvhe_sym(sym)) + +#define DECLARE_KVM_HYP_PER_CPU(type, sym) \ + DECLARE_KVM_VHE_PER_CPU(type, sym); \ + DECLARE_KVM_NVHE_PER_CPU(type, sym) + +/* + * Compute pointer to a symbol defined in nVHE percpu region. + * Returns NULL if percpu memory has not been allocated yet. + */ +#define this_cpu_ptr_nvhe_sym(sym) per_cpu_ptr_nvhe_sym(sym, smp_processor_id()) +#define per_cpu_ptr_nvhe_sym(sym, cpu) \ + ({ \ + unsigned long base, off; \ + base = kvm_arm_hyp_percpu_base[cpu]; \ + off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \ + (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \ + base ? 
(typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \ + }) + +#if defined(__KVM_NVHE_HYPERVISOR__) + +#define CHOOSE_NVHE_SYM(sym) sym +#define CHOOSE_HYP_SYM(sym) CHOOSE_NVHE_SYM(sym) + +/* The nVHE hypervisor shouldn't even try to access VHE symbols */ +extern void *__nvhe_undefined_symbol; +#define CHOOSE_VHE_SYM(sym) __nvhe_undefined_symbol +#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol) +#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol) + +#elif defined(__KVM_VHE_HYPERVISOR__) + #define CHOOSE_VHE_SYM(sym) sym -#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym) +#define CHOOSE_HYP_SYM(sym) CHOOSE_VHE_SYM(sym) + +/* The VHE hypervisor shouldn't even try to access nVHE symbols */ +extern void *__vhe_undefined_symbol; +#define CHOOSE_NVHE_SYM(sym) __vhe_undefined_symbol +#define this_cpu_ptr_hyp_sym(sym) (&__vhe_undefined_symbol) +#define per_cpu_ptr_hyp_sym(sym, cpu) (&__vhe_undefined_symbol) + +#else -#ifndef __KVM_NVHE_HYPERVISOR__ /* * BIG FAT WARNINGS: * @@ -72,12 +133,21 @@ * - Don't let the nVHE hypervisor have access to this, as it will * pick the *wrong* symbol (yes, it runs at EL2...). */ -#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \ +#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() \ + ? CHOOSE_VHE_SYM(sym) \ : CHOOSE_NVHE_SYM(sym)) -#else -/* The nVHE hypervisor shouldn't even try to access anything */ -extern void *__nvhe_undefined_symbol; -#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol + +#define this_cpu_ptr_hyp_sym(sym) (is_kernel_in_hyp_mode() \ + ? this_cpu_ptr(&sym) \ + : this_cpu_ptr_nvhe_sym(sym)) + +#define per_cpu_ptr_hyp_sym(sym, cpu) (is_kernel_in_hyp_mode() \ + ? per_cpu_ptr(&sym, cpu) \ + : per_cpu_ptr_nvhe_sym(sym, cpu)) + +#define CHOOSE_VHE_SYM(sym) sym +#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym) + #endif /* Translate a kernel address @ptr into its equivalent linear mapping */ @@ -95,10 +165,16 @@ struct kvm_vcpu; struct kvm_s2_mmu; DECLARE_KVM_NVHE_SYM(__kvm_hyp_init); +DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector); DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) +#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) +extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +DECLARE_KVM_NVHE_SYM(__per_cpu_start); +DECLARE_KVM_NVHE_SYM(__per_cpu_end); + extern atomic_t arm64_el2_vector_last_slot; DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) @@ -144,26 +220,6 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; addr; \ }) -/* - * Home-grown __this_cpu_{ptr,read} variants that always work at HYP, - * provided that sym is really a *symbol* and not a pointer obtained from - * a data structure. As for SHIFT_PERCPU_PTR(), the creative casting keeps - * sparse quiet. 
- */ -#define __hyp_this_cpu_ptr(sym) \ - ({ \ - void *__ptr; \ - __verify_pcpu_ptr(&sym); \ - __ptr = hyp_symbol_addr(sym); \ - __ptr += read_sysreg(tpidr_el2); \ - (typeof(sym) __kernel __force *)__ptr; \ - }) - -#define __hyp_this_cpu_read(sym) \ - ({ \ - *__hyp_this_cpu_ptr(sym); \ - }) - #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ " .align 3\n" \ @@ -194,20 +250,8 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; #else /* __ASSEMBLY__ */ -.macro hyp_adr_this_cpu reg, sym, tmp - adr_l \reg, \sym - mrs \tmp, tpidr_el2 - add \reg, \reg, \tmp -.endm - -.macro hyp_ldr_this_cpu reg, sym, tmp - adr_l \reg, \sym - mrs \tmp, tpidr_el2 - ldr \reg, [\reg, \tmp] -.endm - .macro get_host_ctxt reg, tmp - hyp_adr_this_cpu \reg, kvm_host_data, \tmp + adr_this_cpu \reg, kvm_host_data, \tmp add \reg, \reg, #HOST_DATA_CONTEXT .endm @@ -216,6 +260,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] .endm +.macro get_loaded_vcpu vcpu, ctxt + adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu + ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] +.endm + +.macro set_loaded_vcpu vcpu, ctxt, tmp + adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp + str \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] +.endm + /* * KVM extable for unexpected exceptions. * In the same format _asm_extable, but output to a different section so that @@ -231,6 +285,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; .popsection .endm +#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x) +#define CPU_LR_OFFSET CPU_XREG_OFFSET(30) +#define CPU_SP_EL0_OFFSET (CPU_LR_OFFSET + 8) + +/* + * We treat x18 as callee-saved as the host may use it as a platform + * register (e.g. for shadow call stack). + */ +.macro save_callee_saved_regs ctxt + str x18, [\ctxt, #CPU_XREG_OFFSET(18)] + stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] + stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] + stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] + stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] + stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] + stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] +.endm + +.macro restore_callee_saved_regs ctxt + // We require \ctxt is not x18-x28 + ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)] + ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] + ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] + ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] + ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] + ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] + ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] +.endm + +.macro save_sp_el0 ctxt, tmp + mrs \tmp, sp_el0 + str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] +.endm + +.macro restore_sp_el0 ctxt, tmp + ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] + msr sp_el0, \tmp +.endm + #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bb5e5b88d439..0aecbab6a7fb 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -11,6 +11,7 @@ #ifndef __ARM64_KVM_HOST_H__ #define __ARM64_KVM_HOST_H__ +#include <linux/arm-smccc.h> #include <linux/bitmap.h> #include <linux/types.h> #include <linux/jump_label.h> @@ -79,8 +80,8 @@ struct kvm_s2_mmu { * for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the * canonical stage-2 page tables. */ - pgd_t *pgd; phys_addr_t pgd_phys; + struct kvm_pgtable *pgt; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -110,6 +111,13 @@ struct kvm_arch { * supported. 
*/ bool return_nisv_io_abort_to_user; + + /* + * VM-wide PMU filter, implemented as a bitmap and big enough for + * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+). + */ + unsigned long *pmu_filter; + unsigned int pmuver; }; struct kvm_vcpu_fault_info { @@ -262,8 +270,6 @@ struct kvm_host_data { struct kvm_pmu_events pmu_events; }; -typedef struct kvm_host_data kvm_host_data_t; - struct vcpu_reset_state { unsigned long pc; unsigned long r0; @@ -480,18 +486,15 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -u64 __kvm_call_hyp(void *hypfn, ...); - -#define kvm_call_hyp_nvhe(f, ...) \ - do { \ - DECLARE_KVM_NVHE_SYM(f); \ - __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \ - } while(0) - -#define kvm_call_hyp_nvhe_ret(f, ...) \ +#define kvm_call_hyp_nvhe(f, ...) \ ({ \ - DECLARE_KVM_NVHE_SYM(f); \ - __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \ + struct arm_smccc_res res; \ + \ + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \ + ##__VA_ARGS__, &res); \ + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \ + \ + res.a1; \ }) /* @@ -517,7 +520,7 @@ u64 __kvm_call_hyp(void *hypfn, ...); ret = f(__VA_ARGS__); \ isb(); \ } else { \ - ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__); \ + ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ \ ret; \ @@ -565,7 +568,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data); +DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data); static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 46689e7db46c..6b664de5ec1f 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -12,6 +12,9 @@ #include <asm/alternative.h> #include <asm/sysreg.h> +DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); + #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ @@ -87,11 +90,11 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu); void deactivate_traps_vhe_put(void); #endif -u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); +u64 __guest_enter(struct kvm_vcpu *vcpu); -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt); +void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ -void __noreturn __hyp_do_panic(unsigned long, ...); +void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); #endif #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index cff1cebc7590..331394306cce 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -44,16 +44,6 @@ * HYP_VA_MIN = 1 << (VA_BITS - 1) * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1 * - * This of course assumes that the trampoline page exists within the - * VA_BITS range. If it doesn't, then it means we're in the odd case - * where the kernel idmap (as well as HYP) uses more levels than the - * kernel runtime page tables (as seen when the kernel is configured - * for 4k pages, 39bits VA, and yet memory lives just above that - * limit, forcing the idmap to use 4 levels of page tables while the - * kernel itself only uses 3). 
In this particular case, it doesn't - * matter which side of VA_BITS we use, as we're guaranteed not to - * conflict with anything. - * * When using VHE, there are no separate hyp mappings and all KVM * functionality is already mapped as part of the main kernel * mappings, and none of this applies in that case. @@ -118,15 +108,10 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) #define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) -static inline bool kvm_page_empty(void *ptr) -{ - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; -} - +#include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> -int create_hyp_mappings(void *from, void *to, pgprot_t prot); +int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, void __iomem **haddr); @@ -142,149 +127,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); - phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); -void kvm_clear_hyp_idmap(void); - -#define kvm_mk_pmd(ptep) \ - __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) -#define kvm_mk_pud(pmdp) \ - __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE) -#define kvm_mk_p4d(pmdp) \ - __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE) - -#define kvm_set_pud(pudp, pud) set_pud(pudp, pud) - -#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) -#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) -#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot) - -#define kvm_pud_pfn(pud) pud_pfn(pud) - -#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) -#define kvm_pud_mkhuge(pud) pud_mkhuge(pud) - -static inline pte_t kvm_s2pte_mkwrite(pte_t pte) -{ - pte_val(pte) |= PTE_S2_RDWR; - return pte; -} - -static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) -{ - pmd_val(pmd) |= PMD_S2_RDWR; - return pmd; -} - -static inline pud_t kvm_s2pud_mkwrite(pud_t pud) -{ - pud_val(pud) |= PUD_S2_RDWR; - return pud; -} - -static inline pte_t kvm_s2pte_mkexec(pte_t pte) -{ - pte_val(pte) &= ~PTE_S2_XN; - return pte; -} - -static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) -{ - pmd_val(pmd) &= ~PMD_S2_XN; - return pmd; -} - -static inline pud_t kvm_s2pud_mkexec(pud_t pud) -{ - pud_val(pud) &= ~PUD_S2_XN; - return pud; -} - -static inline void kvm_set_s2pte_readonly(pte_t *ptep) -{ - pteval_t old_pteval, pteval; - - pteval = READ_ONCE(pte_val(*ptep)); - do { - old_pteval = pteval; - pteval &= ~PTE_S2_RDWR; - pteval |= PTE_S2_RDONLY; - pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); - } while (pteval != old_pteval); -} - -static inline bool kvm_s2pte_readonly(pte_t *ptep) -{ - return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY; -} - -static inline bool kvm_s2pte_exec(pte_t *ptep) -{ - return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN); -} - -static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp) -{ - kvm_set_s2pte_readonly((pte_t *)pmdp); -} - -static inline bool kvm_s2pmd_readonly(pmd_t *pmdp) -{ - return kvm_s2pte_readonly((pte_t *)pmdp); -} - -static inline bool kvm_s2pmd_exec(pmd_t *pmdp) -{ - return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); -} - -static inline void kvm_set_s2pud_readonly(pud_t *pudp) -{ - kvm_set_s2pte_readonly((pte_t *)pudp); -} - -static inline bool kvm_s2pud_readonly(pud_t *pudp) -{ - return 
kvm_s2pte_readonly((pte_t *)pudp); -} - -static inline bool kvm_s2pud_exec(pud_t *pudp) -{ - return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); -} - -static inline pud_t kvm_s2pud_mkyoung(pud_t pud) -{ - return pud_mkyoung(pud); -} - -static inline bool kvm_s2pud_young(pud_t pud) -{ - return pud_young(pud); -} - -#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) - -#ifdef __PAGETABLE_PMD_FOLDED -#define hyp_pmd_table_empty(pmdp) (0) -#else -#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#endif - -#ifdef __PAGETABLE_PUD_FOLDED -#define hyp_pud_table_empty(pudp) (0) -#else -#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) -#endif - -#ifdef __PAGETABLE_P4D_FOLDED -#define hyp_p4d_table_empty(p4dp) (0) -#else -#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp) -#endif struct kvm; @@ -326,77 +171,9 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, } } -static inline void __kvm_flush_dcache_pte(pte_t pte) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pte_page(pte); - kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE); - } -} - -static inline void __kvm_flush_dcache_pmd(pmd_t pmd) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pmd_page(pmd); - kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE); - } -} - -static inline void __kvm_flush_dcache_pud(pud_t pud) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pud_page(pud); - kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE); - } -} - void kvm_set_way_flush(struct kvm_vcpu *vcpu); void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); -static inline bool __kvm_cpu_uses_extended_idmap(void) -{ - return __cpu_uses_extended_idmap_level(); -} - -static inline unsigned long __kvm_idmap_ptrs_per_pgd(void) -{ - return idmap_ptrs_per_pgd; -} - -/* - * Can't use pgd_populate here, because the extended idmap adds an extra level - * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended - * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4. - */ -static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, - pgd_t *hyp_pgd, - pgd_t *merged_hyp_pgd, - unsigned long hyp_idmap_start) -{ - int idmap_idx; - u64 pgd_addr; - - /* - * Use the first entry to access the HYP mappings. It is - * guaranteed to be free, otherwise we wouldn't use an - * extended idmap. - */ - VM_BUG_ON(pgd_val(merged_hyp_pgd[0])); - pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd)); - merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE); - - /* - * Create another extended level entry that points to the boot HYP map, - * which contains an ID mapping of the HYP init code. We essentially - * merge the boot and runtime HYP maps by doing so, but they don't - * overlap anyway, so this is fine. - */ - idmap_idx = hyp_idmap_start >> VA_BITS; - VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx])); - pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd)); - merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE); -} - static inline unsigned int kvm_get_vmid_bits(void) { int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); @@ -479,30 +256,6 @@ static inline void *kvm_get_hyp_vector(void) #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) -/* - * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. - * With v8.2 LVA extensions, 'x' should be a minimum of 6 with - * 52bit IPS. 
- */ -static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) -{ - int x = ARM64_VTTBR_X(ipa_shift, levels); - - return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x; -} - -static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) -{ - unsigned int x = arm64_vttbr_x(ipa_shift, levels); - - return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); -} - -static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) -{ - return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); -} - static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h new file mode 100644 index 000000000000..52ab38db04c7 --- /dev/null +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Will Deacon <will@kernel.org> + */ + +#ifndef __ARM64_KVM_PGTABLE_H__ +#define __ARM64_KVM_PGTABLE_H__ + +#include <linux/bits.h> +#include <linux/kvm_host.h> +#include <linux/types.h> + +typedef u64 kvm_pte_t; + +/** + * struct kvm_pgtable - KVM page-table. + * @ia_bits: Maximum input address size, in bits. + * @start_level: Level at which the page-table walk starts. + * @pgd: Pointer to the first top-level entry of the page-table. + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + */ +struct kvm_pgtable { + u32 ia_bits; + u32 start_level; + kvm_pte_t *pgd; + + /* Stage-2 only */ + struct kvm_s2_mmu *mmu; +}; + +/** + * enum kvm_pgtable_prot - Page-table permissions and attributes. + * @KVM_PGTABLE_PROT_X: Execute permission. + * @KVM_PGTABLE_PROT_W: Write permission. + * @KVM_PGTABLE_PROT_R: Read permission. + * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + */ +enum kvm_pgtable_prot { + KVM_PGTABLE_PROT_X = BIT(0), + KVM_PGTABLE_PROT_W = BIT(1), + KVM_PGTABLE_PROT_R = BIT(2), + + KVM_PGTABLE_PROT_DEVICE = BIT(3), +}; + +#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) +#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) +#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) + +/** + * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. + * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid + * entries. + * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their + * children. + * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their + * children. + */ +enum kvm_pgtable_walk_flags { + KVM_PGTABLE_WALK_LEAF = BIT(0), + KVM_PGTABLE_WALK_TABLE_PRE = BIT(1), + KVM_PGTABLE_WALK_TABLE_POST = BIT(2), +}; + +typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg); + +/** + * struct kvm_pgtable_walker - Hook into a page-table walk. + * @cb: Callback function to invoke during the walk. + * @arg: Argument passed to the callback function. + * @flags: Bitwise-OR of flags to identify the entry types on which to + * invoke the callback function. + */ +struct kvm_pgtable_walker { + const kvm_pgtable_visitor_fn_t cb; + void * const arg; + const enum kvm_pgtable_walk_flags flags; +}; + +/** + * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. + * @pgt: Uninitialised page-table structure to initialise. + * @va_bits: Maximum virtual address bits. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits); + +/** + * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt); + +/** + * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). + * @addr: Virtual address at which to place the mapping. + * @size: Size of the mapping. + * @phys: Physical address of the memory to map. + * @prot: Permissions and attributes for the mapping. + * + * The offset of @addr within a page is ignored, @size is rounded-up to + * the next page boundary and @phys is rounded-down to the previous page + * boundary. + * + * If device attributes are not explicitly requested in @prot, then the + * mapping will be normal, cacheable. Attempts to install a new mapping + * for a virtual address that is already mapped will be rejected with an + * error and a WARN(). + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, + enum kvm_pgtable_prot prot); + +/** + * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. + * @pgt: Uninitialised page-table structure to initialise. + * @kvm: KVM structure representing the guest virtual machine. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm); + +/** + * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); + +/** + * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address at which to place the mapping. + * @size: Size of the mapping. + * @phys: Physical address of the memory to map. + * @prot: Permissions and attributes for the mapping. + * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to + * allocate page-table pages. + * + * The offset of @addr within a page is ignored, @size is rounded-up to + * the next page boundary and @phys is rounded-down to the previous page + * boundary. + * + * If device attributes are not explicitly requested in @prot, then the + * mapping will be normal, cacheable. + * + * Note that this function will both coalesce existing table entries and split + * existing block mappings, relying on page-faults to fault back areas outside + * of the new mapping lazily. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + struct kvm_mmu_memory_cache *mc); + +/** + * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to remove the mapping. + * @size: Size of the mapping. 
+ * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * TLB invalidation is performed for each page-table entry cleared during the + * unmapping operation and the reference count for the page-table page + * containing the cleared entry is decremented, with unreferenced pages being + * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if + * FWB is not supported by the CPU. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range + * without TLB invalidation. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to write-protect, + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Note that it is the caller's responsibility to invalidate the TLB after + * calling this function to ensure that the updated permissions are visible + * to the CPUs. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * set the access flag in that entry. + * + * Return: The old page-table entry prior to setting the flag, 0 on failure. + */ +kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr); + +/** + * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * clear the access flag in that entry. + * + * Note that it is the caller's responsibility to invalidate the TLB after + * calling this function to ensure that the updated permissions are visible + * to the CPUs. + * + * Return: The old page-table entry prior to clearing the flag, 0 on failure. + */ +kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); + +/** + * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a + * page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * @prot: Additional permissions to grant for the mapping. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * relax the permissions in that entry according to the read, write and + * execute permissions specified by @prot. No permissions are removed, and + * TLB invalidation is performed after updating the entry. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot); + +/** + * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the + * access flag set. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * Return: True if the page-table entry has the access flag set, false otherwise. + */ +bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr); + +/** + * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point + * of Coherency for guest stage-2 address + * range. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to flush. + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); + +/** + * kvm_pgtable_walk() - Walk a page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). + * @addr: Input address for the start of the walk. + * @size: Size of the range to walk. + * @walker: Walker callback description. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * The walker will walk the page-table entries corresponding to the input + * address range specified, visiting entries according to the walker flags. + * Invalid entries are treated as leaf entries. Leaf entries are reloaded + * after invoking the walker callback, allowing the walker to descend into + * a newly installed table. + * + * Returning a negative error code from the walker callback function will + * terminate the walk immediately with the same error code. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_pgtable_walker *walker); + +#endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h index 0ddf98c3ba9f..0cd0965255d2 100644 --- a/arch/arm64/include/asm/kvm_ptrauth.h +++ b/arch/arm64/include/asm/kvm_ptrauth.h @@ -60,7 +60,7 @@ .endm /* - * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will + * Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will * check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as * (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and * then proceed ahead with the save/restore of Pointer Authentication @@ -78,7 +78,7 @@ alternative_else_nop_endif .L__skip_switch\@: .endm -.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3 +.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3 alternative_if_not ARM64_HAS_ADDRESS_AUTH b .L__skip_switch\@ alternative_else_nop_endif @@ -96,7 +96,7 @@ alternative_else_nop_endif #else /* !CONFIG_ARM64_PTR_AUTH */ .macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3 .endm -.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3 +.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3 .endm #endif /* CONFIG_ARM64_PTR_AUTH */ #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 0b6409b89e5e..1599e17379d8 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -19,7 +19,16 @@ static inline void set_my_cpu_offset(unsigned long off) :: "r" (off) : "memory"); } -static inline unsigned long __my_cpu_offset(void) +static inline unsigned long __hyp_my_cpu_offset(void) +{ + /* + * Non-VHE hyp code runs with preemption disabled. No need to hazard + * the register access against barrier() as in __kern_my_cpu_offset. + */ + return read_sysreg(tpidr_el2); +} + +static inline unsigned long __kern_my_cpu_offset(void) { unsigned long off; @@ -35,7 +44,12 @@ static inline unsigned long __my_cpu_offset(void) return off; } -#define __my_cpu_offset __my_cpu_offset() + +#ifdef __KVM_NVHE_HYPERVISOR__ +#define __my_cpu_offset __hyp_my_cpu_offset() +#else +#define __my_cpu_offset __kern_my_cpu_offset() +#endif #define PERCPU_RW_OPS(sz) \ static inline unsigned long __percpu_read_##sz(void *ptr) \ @@ -227,4 +241,14 @@ PERCPU_RET_OP(add, add, ldadd) #include <asm-generic/percpu.h> +/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. 
*/ +#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT) +#undef this_cpu_ptr +#define this_cpu_ptr raw_cpu_ptr +#undef __this_cpu_read +#define __this_cpu_read raw_cpu_read +#undef __this_cpu_write +#define __this_cpu_write raw_cpu_write +#endif + #endif /* __ASM_PERCPU_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 94b3f2ac2e9d..01a96d07ae74 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -146,7 +146,6 @@ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ -#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */ #define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) #ifdef CONFIG_ARM64_PA_BITS_52 @@ -163,34 +162,11 @@ #define PTE_ATTRINDX_MASK (_AT(pteval_t, 7) << 2) /* - * 2nd stage PTE definitions - */ -#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ -#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ -#define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */ -#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */ - -#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */ -#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ -#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ -#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */ - -#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ -#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ -#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */ - -/* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) /* - * EL2/HYP PTE/PMD definitions - */ -#define PMD_HYP PMD_SECT_USER -#define PTE_HYP PTE_USER - -/* * Highest possible physical address supported. 
*/ #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 4cd0d6ca8aa1..046be789fbb4 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -64,7 +64,6 @@ extern bool arm64_use_ng_mappings; #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) -#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT #define PAGE_KERNEL __pgprot(PROT_NORMAL) #define PAGE_KERNEL_TAGGED __pgprot(PROT_NORMAL_TAGGED) @@ -73,11 +72,6 @@ extern bool arm64_use_ng_mappings; #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) #define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) -#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) -#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) -#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) -#define PAGE_HYP_DEVICE __pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN) - #define PAGE_S2_MEMATTR(attr) \ ({ \ u64 __val; \ @@ -88,19 +82,6 @@ extern bool arm64_use_ng_mappings; __val; \ }) -#define PAGE_S2_XN \ - ({ \ - u64 __val; \ - if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) \ - __val = 0; \ - else \ - __val = PTE_S2_XN; \ - __val; \ - }) - -#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN) -#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN) - #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 996bf98f0cab..fe341a6578c3 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -8,7 +8,6 @@ #ifndef __ARM64_S2_PGTABLE_H_ #define __ARM64_S2_PGTABLE_H_ -#include <linux/hugetlb.h> #include <linux/pgtable.h> /* @@ -37,217 +36,12 @@ #define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) /* - * The number of PTRS across all concatenated stage2 tables given by the - * number of bits resolved at the initial level. - * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA), - * in which case, stage2_pgd_ptrs will have one entry. - */ -#define pgd_ptrs_shift(ipa, pgdir_shift) \ - ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0) -#define __s2_pgd_ptrs(ipa, lvls) \ - (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls)))) -#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) - -#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) -#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) - -/* * kvm_mmmu_cache_min_pages() is the number of pages required to install * a stage-2 translation. We pre-allocate the entry level page table at * the VM creation. 
*/ #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) -/* Stage2 PUD definitions when the level is present */ -static inline bool kvm_stage2_has_pud(struct kvm *kvm) -{ - return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3); -} - -#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) -#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) -#define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) - -#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) -#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) -#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) -#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d) - -static inline p4d_t *stage2_p4d_offset(struct kvm *kvm, - pgd_t *pgd, unsigned long address) -{ - return p4d_offset(pgd, address); -} - -static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d) -{ -} - -static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp) -{ - return false; -} - -static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm, - phys_addr_t addr, phys_addr_t end) -{ - return end; -} - -static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d) -{ - if (kvm_stage2_has_pud(kvm)) - return p4d_none(p4d); - else - return 0; -} - -static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp) -{ - if (kvm_stage2_has_pud(kvm)) - p4d_clear(p4dp); -} - -static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d) -{ - if (kvm_stage2_has_pud(kvm)) - return p4d_present(p4d); - else - return 1; -} - -static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud) -{ - if (kvm_stage2_has_pud(kvm)) - p4d_populate(NULL, p4d, pud); -} - -static inline pud_t *stage2_pud_offset(struct kvm *kvm, - p4d_t *p4d, unsigned long address) -{ - if (kvm_stage2_has_pud(kvm)) - return pud_offset(p4d, address); - else - return (pud_t *)p4d; -} - -static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) -{ - if (kvm_stage2_has_pud(kvm)) - free_page((unsigned long)pud); -} - -static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) -{ - if (kvm_stage2_has_pud(kvm)) - return kvm_page_empty(pudp); - else - return false; -} - -static inline phys_addr_t -stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - if (kvm_stage2_has_pud(kvm)) { - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; - - return (boundary - 1 < end - 1) ? 
boundary : end; - } else { - return end; - } -} - -/* Stage2 PMD definitions when the level is present */ -static inline bool kvm_stage2_has_pmd(struct kvm *kvm) -{ - return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2); -} - -#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) -#define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) - -static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_none(pud); - else - return 0; -} - -static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) -{ - if (kvm_stage2_has_pmd(kvm)) - pud_clear(pud); -} - -static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_present(pud); - else - return 1; -} - -static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) -{ - if (kvm_stage2_has_pmd(kvm)) - pud_populate(NULL, pud, pmd); -} - -static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, - pud_t *pud, unsigned long address) -{ - if (kvm_stage2_has_pmd(kvm)) - return pmd_offset(pud, address); - else - return (pmd_t *)pud; -} - -static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) -{ - if (kvm_stage2_has_pmd(kvm)) - free_page((unsigned long)pmd); -} - -static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_huge(pud); - else - return 0; -} - -static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) -{ - if (kvm_stage2_has_pmd(kvm)) - return kvm_page_empty(pmdp); - else - return 0; -} - -static inline phys_addr_t -stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - if (kvm_stage2_has_pmd(kvm)) { - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; - - return (boundary - 1 < end - 1) ? boundary : end; - } else { - return end; - } -} - -static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) -{ - return kvm_page_empty(ptep); -} - -static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) -{ - return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1)); -} - static inline phys_addr_t stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { @@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) return (boundary - 1 < end - 1) ? boundary : end; } -/* - * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and - * the architectural page-table level. - */ -#define S2_NO_LEVEL_HINT 0 -#define S2_PUD_LEVEL 1 -#define S2_PMD_LEVEL 2 -#define S2_PTE_LEVEL 3 - #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 7d804fd0a682..1c17c3a24411 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -159,6 +159,21 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* + * PMU filter structure. Describe a range of events with a particular + * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. 
+ */ +struct kvm_pmu_event_filter { + __u16 base_event; + __u16 nevents; + +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + __u8 action; + __u8 pad[3]; +}; + /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { struct { @@ -338,6 +353,7 @@ struct kvm_vcpu_events { #define KVM_ARM_VCPU_PMU_V3_CTRL 0 #define KVM_ARM_VCPU_PMU_V3_IRQ 0 #define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_PMU_V3_FILTER 2 #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 843ecfb16a69..61684a500914 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -61,14 +61,11 @@ __efistub__ctype = _ctype; * memory mappings. */ -#define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym; - /* Alternative callbacks for init-time patching of nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); /* Global kernel state accessed by nVHE hyp code. */ -KVM_NVHE_ALIAS(kvm_host_data); KVM_NVHE_ALIAS(kvm_vgic_global_state); /* Kernel constant needed to compute idmap addresses. */ diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 5ca957e656ab..6d78c041fdf6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -10,6 +10,7 @@ #include <asm-generic/vmlinux.lds.h> #include <asm/cache.h> +#include <asm/hyp_image.h> #include <asm/kernel-pgtable.h> #include <asm/memory.h> #include <asm/page.h> @@ -22,12 +23,23 @@ ENTRY(_text) jiffies = jiffies_64; +#ifdef CONFIG_KVM #define HYPERVISOR_EXTABLE \ . = ALIGN(SZ_8); \ __start___kvm_ex_table = .; \ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; +#define HYPERVISOR_PERCPU_SECTION \ + . 
= ALIGN(PAGE_SIZE); \ + HYP_SECTION_NAME(.data..percpu) : { \ + *(HYP_SECTION_NAME(.data..percpu)) \ + } +#else /* CONFIG_KVM */ +#define HYPERVISOR_EXTABLE +#define HYPERVISOR_PERCPU_SECTION +#endif + #define HYPERVISOR_TEXT \ /* \ * Align to 4 KB so that \ @@ -196,6 +208,7 @@ SECTIONS } PERCPU_SECTION(L1_CACHE_BYTES) + HYPERVISOR_PERCPU_SECTION .rela.dyn : ALIGN(8) { *(.rela .rela*) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 99977c1972cc..1504c81fbf5d 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ $(KVM)/vfio.o $(KVM)/irqchip.o \ arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ - inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \ + inject_fault.o regmap.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o pmu.o \ aarch32.o arch_timer.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index acf9a993dfb6..f56122eedffc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -46,8 +46,10 @@ __asm__(".arch_extension virt"); #endif -DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); +DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); + static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); @@ -145,6 +147,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { int i; + bitmap_free(kvm->arch.pmu_filter); + kvm_vgic_destroy(kvm); for (i = 0; i < KVM_MAX_VCPUS; ++i) { @@ -286,7 +290,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) static_branch_dec(&userspace_irqchip_in_use); - kvm_mmu_free_memory_caches(vcpu); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); @@ -1259,6 +1263,19 @@ long kvm_arch_vm_ioctl(struct file *filp, } } +static unsigned long nvhe_percpu_size(void) +{ + return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) - + (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start); +} + +static unsigned long nvhe_percpu_order(void) +{ + unsigned long size = nvhe_percpu_size(); + + return size ? get_order(size) : 0; +} + static int kvm_map_vectors(void) { /* @@ -1299,6 +1316,7 @@ static void cpu_init_hyp_mode(void) unsigned long hyp_stack_ptr; unsigned long vector_ptr; unsigned long tpidr_el2; + struct arm_smccc_res res; /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); @@ -1308,12 +1326,13 @@ static void cpu_init_hyp_mode(void) * kernel's mapping to the linear mapping, and store it in tpidr_el2 * so that we can use adr_l to access per-cpu variables in EL2. */ - tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) - - (unsigned long)kvm_ksym_ref(&kvm_host_data)); + tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - + (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); pgd_ptr = kvm_mmu_get_httbr(); hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; - vector_ptr = (unsigned long)kvm_get_hyp_vector(); + hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr); + vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector)); /* * Call initialization code, and switch to the full blown HYP code. @@ -1322,7 +1341,9 @@ static void cpu_init_hyp_mode(void) * cpus_have_const_cap() wrapper. 
*/ BUG_ON(!system_capabilities_finalized()); - __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), + pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res); + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); /* * Disabling SSBD on a non-VHE system requires us to enable SSBS @@ -1342,10 +1363,12 @@ static void cpu_hyp_reset(void) static void cpu_hyp_reinit(void) { - kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt); + kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); cpu_hyp_reset(); + *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector(); + if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); else @@ -1496,8 +1519,10 @@ static void teardown_hyp_mode(void) int cpu; free_hyp_pgds(); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order()); + } } /** @@ -1531,6 +1556,24 @@ static int init_hyp_mode(void) } /* + * Allocate and initialize pages for Hypervisor-mode percpu regions. + */ + for_each_possible_cpu(cpu) { + struct page *page; + void *page_addr; + + page = alloc_pages(GFP_KERNEL, nvhe_percpu_order()); + if (!page) { + err = -ENOMEM; + goto out_err; + } + + page_addr = page_address(page); + memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); + kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr; + } + + /* * Map the Hyp-code called directly from the host */ err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), @@ -1574,14 +1617,17 @@ static int init_hyp_mode(void) } } + /* + * Map Hyp percpu pages + */ for_each_possible_cpu(cpu) { - kvm_host_data_t *cpu_data; + char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; + char *percpu_end = percpu_begin + nvhe_percpu_size(); - cpu_data = per_cpu_ptr(&kvm_host_data, cpu); - err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); + err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); if (err) { - kvm_err("Cannot map host CPU state: %d\n", err); + kvm_err("Cannot map hyp percpu region\n"); goto out_err; } } diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S deleted file mode 100644 index 3c79a1124af2..000000000000 --- a/arch/arm64/kvm/hyp.S +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - */ - -#include <linux/linkage.h> - -#include <asm/alternative.h> -#include <asm/assembler.h> -#include <asm/cpufeature.h> - -/* - * u64 __kvm_call_hyp(void *hypfn, ...); - * - * This is not really a variadic function in the classic C-way and care must - * be taken when calling this to ensure parameters are passed in registers - * only, since the stack will change between the caller and the callee. - * - * Call the function with the first argument containing a pointer to the - * function you wish to call in Hyp mode, and subsequent arguments will be - * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the - * function pointer can be passed). The function being called must be mapped - * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are - * passed in x0. - * - * A function pointer with a value less than 0xfff has a special meaning, - * and is used to implement hyp stubs in the same way as in - * arch/arm64/kernel/hyp_stub.S. 
- */ -SYM_FUNC_START(__kvm_call_hyp) - hvc #0 - ret -SYM_FUNC_END(__kvm_call_hyp) diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index d898f0da5802..4a81eddabcd8 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \ -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM) += vhe/ nvhe/ smccc_wa.o +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 76e7eaf4675e..b0afad7a99c6 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -7,7 +7,6 @@ #include <linux/linkage.h> #include <asm/alternative.h> -#include <asm/asm-offsets.h> #include <asm/assembler.h> #include <asm/fpsimdmacros.h> #include <asm/kvm.h> @@ -16,66 +15,28 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_ptrauth.h> -#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x) -#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8) - .text /* - * We treat x18 as callee-saved as the host may use it as a platform - * register (e.g. for shadow call stack). - */ -.macro save_callee_saved_regs ctxt - str x18, [\ctxt, #CPU_XREG_OFFSET(18)] - stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] - stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] - stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] - stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] - stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] - stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] -.endm - -.macro restore_callee_saved_regs ctxt - // We require \ctxt is not x18-x28 - ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)] - ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] - ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] - ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] - ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] - ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] - ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] -.endm - -.macro save_sp_el0 ctxt, tmp - mrs \tmp, sp_el0 - str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] -.endm - -.macro restore_sp_el0 ctxt, tmp - ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] - msr sp_el0, \tmp -.endm - -/* - * u64 __guest_enter(struct kvm_vcpu *vcpu, - * struct kvm_cpu_context *host_ctxt); + * u64 __guest_enter(struct kvm_vcpu *vcpu); */ SYM_FUNC_START(__guest_enter) // x0: vcpu - // x1: host context - // x2-x17: clobbered by macros + // x1-x17: clobbered by macros // x29: guest context - // Store the host regs + adr_this_cpu x1, kvm_hyp_ctxt, x2 + + // Store the hyp regs save_callee_saved_regs x1 - // Save the host's sp_el0 + // Save hyp's sp_el0 save_sp_el0 x1, x2 - // Now the host state is stored if we have a pending RAS SError it must - // affect the host. If any asynchronous exception is pending we defer - // the guest entry. The DSB isn't necessary before v8.2 as any SError - // would be fatal. + // Now the hyp state is stored if we have a pending RAS SError it must + // affect the host or hyp. If any asynchronous exception is pending we + // defer the guest entry. The DSB isn't necessary before v8.2 as any + // SError would be fatal. 
alternative_if ARM64_HAS_RAS_EXTN dsb nshst isb @@ -86,6 +47,8 @@ alternative_else_nop_endif ret 1: + set_loaded_vcpu x0, x1, x2 + add x29, x0, #VCPU_CONTEXT // Macro ptrauth_switch_to_guest format: @@ -116,6 +79,26 @@ alternative_else_nop_endif eret sb +SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + // If the hyp context is loaded, go straight to hyp_panic + get_loaded_vcpu x0, x1 + cbz x0, hyp_panic + + // The hyp context is saved so make sure it is restored to allow + // hyp_panic to run at hyp and, subsequently, panic to run in the host. + // This makes use of __guest_exit to avoid duplication but sets the + // return address to tail call into hyp_panic. As a side effect, the + // current state is saved to the guest context but it will only be + // accurate if the guest had been completely restored. + adr_this_cpu x0, kvm_hyp_ctxt, x1 + adr x1, hyp_panic + str x1, [x0, #CPU_XREG_OFFSET(30)] + + get_vcpu_ptr x1, x0 + SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x0: return code // x1: vcpu @@ -148,21 +131,23 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Store the guest's sp_el0 save_sp_el0 x1, x2 - get_host_ctxt x2, x3 + adr_this_cpu x2, kvm_hyp_ctxt, x3 - // Macro ptrauth_switch_to_guest format: - // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3) + // Macro ptrauth_switch_to_hyp format: + // ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3) // The below macro to save/restore keys is not implemented in C code // as it may cause Pointer Authentication key signing mismatch errors // when this feature is enabled for kernel code. - ptrauth_switch_to_host x1, x2, x3, x4, x5 + ptrauth_switch_to_hyp x1, x2, x3, x4, x5 - // Restore the hosts's sp_el0 + // Restore hyp's sp_el0 restore_sp_el0 x2, x3 - // Now restore the host regs + // Now restore the hyp regs restore_callee_saved_regs x2 + set_loaded_vcpu xzr, x1, x2 + alternative_if ARM64_HAS_RAS_EXTN // If we have the RAS extensions we can consume a pending error // without an unmask-SError and isb. The ESB-instruction consumed any diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 7ea277b82967..0a5b36eb54b3 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -12,7 +12,6 @@ #include <asm/cpufeature.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> -#include <asm/kvm_mmu.h> #include <asm/mmu.h> .macro save_caller_saved_regs_vect @@ -41,20 +40,6 @@ .text -.macro do_el2_call - /* - * Shuffle the parameters before calling the function - * pointed to in x0. Assumes parameters in x[1,2,3]. - */ - str lr, [sp, #-16]! - mov lr, x0 - mov x0, x1 - mov x1, x2 - mov x2, x3 - blr lr - ldr lr, [sp], #16 -.endm - el1_sync: // Guest trapped into EL2 mrs x0, esr_el2 @@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2 ccmp x0, #ESR_ELx_EC_HVC32, #4, ne b.ne el1_trap -#ifdef __KVM_NVHE_HYPERVISOR__ - mrs x1, vttbr_el2 // If vttbr is valid, the guest - cbnz x1, el1_hvc_guest // called HVC - - /* Here, we're pretty sure the host called HVC. */ - ldp x0, x1, [sp], #16 - - /* Check for a stub HVC call */ - cmp x0, #HVC_STUB_HCALL_NR - b.hs 1f - - /* - * Compute the idmap address of __kvm_handle_stub_hvc and - * jump there. Since we use kimage_voffset, do not use the - * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead - * (by loading it from the constant pool). - * - * Preserve x0-x4, which may contain stub parameters. 
- */ - ldr x5, =__kvm_handle_stub_hvc - ldr_l x6, kimage_voffset - - /* x5 = __pa(x5) */ - sub x5, x5, x6 - br x5 - -1: - /* - * Perform the EL2 call - */ - kern_hyp_va x0 - do_el2_call - - eret - sb -#endif /* __KVM_NVHE_HYPERVISOR__ */ - -el1_hvc_guest: /* * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1. * The workaround has already been applied on the host, @@ -169,24 +116,7 @@ el2_error: eret sb -#ifdef __KVM_NVHE_HYPERVISOR__ -SYM_FUNC_START(__hyp_do_panic) - mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) - msr spsr_el2, lr - ldr lr, =panic - msr elr_el2, lr - eret - sb -SYM_FUNC_END(__hyp_do_panic) -#endif - -SYM_CODE_START(__hyp_panic) - get_host_ctxt x0, x1 - b hyp_panic -SYM_CODE_END(__hyp_panic) - -.macro invalid_vector label, target = __hyp_panic +.macro invalid_vector label, target = __guest_exit_panic .align 2 SYM_CODE_START(\label) b \target @@ -198,7 +128,6 @@ SYM_CODE_END(\label) invalid_vector el2t_irq_invalid invalid_vector el2t_fiq_invalid invalid_vector el2t_error_invalid - invalid_vector el2h_sync_invalid invalid_vector el2h_irq_invalid invalid_vector el2h_fiq_invalid invalid_vector el1_fiq_invalid @@ -228,10 +157,9 @@ check_preamble_length 661b, 662b .macro invalid_vect target .align 7 661: - b \target nop + stp x0, x1, [sp, #-16]! 662: - ldp x0, x1, [sp], #16 b \target check_preamble_length 661b, 662b diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 5e28ea6aa097..4ebe9f558f3a 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) return; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; guest_ctxt = &vcpu->arch.ctxt; host_dbg = &vcpu->arch.host_debug_state.regs; guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); @@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) return; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; guest_ctxt = &vcpu->arch.ctxt; host_dbg = &vcpu->arch.host_debug_state.regs; guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index d0f07e8cc3ff..313a8fa3c721 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) } } -static inline void __activate_vm(struct kvm_s2_mmu *mmu) -{ - __load_guest_stage2(mmu); -} - static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { u64 par, tmp; @@ -377,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr) ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \ } while(0) +DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); + static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *ctxt; @@ -386,7 +383,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) return false; - ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + ctxt = this_cpu_ptr(&kvm_hyp_ctxt); __ptrauth_save_key(ctxt, APIA); __ptrauth_save_key(ctxt, APIB); __ptrauth_save_key(ctxt, APDA); @@ -481,14 +478,13 @@ exit: static inline void 
__kvm_unexpected_el2_exception(void) { + extern char __guest_exit_panic[]; unsigned long addr, fixup; - struct kvm_cpu_context *host_ctxt; struct exception_table_entry *entry, *end; unsigned long elr_el2 = read_sysreg(elr_el2); entry = hyp_symbol_addr(__start___kvm_ex_table); end = hyp_symbol_addr(__stop___kvm_ex_table); - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; while (entry < end) { addr = (unsigned long)&entry->insn + entry->insn; @@ -503,7 +499,8 @@ static inline void __kvm_unexpected_el2_exception(void) return; } - hyp_panic(host_ctxt); + /* Trigger a panic after restoring the hyp context. */ + write_sysreg(__guest_exit_panic, elr_el2); } #endif /* __ARM64_KVM_HYP_SWITCH_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore new file mode 100644 index 000000000000..695d73d0249e --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +hyp.lds diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index aef76487edc2..ddde15fe85f2 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -6,44 +6,50 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o +obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o -obj-y := $(patsubst %.o,%.hyp.o,$(obj-y)) -extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y)) +## +## Build rules for compiling nVHE hyp code +## Output of this folder is `kvm_nvhe.o`, a partially linked object +## file containing all nVHE hyp code and data. +## -$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE +hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y)) +obj-y := kvm_nvhe.o +extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds + +# 1) Compile all source files to `.nvhe.o` object files. The file extension +# avoids file name clashes for files shared with VHE. +$(obj)/%.nvhe.o: $(src)/%.c FORCE $(call if_changed_rule,cc_o_c) -$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE +$(obj)/%.nvhe.o: $(src)/%.S FORCE $(call if_changed_rule,as_o_S) -$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE - $(call if_changed,hypcopy) -# Disable reordering functions by GCC (enabled at -O2). -# This pass puts functions into '.text.*' sections to aid the linker -# in optimizing ELF layout. See HYPCOPY comment below for more info. -ccflags-y += $(call cc-option,-fno-reorder-functions) +# 2) Compile linker script. +$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE + $(call if_changed_dep,cpp_lds_S) + +# 3) Partially link all '.nvhe.o' files and apply the linker script. +# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'. +# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before +# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to +# keep the dependency on the target while avoiding an error from +# GNU ld if the linker script is passed to it twice. +LDFLAGS_kvm_nvhe.tmp.o := -r -T +$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE + $(call if_changed,ld) + +# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. +# Prefixes names of ELF symbols with '__kvm_nvhe_'. +$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE + $(call if_changed,hypcopy) # The HYPCOPY command uses `objcopy` to prefix all ELF symbol names -# and relevant ELF section names to avoid clashes with VHE code/data. 
-# -# Hyp code is assumed to be in the '.text' section of the input object -# files (with the exception of specialized sections such as -# '.hyp.idmap.text'). This assumption may be broken by a compiler that -# divides code into sections like '.text.unlikely' so as to optimize -# ELF layout. HYPCOPY checks that no such sections exist in the input -# using `objdump`, otherwise they would be linked together with other -# kernel code and not memory-mapped correctly at runtime. +# to avoid clashes with VHE code/data. quiet_cmd_hypcopy = HYPCOPY $@ - cmd_hypcopy = \ - if $(OBJDUMP) -h $< | grep -F '.text.'; then \ - echo "$@: function reordering not supported in nVHE hyp code" >&2; \ - /bin/false; \ - fi; \ - $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ \ - --rename-section=.text=.hyp.text \ - $< $@ + cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ # Remove ftrace and Shadow Call Stack CFLAGS. # This is equivalent to the 'notrace' and '__noscs' annotations. diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S new file mode 100644 index 000000000000..ff9a0f547b9f --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 - Google Inc + * Author: Andrew Scull <ascull@google.com> + */ + +#include <linux/linkage.h> + +#include <asm/assembler.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> + + .text + +SYM_FUNC_START(__host_exit) + stp x0, x1, [sp, #-16]! + + get_host_ctxt x0, x1 + + ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) + + /* Store the host regs x2 and x3 */ + stp x2, x3, [x0, #CPU_XREG_OFFSET(2)] + + /* Retrieve the host regs x0-x1 from the stack */ + ldp x2, x3, [sp], #16 // x0, x1 + + /* Store the host regs x0-x1 and x4-x17 */ + stp x2, x3, [x0, #CPU_XREG_OFFSET(0)] + stp x4, x5, [x0, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x0, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x0, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x0, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x0, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x0, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x0, #CPU_XREG_OFFSET(16)] + + /* Store the host regs x18-x29, lr */ + save_callee_saved_regs x0 + + /* Save the host context pointer in x29 across the function call */ + mov x29, x0 + bl handle_trap + + /* Restore host regs x0-x17 */ + ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)] + + /* x0-7 are use for panic arguments */ +__host_enter_for_panic: + ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)] + + /* Restore host regs x18-x29, lr */ + restore_callee_saved_regs x29 + + /* Do not touch any register after this! */ +__host_enter_without_restoring: + eret + sb +SYM_FUNC_END(__host_exit) + +/* + * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); + */ +SYM_FUNC_START(__hyp_do_panic) + /* Load the format arguments into x1-7 */ + mov x6, x3 + get_vcpu_ptr x7, x3 + + mrs x3, esr_el2 + mrs x4, far_el2 + mrs x5, hpfar_el2 + + /* Prepare and exit to the host's panic funciton. */ + mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ + PSR_MODE_EL1h) + msr spsr_el2, lr + ldr lr, =panic + msr elr_el2, lr + + /* + * Set the panic format string and enter the host, conditionally + * restoring the host context. 
+ */ + cmp x0, xzr + ldr x0, =__hyp_panic_string + b.eq __host_enter_without_restoring + b __host_enter_for_panic +SYM_FUNC_END(__hyp_do_panic) + +.macro host_el1_sync_vect + .align 7 +.L__vect_start\@: + stp x0, x1, [sp, #-16]! + mrs x0, esr_el2 + lsr x0, x0, #ESR_ELx_EC_SHIFT + cmp x0, #ESR_ELx_EC_HVC64 + ldp x0, x1, [sp], #16 + b.ne __host_exit + + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.hs __host_exit + + /* + * Compute the idmap address of __kvm_handle_stub_hvc and + * jump there. Since we use kimage_voffset, do not use the + * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead + * (by loading it from the constant pool). + * + * Preserve x0-x4, which may contain stub parameters. + */ + ldr x5, =__kvm_handle_stub_hvc + ldr_l x6, kimage_voffset + + /* x5 = __pa(x5) */ + sub x5, x5, x6 + br x5 +.L__vect_end\@: +.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) + .error "host_el1_sync_vect larger than vector entry" +.endif +.endm + +.macro invalid_host_el2_vect + .align 7 + /* If a guest is loaded, panic out of it. */ + stp x0, x1, [sp, #-16]! + get_loaded_vcpu x0, x1 + cbnz x0, __guest_exit_panic + add sp, sp, #16 + + /* + * The panic may not be clean if the exception is taken before the host + * context has been saved by __host_exit or after the hyp context has + * been partially clobbered by __host_enter. + */ + b hyp_panic +.endm + +.macro invalid_host_el1_vect + .align 7 + mov x0, xzr /* restore_host = false */ + mrs x1, spsr_el2 + mrs x2, elr_el2 + mrs x3, par_el1 + b __hyp_do_panic +.endm + +/* + * The host vector does not use an ESB instruction in order to avoid consuming + * SErrors that should only be consumed by the host. Guest entry is deferred by + * __guest_enter if there are any pending asynchronous exceptions so hyp will + * always return to the host without having consumerd host SErrors. + * + * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the + * host knows about the EL2 vectors already, and there is no point in hiding + * them. + */ + .align 11 +SYM_CODE_START(__kvm_hyp_host_vector) + invalid_host_el2_vect // Synchronous EL2t + invalid_host_el2_vect // IRQ EL2t + invalid_host_el2_vect // FIQ EL2t + invalid_host_el2_vect // Error EL2t + + invalid_host_el2_vect // Synchronous EL2h + invalid_host_el2_vect // IRQ EL2h + invalid_host_el2_vect // FIQ EL2h + invalid_host_el2_vect // Error EL2h + + host_el1_sync_vect // Synchronous 64-bit EL1 + invalid_host_el1_vect // IRQ 64-bit EL1 + invalid_host_el1_vect // FIQ 64-bit EL1 + invalid_host_el1_vect // Error 64-bit EL1 + + invalid_host_el1_vect // Synchronous 32-bit EL1 + invalid_host_el1_vect // IRQ 32-bit EL1 + invalid_host_el1_vect // FIQ 32-bit EL1 + invalid_host_el1_vect // Error 32-bit EL1 +SYM_CODE_END(__kvm_hyp_host_vector) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index d9434e90c06d..47224dc62c51 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -4,11 +4,13 @@ * Author: Marc Zyngier <marc.zyngier@arm.com> */ +#include <linux/arm-smccc.h> #include <linux/linkage.h> #include <asm/alternative.h> #include <asm/assembler.h> #include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> #include <asm/pgtable-hwdef.h> #include <asm/sysreg.h> @@ -44,27 +46,37 @@ __invalid: b . 
/* - * x0: HYP pgd - * x1: HYP stack - * x2: HYP vectors - * x3: per-CPU offset + * x0: SMCCC function ID + * x1: HYP pgd + * x2: per-CPU offset + * x3: HYP stack + * x4: HYP vectors */ __do_hyp_init: /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR b.lo __kvm_handle_stub_hvc - phys_to_ttbr x4, x0 + /* Set tpidr_el2 for use by HYP to free a register */ + msr tpidr_el2, x2 + + mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) + cmp x0, x2 + b.eq 1f + mov x0, #SMCCC_RET_NOT_SUPPORTED + eret + +1: phys_to_ttbr x0, x1 alternative_if ARM64_HAS_CNP - orr x4, x4, #TTBR_CNP_BIT + orr x0, x0, #TTBR_CNP_BIT alternative_else_nop_endif - msr ttbr0_el2, x4 + msr ttbr0_el2, x0 - mrs x4, tcr_el1 - mov_q x5, TCR_EL2_MASK - and x4, x4, x5 - mov x5, #TCR_EL2_RES1 - orr x4, x4, x5 + mrs x0, tcr_el1 + mov_q x1, TCR_EL2_MASK + and x0, x0, x1 + mov x1, #TCR_EL2_RES1 + orr x0, x0, x1 /* * The ID map may be configured to use an extended virtual address @@ -80,18 +92,18 @@ alternative_else_nop_endif * * So use the same T0SZ value we use for the ID map. */ - ldr_l x5, idmap_t0sz - bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + ldr_l x1, idmap_t0sz + bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH /* * Set the PS bits in TCR_EL2. */ - tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6 + tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 - msr tcr_el2, x4 + msr tcr_el2, x0 - mrs x4, mair_el1 - msr mair_el2, x4 + mrs x0, mair_el1 + msr mair_el2, x0 isb /* Invalidate the stale TLBs from Bootloader */ @@ -103,25 +115,22 @@ alternative_else_nop_endif * as well as the EE bit on BE. Drop the A flag since the compiler * is allowed to generate unaligned accesses. */ - mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) -CPU_BE( orr x4, x4, #SCTLR_ELx_EE) + mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) +CPU_BE( orr x0, x0, #SCTLR_ELx_EE) alternative_if ARM64_HAS_ADDRESS_AUTH - mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ + mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) - orr x4, x4, x5 + orr x0, x0, x1 alternative_else_nop_endif - msr sctlr_el2, x4 + msr sctlr_el2, x0 isb /* Set the stack and new vectors */ - kern_hyp_va x1 - mov sp, x1 - msr vbar_el2, x2 - - /* Set tpidr_el2 for use by HYP */ - msr tpidr_el2, x3 + mov sp, x3 + msr vbar_el2, x4 /* Hello, World! 
*/ + mov x0, #SMCCC_RET_SUCCESS eret SYM_CODE_END(__kvm_hyp_init) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c new file mode 100644 index 000000000000..e2eafe2c93af --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google Inc + * Author: Andrew Scull <ascull@google.com> + */ + +#include <hyp/switch.h> + +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +#include <kvm/arm_hypercalls.h> + +static void handle_host_hcall(unsigned long func_id, + struct kvm_cpu_context *host_ctxt) +{ + unsigned long ret = 0; + + switch (func_id) { + case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1; + + ret = __kvm_vcpu_run(kern_hyp_va(vcpu)); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context): + __kvm_flush_vm_context(); + break; + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + phys_addr_t ipa = host_ctxt->regs.regs[2]; + int level = host_ctxt->regs.regs[3]; + + __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + + __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + + __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): { + u64 cntvoff = host_ctxt->regs.regs[1]; + + __kvm_timer_set_cntvoff(cntvoff); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs): + __kvm_enable_ssbs(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2): + ret = __vgic_v3_get_ich_vtr_el2(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr): + ret = __vgic_v3_read_vmcr(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): { + u32 vmcr = host_ctxt->regs.regs[1]; + + __vgic_v3_write_vmcr(vmcr); + break; + } + case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs): + __vgic_v3_init_lrs(); + break; + case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2): + ret = __kvm_get_mdcr_el2(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; + + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); + break; + } + case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; + + __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); + break; + } + default: + /* Invalid host HVC. 
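The new hyp-main.c dispatcher above (completed just below, where the SMCCC status and return value are written back into the saved x0/x1) defines the whole host-to-hyp ABI: the host issues an HVC with a function ID in x0 and arguments in x1..xn, and the handler answers through the saved host context. A minimal userspace model of that convention is sketched here; the function IDs and return codes are invented stand-ins, not the real SMCCC values.

    /*
     * toy_hcall.c: userspace model of the handle_host_hcall() calling
     * convention. Function IDs and return codes are simplified stand-ins.
     */
    #include <stdio.h>

    #define TOY_SMCCC_RET_SUCCESS        0L
    #define TOY_SMCCC_RET_NOT_SUPPORTED  (-1L)

    enum toy_func { TOY_FUNC_ADD = 1, TOY_FUNC_GET_VERSION = 2 };

    struct toy_cpu_context { long regs[31]; };

    static void toy_handle_host_hcall(struct toy_cpu_context *host_ctxt)
    {
        unsigned long func_id = host_ctxt->regs[0];
        long ret = 0;

        switch (func_id) {
        case TOY_FUNC_ADD:          /* arguments arrive in regs[1], regs[2], ... */
            ret = host_ctxt->regs[1] + host_ctxt->regs[2];
            break;
        case TOY_FUNC_GET_VERSION:
            ret = 42;
            break;
        default:                    /* unknown hypercall */
            host_ctxt->regs[0] = TOY_SMCCC_RET_NOT_SUPPORTED;
            return;
        }

        /* SMCCC-style result: status in x0, value in x1. */
        host_ctxt->regs[0] = TOY_SMCCC_RET_SUCCESS;
        host_ctxt->regs[1] = ret;
    }

    int main(void)
    {
        struct toy_cpu_context ctxt = { .regs = { TOY_FUNC_ADD, 2, 3 } };

        toy_handle_host_hcall(&ctxt);
        printf("status=%ld value=%ld\n", ctxt.regs[0], ctxt.regs[1]);
        return 0;
    }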
*/ + host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED; + return; + } + + host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS; + host_ctxt->regs.regs[1] = ret; +} + +void handle_trap(struct kvm_cpu_context *host_ctxt) +{ + u64 esr = read_sysreg_el2(SYS_ESR); + unsigned long func_id; + + if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64) + hyp_panic(); + + func_id = host_ctxt->regs.regs[0]; + handle_host_hcall(func_id, host_ctxt); +} diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S new file mode 100644 index 000000000000..bb2d986ff696 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + * Written by David Brazdil <dbrazdil@google.com> + * + * Linker script used for partial linking of nVHE EL2 object files. + */ + +#include <asm/hyp_image.h> +#include <asm-generic/vmlinux.lds.h> +#include <asm/cache.h> +#include <asm/memory.h> + +SECTIONS { + HYP_SECTION(.text) + HYP_SECTION_NAME(.data..percpu) : { + PERCPU_INPUT(L1_CACHE_BYTES) + } +} diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 8d3dd4f47924..a457a0306e03 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -27,6 +27,11 @@ #include <asm/processor.h> #include <asm/thread_info.h> +/* Non-VHE specific context */ +DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); +DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -42,6 +47,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) } write_sysreg(val, cptr_el2); + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt; @@ -60,6 +66,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { + extern char __kvm_hyp_host_vector[]; u64 mdcr_el2; ___deactivate_traps(vcpu); @@ -91,9 +98,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(mdcr_el2, mdcr_el2); write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); + write_sysreg(__kvm_hyp_host_vector, vbar_el2); } -static void __deactivate_vm(struct kvm_vcpu *vcpu) +static void __load_host_stage2(void) { write_sysreg(0, vttbr_el2); } @@ -173,9 +181,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmr_sync(); } - vcpu = kern_hyp_va(vcpu); - - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; @@ -194,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_state_nvhe(guest_ctxt); - __activate_vm(kern_hyp_va(vcpu->arch.hw_mmu)); + __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu)); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); @@ -204,7 +210,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) do { /* Jump in the fire! */ - exit_code = __guest_enter(vcpu, host_ctxt); + exit_code = __guest_enter(vcpu); /* And we're baaack! 
*/ } while (fixup_guest_exit(vcpu, &exit_code)); @@ -215,7 +221,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __hyp_vgic_save_state(vcpu); __deactivate_traps(vcpu); - __deactivate_vm(vcpu); + __load_host_stage2(); __sysreg_restore_state_nvhe(host_ctxt); @@ -235,35 +241,31 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQOFF); + host_ctxt->__hyp_running_vcpu = NULL; + return exit_code; } -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt) +void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg(par_el1); - struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu; - unsigned long str_va; + bool restore_host = true; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; - if (read_sysreg(vttbr_el2)) { + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + + if (vcpu) { __timer_disable_traps(vcpu); __deactivate_traps(vcpu); - __deactivate_vm(vcpu); + __load_host_stage2(); __sysreg_restore_state_nvhe(host_ctxt); } - /* - * Force the panic string to be loaded from the literal pool, - * making sure it is a kernel address and not a PC-relative - * reference. - */ - asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string)); - - __hyp_do_panic(str_va, - spsr, elr, - read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR), - read_sysreg(hpfar_el2), par, vcpu); + __hyp_do_panic(restore_host, spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index b15d65a42042..39ca71ab8866 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -61,7 +61,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - mmu = kern_hyp_va(mmu); __tlb_switch_to_guest(mmu, &cxt); /* @@ -115,7 +114,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) dsb(ishst); /* Switch to requested VMID */ - mmu = kern_hyp_va(mmu); __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalls12e1is); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c new file mode 100644 index 000000000000..0cdf6e461cbd --- /dev/null +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -0,0 +1,892 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. + * No bombay mix was harmed in the writing of this file. 
+ * + * Copyright (C) 2020 Google LLC + * Author: Will Deacon <will@kernel.org> + */ + +#include <linux/bitfield.h> +#include <asm/kvm_pgtable.h> + +#define KVM_PGTABLE_MAX_LEVELS 4U + +#define KVM_PTE_VALID BIT(0) + +#define KVM_PTE_TYPE BIT(1) +#define KVM_PTE_TYPE_BLOCK 0 +#define KVM_PTE_TYPE_PAGE 1 +#define KVM_PTE_TYPE_TABLE 1 + +#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) + +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) + +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + +struct kvm_pgtable_walk_data { + struct kvm_pgtable *pgt; + struct kvm_pgtable_walker *walker; + + u64 addr; + u64 end; +}; + +static u64 kvm_granule_shift(u32 level) +{ + /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); +} + +static u64 kvm_granule_size(u32 level) +{ + return BIT(kvm_granule_shift(level)); +} + +static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) +{ + u64 granule = kvm_granule_size(level); + + /* + * Reject invalid block mappings and don't bother with 4TB mappings for + * 52-bit PAs. 
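The "4TB mappings" remark above follows directly from the level-shift arithmetic: ARM64_HW_PGTABLE_LEVEL_SHIFT(level) evaluates, for a 4-level layout, to (4 - level) * (PAGE_SHIFT - 3) + 3, so a level-1 block covers 1GiB with 4K pages but 4TiB with 64K pages. The throwaway program below only evaluates that formula for both page sizes (it prints every level 0-3 even where the architecture defines no block at that level), and all toy_* names are invented.

    /*
     * toy_granule.c: evaluate the kvm_granule_shift()/kvm_granule_size()
     * maths, i.e. (4 - level) * (PAGE_SHIFT - 3) + 3, to show why level-1
     * blocks are 1GiB with 4K pages but 4TiB with 64K pages.
     */
    #include <stdio.h>

    static unsigned int toy_granule_shift(unsigned int page_shift, unsigned int level)
    {
        return (4 - level) * (page_shift - 3) + 3;
    }

    int main(void)
    {
        unsigned int page_shifts[] = { 12, 16 };    /* 4K and 64K pages */

        for (unsigned int i = 0; i < 2; i++)
            for (unsigned int level = 0; level <= 3; level++)
                printf("PAGE_SHIFT=%u level %u: block/page size = 2^%u bytes\n",
                       page_shifts[i], level,
                       toy_granule_shift(page_shifts[i], level));
        return 0;
    }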
+ */ + if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1)) + return false; + + if (granule > (end - addr)) + return false; + + return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule); +} + +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) +{ + u64 shift = kvm_granule_shift(level); + u64 mask = BIT(PAGE_SHIFT - 3) - 1; + + return (data->addr >> shift) & mask; +} + +static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) +{ + u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ + u64 mask = BIT(pgt->ia_bits) - 1; + + return (addr & mask) >> shift; +} + +static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data) +{ + return __kvm_pgd_page_idx(data->pgt, data->addr); +} + +static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) +{ + struct kvm_pgtable pgt = { + .ia_bits = ia_bits, + .start_level = start_level, + }; + + return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; +} + +static bool kvm_pte_valid(kvm_pte_t pte) +{ + return pte & KVM_PTE_VALID; +} + +static bool kvm_pte_table(kvm_pte_t pte, u32 level) +{ + if (level == KVM_PGTABLE_MAX_LEVELS - 1) + return false; + + if (!kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; +} + +static u64 kvm_pte_to_phys(kvm_pte_t pte) +{ + u64 pa = pte & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + + return pa; +} + +static kvm_pte_t kvm_phys_to_pte(u64 pa) +{ + kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + + return pte; +} + +static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte) +{ + return __va(kvm_pte_to_phys(pte)); +} + +static void kvm_set_invalid_pte(kvm_pte_t *ptep) +{ + kvm_pte_t pte = *ptep; + WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID); +} + +static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp) +{ + kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp)); + + pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); + pte |= KVM_PTE_VALID; + + WARN_ON(kvm_pte_valid(old)); + smp_store_release(ptep, pte); +} + +static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr, + u32 level) +{ + kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa); + u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : + KVM_PTE_TYPE_BLOCK; + + pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); + pte |= FIELD_PREP(KVM_PTE_TYPE, type); + pte |= KVM_PTE_VALID; + + /* Tolerate KVM recreating the exact same mapping. 
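For readers new to the descriptor format, kvm_phys_to_pte(), kvm_pte_to_phys() and kvm_set_valid_leaf_pte() above simply pack an output address plus a few control bits into a 64-bit word. The standalone sketch below models that encoding for the 4K-page case only, so it ignores the 51:48 address bits used with 64K pages and all of the attribute fields; the toy_* names are invented.

    /*
     * toy_pte.c: userspace model of the kvm_pte_t encoding used above,
     * assuming 4K pages (PAGE_SHIFT == 12): the OA lives in bits [47:12].
     */
    #include <stdio.h>
    #include <stdint.h>
    #include <assert.h>

    typedef uint64_t toy_pte_t;

    #define TOY_PAGE_SHIFT      12
    #define TOY_PTE_VALID       (1ULL << 0)
    #define TOY_PTE_TYPE        (1ULL << 1)    /* 0 = block, 1 = page/table */
    /* GENMASK(47, 12) */
    #define TOY_PTE_ADDR_MASK   (((1ULL << 48) - 1) & ~((1ULL << TOY_PAGE_SHIFT) - 1))

    static toy_pte_t toy_phys_to_pte(uint64_t pa)
    {
        return pa & TOY_PTE_ADDR_MASK;
    }

    static uint64_t toy_pte_to_phys(toy_pte_t pte)
    {
        return pte & TOY_PTE_ADDR_MASK;
    }

    static toy_pte_t toy_make_leaf(uint64_t pa, int last_level)
    {
        toy_pte_t pte = toy_phys_to_pte(pa);

        if (last_level)
            pte |= TOY_PTE_TYPE;    /* page descriptor */
        /* else: block descriptor, type bit left clear */
        return pte | TOY_PTE_VALID;
    }

    int main(void)
    {
        uint64_t pa = 0x0000800012345000ULL;
        toy_pte_t pte = toy_make_leaf(pa, 1);

        assert(pte & TOY_PTE_VALID);
        assert(toy_pte_to_phys(pte) == pa);
        printf("pa  = 0x%016llx\npte = 0x%016llx\n",
               (unsigned long long)pa, (unsigned long long)pte);
        return 0;
    }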
*/ + if (kvm_pte_valid(old)) + return old == pte; + + smp_store_release(ptep, pte); + return true; +} + +static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, + u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag) +{ + struct kvm_pgtable_walker *walker = data->walker; + return walker->cb(addr, data->end, level, ptep, flag, walker->arg); +} + +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, + kvm_pte_t *pgtable, u32 level); + +static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, + kvm_pte_t *ptep, u32 level) +{ + int ret = 0; + u64 addr = data->addr; + kvm_pte_t *childp, pte = *ptep; + bool table = kvm_pte_table(pte, level); + enum kvm_pgtable_walk_flags flags = data->walker->flags; + + if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_TABLE_PRE); + } + + if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_LEAF); + pte = *ptep; + table = kvm_pte_table(pte, level); + } + + if (ret) + goto out; + + if (!table) { + data->addr += kvm_granule_size(level); + goto out; + } + + childp = kvm_pte_follow(pte); + ret = __kvm_pgtable_walk(data, childp, level + 1); + if (ret) + goto out; + + if (flags & KVM_PGTABLE_WALK_TABLE_POST) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_TABLE_POST); + } + +out: + return ret; +} + +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, + kvm_pte_t *pgtable, u32 level) +{ + u32 idx; + int ret = 0; + + if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) + return -EINVAL; + + for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { + kvm_pte_t *ptep = &pgtable[idx]; + + if (data->addr >= data->end) + break; + + ret = __kvm_pgtable_visit(data, ptep, level); + if (ret) + break; + } + + return ret; +} + +static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) +{ + u32 idx; + int ret = 0; + struct kvm_pgtable *pgt = data->pgt; + u64 limit = BIT(pgt->ia_bits); + + if (data->addr > limit || data->end > limit) + return -ERANGE; + + if (!pgt->pgd) + return -EINVAL; + + for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) { + kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; + + ret = __kvm_pgtable_walk(data, ptep, pgt->start_level); + if (ret) + break; + } + + return ret; +} + +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_pgtable_walker *walker) +{ + struct kvm_pgtable_walk_data walk_data = { + .pgt = pgt, + .addr = ALIGN_DOWN(addr, PAGE_SIZE), + .end = PAGE_ALIGN(walk_data.addr + size), + .walker = walker, + }; + + return _kvm_pgtable_walk(&walk_data); +} + +struct hyp_map_data { + u64 phys; + kvm_pte_t attr; +}; + +static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot, + struct hyp_map_data *data) +{ + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; + kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); + u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; + u32 ap = (prot & KVM_PGTABLE_PROT_W) ? 
KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : + KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; + + if (!(prot & KVM_PGTABLE_PROT_R)) + return -EINVAL; + + if (prot & KVM_PGTABLE_PROT_X) { + if (prot & KVM_PGTABLE_PROT_W) + return -EINVAL; + + if (device) + return -EINVAL; + } else { + attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; + } + + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; + data->attr = attr; + return 0; +} + +static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, struct hyp_map_data *data) +{ + u64 granule = kvm_granule_size(level), phys = data->phys; + + if (!kvm_block_mapping_supported(addr, end, phys, level)) + return false; + + WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)); + data->phys += granule; + return true; +} + +static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + kvm_pte_t *childp; + + if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg)) + return 0; + + if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + if (!childp) + return -ENOMEM; + + kvm_set_table_pte(ptep, childp); + return 0; +} + +int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, + enum kvm_pgtable_prot prot) +{ + int ret; + struct hyp_map_data map_data = { + .phys = ALIGN_DOWN(phys, PAGE_SIZE), + }; + struct kvm_pgtable_walker walker = { + .cb = hyp_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &map_data, + }; + + ret = hyp_map_set_prot_attr(prot, &map_data); + if (ret) + return ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + isb(); + return ret; +} + +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits) +{ + u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); + + pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + if (!pgt->pgd) + return -ENOMEM; + + pgt->ia_bits = va_bits; + pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; + pgt->mmu = NULL; + return 0; +} + +static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + free_page((unsigned long)kvm_pte_follow(*ptep)); + return 0; +} + +void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) +{ + struct kvm_pgtable_walker walker = { + .cb = hyp_free_walker, + .flags = KVM_PGTABLE_WALK_TABLE_POST, + }; + + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + free_page((unsigned long)pgt->pgd); + pgt->pgd = NULL; +} + +struct stage2_map_data { + u64 phys; + kvm_pte_t attr; + + kvm_pte_t *anchor; + + struct kvm_s2_mmu *mmu; + struct kvm_mmu_memory_cache *memcache; +}; + +static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, + struct stage2_map_data *data) +{ + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + kvm_pte_t attr = device ? 
PAGE_S2_MEMATTR(DEVICE_nGnRE) : + PAGE_S2_MEMATTR(NORMAL); + u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + + if (!(prot & KVM_PGTABLE_PROT_X)) + attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + else if (device) + return -EINVAL; + + if (prot & KVM_PGTABLE_PROT_R) + attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + + if (prot & KVM_PGTABLE_PROT_W) + attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + data->attr = attr; + return 0; +} + +static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + u64 granule = kvm_granule_size(level), phys = data->phys; + + if (!kvm_block_mapping_supported(addr, end, phys, level)) + return false; + + if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)) + goto out; + + /* There's an existing valid leaf entry, so perform break-before-make */ + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); + kvm_set_valid_leaf_pte(ptep, phys, data->attr, level); +out: + data->phys += granule; + return true; +} + +static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + if (data->anchor) + return 0; + + if (!kvm_block_mapping_supported(addr, end, data->phys, level)) + return 0; + + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0); + data->anchor = ptep; + return 0; +} + +static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + kvm_pte_t *childp, pte = *ptep; + struct page *page = virt_to_page(ptep); + + if (data->anchor) { + if (kvm_pte_valid(pte)) + put_page(page); + + return 0; + } + + if (stage2_map_walker_try_leaf(addr, end, level, ptep, data)) + goto out_get_page; + + if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + if (!data->memcache) + return -ENOMEM; + + childp = kvm_mmu_memory_cache_alloc(data->memcache); + if (!childp) + return -ENOMEM; + + /* + * If we've run into an existing block mapping then replace it with + * a table. Accesses beyond 'end' that fall within the new table + * will be mapped lazily. + */ + if (kvm_pte_valid(pte)) { + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); + put_page(page); + } + + kvm_set_table_pte(ptep, childp); + +out_get_page: + get_page(page); + return 0; +} + +static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + int ret = 0; + + if (!data->anchor) + return 0; + + free_page((unsigned long)kvm_pte_follow(*ptep)); + put_page(virt_to_page(ptep)); + + if (data->anchor == ptep) { + data->anchor = NULL; + ret = stage2_map_walk_leaf(addr, end, level, ptep, data); + } + + return ret; +} + +/* + * This is a little fiddly, as we use all three of the walk flags. The idea + * is that the TABLE_PRE callback runs for table entries on the way down, + * looking for table entries which we could conceivably replace with a + * block entry for this mapping. If it finds one, then it sets the 'anchor' + * field in 'struct stage2_map_data' to point at the table entry, before + * clearing the entry to zero and descending into the now detached table. + * + * The behaviour of the LEAF callback then depends on whether or not the + * anchor has been set. If not, then we're not using a block mapping higher + * up the table and we perform the mapping at the existing leaves instead. 
+ * If, on the other hand, the anchor _is_ set, then we drop references to + * all valid leaves so that the pages beneath the anchor can be freed. + * + * Finally, the TABLE_POST callback does nothing if the anchor has not + * been set, but otherwise frees the page-table pages while walking back up + * the page-table, installing the block entry when it revisits the anchor + * pointer and clearing the anchor to NULL. + */ +static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + struct stage2_map_data *data = arg; + + switch (flag) { + case KVM_PGTABLE_WALK_TABLE_PRE: + return stage2_map_walk_table_pre(addr, end, level, ptep, data); + case KVM_PGTABLE_WALK_LEAF: + return stage2_map_walk_leaf(addr, end, level, ptep, data); + case KVM_PGTABLE_WALK_TABLE_POST: + return stage2_map_walk_table_post(addr, end, level, ptep, data); + } + + return -EINVAL; +} + +int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + struct kvm_mmu_memory_cache *mc) +{ + int ret; + struct stage2_map_data map_data = { + .phys = ALIGN_DOWN(phys, PAGE_SIZE), + .mmu = pgt->mmu, + .memcache = mc, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_TABLE_PRE | + KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + .arg = &map_data, + }; + + ret = stage2_map_set_prot_attr(prot, &map_data); + if (ret) + return ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; +} + +static void stage2_flush_dcache(void *addr, u64 size) +{ + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return; + + __flush_dcache_area(addr, size); +} + +static bool stage2_pte_cacheable(kvm_pte_t pte) +{ + u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte); + return memattr == PAGE_S2_MEMATTR(NORMAL); +} + +static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct kvm_s2_mmu *mmu = arg; + kvm_pte_t pte = *ptep, *childp = NULL; + bool need_flush = false; + + if (!kvm_pte_valid(pte)) + return 0; + + if (kvm_pte_table(pte, level)) { + childp = kvm_pte_follow(pte); + + if (page_count(virt_to_page(childp)) != 1) + return 0; + } else if (stage2_pte_cacheable(pte)) { + need_flush = true; + } + + /* + * This is similar to the map() path in that we unmap the entire + * block entry and rely on the remaining portions being faulted + * back lazily. 
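The map path described above leans entirely on the walker's three callback points: TABLE_PRE on the way down (where the anchor may be planted), LEAF for non-table entries, and TABLE_POST on the way back up (where the anchor is consumed and the block entry installed). The toy program below demonstrates only that visitation order on a two-level in-memory tree; it models none of the MMU details, and all toy_* names are invented.

    /*
     * toy_walk.c: userspace model of the PRE/LEAF/POST visitor pattern that
     * kvm_pgtable_walk() exposes and stage2_map_walker() builds on. The
     * "page table" here is a tiny two-level tree of tagged entries.
     */
    #include <stdio.h>

    #define TOY_ENTRIES 4

    struct toy_entry {
        int is_table;                 /* 1: points at a child table  */
        int leaf;                     /* leaf payload when !is_table */
        struct toy_entry *child;
    };

    enum toy_visit { TOY_VISIT_TABLE_PRE, TOY_VISIT_LEAF, TOY_VISIT_TABLE_POST };

    typedef void (*toy_visitor_fn)(int level, int idx, enum toy_visit what,
                                   struct toy_entry *e);

    static void toy_walk(struct toy_entry *table, int level, toy_visitor_fn cb)
    {
        for (int idx = 0; idx < TOY_ENTRIES; idx++) {
            struct toy_entry *e = &table[idx];

            if (e->is_table) {
                cb(level, idx, TOY_VISIT_TABLE_PRE, e);   /* on the way down    */
                toy_walk(e->child, level + 1, cb);
                cb(level, idx, TOY_VISIT_TABLE_POST, e);  /* on the way back up */
            } else {
                cb(level, idx, TOY_VISIT_LEAF, e);
            }
        }
    }

    static void toy_print_visitor(int level, int idx, enum toy_visit what,
                                  struct toy_entry *e)
    {
        static const char *name[] = { "TABLE_PRE", "LEAF", "TABLE_POST" };

        printf("level %d idx %d: %s%s\n", level, idx, name[what],
               (what == TOY_VISIT_LEAF && e->leaf) ? " (mapped)" : "");
    }

    int main(void)
    {
        struct toy_entry l1[TOY_ENTRIES] = { { .leaf = 1 } };
        struct toy_entry l0[TOY_ENTRIES] = {
            [0] = { .is_table = 1, .child = l1 },
            [2] = { .leaf = 1 },
        };

        toy_walk(l0, 0, toy_print_visitor);
        return 0;
    }

Running it prints TABLE_PRE for the first level-0 entry, the LEAF visits of its child table, then TABLE_POST for that entry, followed by the remaining level-0 leaves, which is the same order in which stage2_map_walker() receives its callbacks.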
+ */ + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); + put_page(virt_to_page(ptep)); + + if (need_flush) { + stage2_flush_dcache(kvm_pte_follow(pte), + kvm_granule_size(level)); + } + + if (childp) + free_page((unsigned long)childp); + + return 0; +} + +int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_unmap_walker, + .arg = pgt->mmu, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + +struct stage2_attr_data { + kvm_pte_t attr_set; + kvm_pte_t attr_clr; + kvm_pte_t pte; + u32 level; +}; + +static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + struct stage2_attr_data *data = arg; + + if (!kvm_pte_valid(pte)) + return 0; + + data->level = level; + data->pte = pte; + pte &= ~data->attr_clr; + pte |= data->attr_set; + + /* + * We may race with the CPU trying to set the access flag here, + * but worst-case the access flag update gets lost and will be + * set on the next access instead. + */ + if (data->pte != pte) + WRITE_ONCE(*ptep, pte); + + return 0; +} + +static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, + u64 size, kvm_pte_t attr_set, + kvm_pte_t attr_clr, kvm_pte_t *orig_pte, + u32 *level) +{ + int ret; + kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; + struct stage2_attr_data data = { + .attr_set = attr_set & attr_mask, + .attr_clr = attr_clr & attr_mask, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_attr_walker, + .arg = &data, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + if (ret) + return ret; + + if (orig_pte) + *orig_pte = data.pte; + + if (level) + *level = data.level; + return 0; +} + +int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + return stage2_update_leaf_attrs(pgt, addr, size, 0, + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + NULL, NULL); +} + +kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, + &pte, NULL); + dsb(ishst); + return pte; +} + +kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF, + &pte, NULL); + /* + * "But where's the TLBI?!", you scream. + * "Over in the core code", I sigh. + * + * See the '->clear_flush_young()' callback on the KVM mmu notifier. 
+ */ + return pte; +} + +bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL); + return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; +} + +int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot) +{ + int ret; + u32 level; + kvm_pte_t set = 0, clr = 0; + + if (prot & KVM_PGTABLE_PROT_R) + set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + + if (prot & KVM_PGTABLE_PROT_W) + set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + + if (prot & KVM_PGTABLE_PROT_X) + clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level); + if (!ret) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); + return ret; +} + +static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + + if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte)) + return 0; + + stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level)); + return 0; +} + +int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_flush_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return 0; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + +int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) +{ + size_t pgd_sz; + u64 vtcr = kvm->arch.vtcr; + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; + pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO); + if (!pgt->pgd) + return -ENOMEM; + + pgt->ia_bits = ia_bits; + pgt->start_level = start_level; + pgt->mmu = &kvm->arch.mmu; + + /* Ensure zeroed PGD pages are visible to the hardware walker */ + dsb(ishst); + return 0; +} + +static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + + if (!kvm_pte_valid(pte)) + return 0; + + put_page(virt_to_page(ptep)); + + if (kvm_pte_table(pte, level)) + free_page((unsigned long)kvm_pte_follow(pte)); + + return 0; +} + +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + size_t pgd_sz; + struct kvm_pgtable_walker walker = { + .cb = stage2_free_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + }; + + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; + free_pages_exact(pgt->pgd, pgd_sz); + pgt->pgd = NULL; +} diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index ecf67e678203..fe69de16dadc 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -28,6 +28,11 @@ const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; +/* VHE specific context */ +DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); +DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -59,7 +64,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) write_sysreg(val, cpacr_el1); - write_sysreg(kvm_get_hyp_vector(), vbar_el1); + 
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); } NOKPROBE_SYMBOL(__activate_traps); @@ -108,7 +113,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt; u64 exit_code; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; @@ -120,12 +125,12 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * HCR_EL2.TGE. * * We have already configured the guest's stage 1 translation in - * kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm - * before __activate_traps, because __activate_vm configures - * stage 2 translation, and __activate_traps clear HCR_EL2.TGE - * (among other things). + * kvm_vcpu_load_sysregs_vhe above. We must now call + * __load_guest_stage2 before __activate_traps, because + * __load_guest_stage2 configures stage 2 translation, and + * __activate_traps clear HCR_EL2.TGE (among other things). */ - __activate_vm(vcpu->arch.hw_mmu); + __load_guest_stage2(vcpu->arch.hw_mmu); __activate_traps(vcpu); sysreg_restore_guest_state_vhe(guest_ctxt); @@ -133,7 +138,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) do { /* Jump in the fire! */ - exit_code = __guest_enter(vcpu, host_ctxt); + exit_code = __guest_enter(vcpu); /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); @@ -188,10 +193,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) return ret; } -static void __hyp_call_panic(u64 spsr, u64 elr, u64 par, - struct kvm_cpu_context *host_ctxt) +static void __hyp_call_panic(u64 spsr, u64 elr, u64 par) { + struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; vcpu = host_ctxt->__hyp_running_vcpu; __deactivate_traps(vcpu); @@ -204,13 +211,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par, } NOKPROBE_SYMBOL(__hyp_call_panic); -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt) +void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg(par_el1); - __hyp_call_panic(spsr, elr, par, host_ctxt); + __hyp_call_panic(spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 996471e4c138..2a0b8c88d74f 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; __sysreg_save_user_state(host_ctxt); /* @@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; deactivate_traps_vhe_put(); __sysreg_save_el1_state(guest_ctxt); diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index ebfdfc27b2bd..34a96ab244fa 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) /** * kvm_inject_undefined - inject an undefined instruction into the guest + * @vcpu: The vCPU in which to inject the exception * * It is assumed that this code is 
called from the VCPU thread and that the * VCPU therefore is not currently executing guest code. diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3d26b47a1343..19aacc7d64de 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -14,6 +14,7 @@ #include <asm/cacheflush.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> #include <asm/kvm_ras.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> @@ -21,9 +22,7 @@ #include "trace.h" -static pgd_t *boot_hyp_pgd; -static pgd_t *hyp_pgd; -static pgd_t *merged_hyp_pgd; +static struct kvm_pgtable *hyp_pgtable; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); static unsigned long hyp_idmap_start; @@ -32,16 +31,42 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; -#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) -#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) -#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) - -static bool is_iomap(unsigned long flags) +/* + * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, + * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, + * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too + * long will also starve other vCPUs. We have to also make sure that the page + * tables are not freed while we released the lock. + */ +static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end, + int (*fn)(struct kvm_pgtable *, u64, u64), + bool resched) { - return flags & KVM_S2PTE_FLAG_IS_IOMAP; + int ret; + u64 next; + + do { + struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + if (!pgt) + return -EINVAL; + + next = stage2_pgd_addr_end(kvm, addr, end); + ret = fn(pgt, addr, next - addr); + if (ret) + break; + + if (resched && next != end) + cond_resched_lock(&kvm->mmu_lock); + } while (addr = next, addr != end); + + return ret; } +#define stage2_apply_range_resched(kvm, addr, end, fn) \ + stage2_apply_range(kvm, addr, end, fn, true) + static bool memslot_is_logging(struct kvm_memory_slot *memslot) { return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); @@ -58,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); } -static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, - int level) -{ - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level); -} - -/* - * D-Cache management functions. They take the page table entries by - * value, as they are flushing the cache using the kernel mapping (or - * kmap on 32bit). - */ -static void kvm_flush_dcache_pte(pte_t pte) -{ - __kvm_flush_dcache_pte(pte); -} - -static void kvm_flush_dcache_pmd(pmd_t pmd) -{ - __kvm_flush_dcache_pmd(pmd); -} - -static void kvm_flush_dcache_pud(pud_t pud) -{ - __kvm_flush_dcache_pud(pud); -} - static bool kvm_is_device_pfn(unsigned long pfn) { return !pfn_valid(pfn); } -/** - * stage2_dissolve_pmd() - clear and flush huge PMD entry - * @mmu: pointer to mmu structure to operate on - * @addr: IPA - * @pmd: pmd pointer for IPA - * - * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. 
- */ -static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd) -{ - if (!pmd_thp_or_huge(*pmd)) - return; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - put_page(virt_to_page(pmd)); -} - -/** - * stage2_dissolve_pud() - clear and flush huge PUD entry - * @mmu: pointer to mmu structure to operate on - * @addr: IPA - * @pud: pud pointer for IPA - * - * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. - */ -static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp) -{ - struct kvm *kvm = mmu->kvm; - - if (!stage2_pud_huge(kvm, *pudp)) - return; - - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - put_page(virt_to_page(pudp)); -} - -static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); - stage2_pgd_clear(kvm, pgd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_p4d_free(kvm, p4d_table); - put_page(virt_to_page(pgd)); -} - -static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0); - stage2_p4d_clear(kvm, p4d); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_pud_free(kvm, pud_table); - put_page(virt_to_page(p4d)); -} - -static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); - - VM_BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_pmd_free(kvm, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - free_page((unsigned long)pte_table); - put_page(virt_to_page(pmd)); -} - -static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) -{ - WRITE_ONCE(*ptep, new_pte); - dsb(ishst); -} - -static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd) -{ - WRITE_ONCE(*pmdp, new_pmd); - dsb(ishst); -} - -static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) -{ - kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); -} - -static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) -{ - WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); - dsb(ishst); -} - -static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp) -{ - WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp)); - dsb(ishst); -} - -static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) -{ -#ifndef __PAGETABLE_P4D_FOLDED - WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp)); - dsb(ishst); -#endif -} - /* * Unmapping vs dcache management: * @@ -223,120 +105,19 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) * end up writing old data to disk. * * This is why right after unmapping a page/section and invalidating - * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure - * the IO subsystem will never hit in the cache. + * the corresponding TLBs, we flush to make sure the IO subsystem will + * never hit in the cache. * * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as * we then fully enforce cacheability of RAM, no matter what the guest * does. 
*/ -static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t start_addr = addr; - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - pte_t old_pte = *pte; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL); - - /* No need to invalidate the cache for device mappings */ - if (!kvm_is_device_pfn(pte_pfn(old_pte))) - kvm_flush_dcache_pte(old_pte); - - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (stage2_pte_table_empty(mmu->kvm, start_pte)) - clear_stage2_pmd_entry(mmu, pmd, start_addr); -} - -static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - pmd_t old_pmd = *pmd; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - - kvm_flush_dcache_pmd(old_pmd); - - put_page(virt_to_page(pmd)); - } else { - unmap_stage2_ptes(mmu, pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); - - if (stage2_pmd_table_empty(kvm, start_pmd)) - clear_stage2_pud_entry(mmu, pud, start_addr); -} - -static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - pud_t *pud, *start_pud; - - start_pud = pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - pud_t old_pud = *pud; - - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - kvm_flush_dcache_pud(old_pud); - put_page(virt_to_page(pud)); - } else { - unmap_stage2_pmds(mmu, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); - - if (stage2_pud_table_empty(kvm, start_pud)) - clear_stage2_p4d_entry(mmu, p4d, start_addr); -} - -static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - p4d_t *p4d, *start_p4d; - - start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - unmap_stage2_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); - - if (stage2_p4d_table_empty(kvm, start_p4d)) - clear_stage2_pgd_entry(mmu, pgd, start_addr); -} - /** * unmap_stage2_range -- Clear stage2 page table entries to unmap a range - * @kvm: The VM pointer + * @mmu: The KVM stage-2 MMU pointer * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap + * @may_block: Whether or not we are permitted to block * * Clear a range of stage-2 mappings, lowering the various ref-counts. 
Must * be called while holding mmu_lock (unless for freeing the stage2 pgd before @@ -347,32 +128,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 bool may_block) { struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; + phys_addr_t end = start + size; assert_spin_locked(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Make sure the page table is still active, as another thread - * could have possibly freed the page table, while we released - * the lock. - */ - if (!READ_ONCE(mmu->pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_p4ds(mmu, pgd, addr, next); - /* - * If the range is too large, release the kvm->mmu_lock - * to prevent starvation and lockup detector warnings. - */ - if (may_block && next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); + WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap, + may_block)); } static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) @@ -380,89 +141,13 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si __unmap_stage2_range(mmu, start, size, true); } -static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) - kvm_flush_dcache_pte(*pte); - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) - kvm_flush_dcache_pmd(*pmd); - else - stage2_flush_ptes(mmu, pmd, addr, next); - } - } while (pmd++, addr = next, addr != end); -} - -static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) - kvm_flush_dcache_pud(*pud); - else - stage2_flush_pmds(mmu, pud, addr, next); - } - } while (pud++, addr = next, addr != end); -} - -static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d; - phys_addr_t next; - - p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - stage2_flush_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { - struct kvm_s2_mmu *mmu = &kvm->arch.mmu; phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - phys_addr_t next; - pgd_t *pgd; - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - stage2_flush_p4ds(mmu, pgd, addr, next); - - if (next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); + 
stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush); } /** @@ -489,338 +174,28 @@ static void stage2_flush_vm(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } -static void clear_hyp_pgd_entry(pgd_t *pgd) -{ - p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL); - pgd_clear(pgd); - p4d_free(NULL, p4d_table); - put_page(virt_to_page(pgd)); -} - -static void clear_hyp_p4d_entry(p4d_t *p4d) -{ - pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL); - VM_BUG_ON(p4d_huge(*p4d)); - p4d_clear(p4d); - pud_free(NULL, pud_table); - put_page(virt_to_page(p4d)); -} - -static void clear_hyp_pud_entry(pud_t *pud) -{ - pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); - VM_BUG_ON(pud_huge(*pud)); - pud_clear(pud); - pmd_free(NULL, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_hyp_pmd_entry(pmd_t *pmd) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - pte_free_kernel(NULL, pte_table); - put_page(virt_to_page(pmd)); -} - -static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - kvm_set_pte(pte, __pte(0)); - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (hyp_pte_table_empty(start_pte)) - clear_hyp_pmd_entry(pmd); -} - -static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - /* Hyp doesn't use huge pmds */ - if (!pmd_none(*pmd)) - unmap_hyp_ptes(pmd, addr, next); - } while (pmd++, addr = next, addr != end); - - if (hyp_pmd_table_empty(start_pmd)) - clear_hyp_pud_entry(pud); -} - -static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pud_t *pud, *start_pud; - - start_pud = pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - /* Hyp doesn't use huge puds */ - if (!pud_none(*pud)) - unmap_hyp_pmds(pud, addr, next); - } while (pud++, addr = next, addr != end); - - if (hyp_pud_table_empty(start_pud)) - clear_hyp_p4d_entry(p4d); -} - -static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - p4d_t *p4d, *start_p4d; - - start_p4d = p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - /* Hyp doesn't use huge p4ds */ - if (!p4d_none(*p4d)) - unmap_hyp_puds(p4d, addr, next); - } while (p4d++, addr = next, addr != end); - - if (hyp_p4d_table_empty(start_p4d)) - clear_hyp_pgd_entry(pgd); -} - -static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) -{ - return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); -} - -static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, - phys_addr_t start, u64 size) -{ - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; - - /* - * We don't unmap anything from HYP, except at the hyp tear down. - * Hence, we don't have to invalidate the TLBs here. 
- */ - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - do { - next = pgd_addr_end(addr, end); - if (!pgd_none(*pgd)) - unmap_hyp_p4ds(pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); -} - -static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); -} - /** * free_hyp_pgds - free Hyp-mode page tables - * - * Assumes hyp_pgd is a page table used strictly in Hyp-mode and - * therefore contains either mappings in the kernel memory area (above - * PAGE_OFFSET), or device mappings in the idmap range. - * - * boot_hyp_pgd should only map the idmap range, and is only used in - * the extended idmap case. */ void free_hyp_pgds(void) { - pgd_t *id_pgd; - mutex_lock(&kvm_hyp_pgd_mutex); - - id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd; - - if (id_pgd) { - /* In case we never called hyp_mmu_init() */ - if (!io_map_base) - io_map_base = hyp_idmap_start; - unmap_hyp_idmap_range(id_pgd, io_map_base, - hyp_idmap_start + PAGE_SIZE - io_map_base); + if (hyp_pgtable) { + kvm_pgtable_hyp_destroy(hyp_pgtable); + kfree(hyp_pgtable); } - - if (boot_hyp_pgd) { - free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); - boot_hyp_pgd = NULL; - } - - if (hyp_pgd) { - unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), - (uintptr_t)high_memory - PAGE_OFFSET); - - free_pages((unsigned long)hyp_pgd, hyp_pgd_order); - hyp_pgd = NULL; - } - if (merged_hyp_pgd) { - clear_page(merged_hyp_pgd); - free_page((unsigned long)merged_hyp_pgd); - merged_hyp_pgd = NULL; - } - mutex_unlock(&kvm_hyp_pgd_mutex); } -static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pte_t *pte; - unsigned long addr; - - addr = start; - do { - pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); - get_page(virt_to_page(pte)); - pfn++; - } while (addr += PAGE_SIZE, addr != end); -} - -static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pmd_t *pmd; - pte_t *pte; - unsigned long addr, next; - - addr = start; - do { - pmd = pmd_offset(pud, addr); - - BUG_ON(pmd_sect(*pmd)); - - if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL); - if (!pte) { - kvm_err("Cannot allocate Hyp pte\n"); - return -ENOMEM; - } - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - next = pmd_addr_end(addr, end); - - create_hyp_pte_mappings(pmd, addr, next, pfn, prot); - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pud_t *pud; - pmd_t *pmd; - unsigned long addr, next; - int ret; - - addr = start; - do { - pud = pud_offset(p4d, addr); - - if (pud_none_or_clear_bad(pud)) { - pmd = pmd_alloc_one(NULL, addr); - if (!pmd) { - kvm_err("Cannot allocate Hyp pmd\n"); - return -ENOMEM; - } - kvm_pud_populate(pud, pmd); - get_page(virt_to_page(pud)); - } - - next = pud_addr_end(addr, end); - ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); - if (ret) - return ret; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start, - unsigned long end, unsigned long pfn, - 
pgprot_t prot) +static int __create_hyp_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { - p4d_t *p4d; - pud_t *pud; - unsigned long addr, next; - int ret; - - addr = start; - do { - p4d = p4d_offset(pgd, addr); - - if (p4d_none(*p4d)) { - pud = pud_alloc_one(NULL, addr); - if (!pud) { - kvm_err("Cannot allocate Hyp pud\n"); - return -ENOMEM; - } - kvm_p4d_populate(p4d, pud); - get_page(virt_to_page(p4d)); - } - - next = p4d_addr_end(addr, end); - ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot); - if (ret) - return ret; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, - unsigned long start, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - pgd_t *pgd; - p4d_t *p4d; - unsigned long addr, next; - int err = 0; + int err; mutex_lock(&kvm_hyp_pgd_mutex); - addr = start & PAGE_MASK; - end = PAGE_ALIGN(end); - do { - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - - if (pgd_none(*pgd)) { - p4d = p4d_alloc_one(NULL, addr); - if (!p4d) { - kvm_err("Cannot allocate Hyp p4d\n"); - err = -ENOMEM; - goto out; - } - kvm_pgd_populate(pgd, p4d); - get_page(virt_to_page(pgd)); - } - - next = pgd_addr_end(addr, end); - err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot); - if (err) - goto out; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); -out: + err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); mutex_unlock(&kvm_hyp_pgd_mutex); + return err; } @@ -845,7 +220,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying * physical pages. */ -int create_hyp_mappings(void *from, void *to, pgprot_t prot) +int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) { phys_addr_t phys_addr; unsigned long virt_addr; @@ -862,9 +237,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) int err; phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); - err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, - virt_addr, virt_addr + PAGE_SIZE, - __phys_to_pfn(phys_addr), + err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, prot); if (err) return err; @@ -874,9 +247,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) } static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, - unsigned long *haddr, pgprot_t prot) + unsigned long *haddr, + enum kvm_pgtable_prot prot) { - pgd_t *pgd = hyp_pgd; unsigned long base; int ret = 0; @@ -908,17 +281,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, if (ret) goto out; - if (__kvm_cpu_uses_extended_idmap()) - pgd = boot_hyp_pgd; - - ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - base, base + size, - __phys_to_pfn(phys_addr), prot); + ret = __create_hyp_mappings(base, size, phys_addr, prot); if (ret) goto out; *haddr = base + offset_in_page(phys_addr); - out: return ret; } @@ -989,47 +356,48 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure * - * Allocates only the stage-2 HW PGD level table(s) of size defined by - * stage2_pgd_size(mmu->kvm). - * + * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. 
*/ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) { - phys_addr_t pgd_phys; - pgd_t *pgd; - int cpu; + int cpu, err; + struct kvm_pgtable *pgt; - if (mmu->pgd != NULL) { + if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } - /* Allocate the HW PGD, making sure that each page gets its own refcount */ - pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); - if (!pgd) + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL); + if (!pgt) return -ENOMEM; - pgd_phys = virt_to_phys(pgd); - if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) - return -EINVAL; + err = kvm_pgtable_stage2_init(pgt, kvm); + if (err) + goto out_free_pgtable; mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { - free_pages_exact(pgd, stage2_pgd_size(kvm)); - return -ENOMEM; + err = -ENOMEM; + goto out_destroy_pgtable; } for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; mmu->kvm = kvm; - mmu->pgd = pgd; - mmu->pgd_phys = pgd_phys; + mmu->pgt = pgt; + mmu->pgd_phys = __pa(pgt->pgd); mmu->vmid.vmid_gen = 0; - return 0; + +out_destroy_pgtable: + kvm_pgtable_stage2_destroy(pgt); +out_free_pgtable: + kfree(pgt); + return err; } static void stage2_unmap_memslot(struct kvm *kvm, @@ -1102,363 +470,21 @@ void stage2_unmap_vm(struct kvm *kvm) void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) { struct kvm *kvm = mmu->kvm; - void *pgd = NULL; + struct kvm_pgtable *pgt = NULL; spin_lock(&kvm->mmu_lock); - if (mmu->pgd) { - unmap_stage2_range(mmu, 0, kvm_phys_size(kvm)); - pgd = READ_ONCE(mmu->pgd); - mmu->pgd = NULL; - } - spin_unlock(&kvm->mmu_lock); - - /* Free the HW pgd, one page at a time */ - if (pgd) { - free_pages_exact(pgd, stage2_pgd_size(kvm)); + pgt = mmu->pgt; + if (pgt) { + mmu->pgd_phys = 0; + mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } -} - -static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - p4d_t *p4d; - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - if (stage2_pgd_none(kvm, *pgd)) { - if (!cache) - return NULL; - p4d = kvm_mmu_memory_cache_alloc(cache); - stage2_pgd_populate(kvm, pgd, p4d); - get_page(virt_to_page(pgd)); - } - - return stage2_p4d_offset(kvm, pgd, addr); -} - -static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d; - pud_t *pud; - - p4d = stage2_get_p4d(mmu, cache, addr); - if (stage2_p4d_none(kvm, *p4d)) { - if (!cache) - return NULL; - pud = kvm_mmu_memory_cache_alloc(cache); - stage2_p4d_populate(kvm, p4d, pud); - get_page(virt_to_page(p4d)); - } - - return stage2_pud_offset(kvm, p4d, addr); -} - -static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - pmd_t *pmd; - - pud = stage2_get_pud(mmu, cache, addr); - if (!pud || stage2_pud_huge(kvm, *pud)) - return NULL; - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return NULL; - pmd = kvm_mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - return stage2_pmd_offset(kvm, pud, addr); -} - -static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pmd_t *new_pmd) -{ - pmd_t *pmd, old_pmd; - -retry: - pmd = stage2_get_pmd(mmu, cache, addr); - VM_BUG_ON(!pmd); - - old_pmd = *pmd; - /* - * Multiple vcpus 
faulting on the same PMD entry, can - * lead to them sequentially updating the PMD with the - * same value. Following the break-before-make - * (pmd_clear() followed by tlb_flush()) process can - * hinder forward progress due to refaults generated - * on missing translations. - * - * Skip updating the page table if the entry is - * unchanged. - */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - - if (pmd_present(old_pmd)) { - /* - * If we already have PTE level mapping for this block, - * we must unmap it to avoid inconsistent TLB state and - * leaking the table page. We could end up in this situation - * if the memory slot was marked for dirty logging and was - * reverted, leaving PTE level mappings for the pages accessed - * during the period. So, unmap the PTE level mapping for this - * block and retry, as we could have released the upper level - * table in the process. - * - * Normal THP split/merge follows mmu_notifier callbacks and do - * get handled accordingly. - */ - if (!pmd_thp_or_huge(old_pmd)) { - unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE); - goto retry; - } - /* - * Mapping in huge pages should only happen through a - * fault. If a page is merged into a transparent huge - * page, the individual subpages of that huge page - * should be unmapped through MMU notifiers before we - * get here. - * - * Merging of CompoundPages is not supported; they - * should become splitting first, unmapped, merged, - * and mapped back in on-demand. - */ - WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - } else { - get_page(virt_to_page(pmd)); - } - - kvm_set_pmd(pmd, *new_pmd); - return 0; -} - -static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pud_t *new_pudp) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pudp, old_pud; - -retry: - pudp = stage2_get_pud(mmu, cache, addr); - VM_BUG_ON(!pudp); - - old_pud = *pudp; - - /* - * A large number of vcpus faulting on the same stage 2 entry, - * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). - * Skip updating the page tables if there is no change. - */ - if (pud_val(old_pud) == pud_val(*new_pudp)) - return 0; - - if (stage2_pud_present(kvm, old_pud)) { - /* - * If we already have table level mapping for this block, unmap - * the range for this block and retry. - */ - if (!stage2_pud_huge(kvm, old_pud)) { - unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE); - goto retry; - } - - WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - } else { - get_page(virt_to_page(pudp)); - } - - kvm_set_pud(pudp, *new_pudp); - return 0; -} - -/* - * stage2_get_leaf_entry - walk the stage2 VM page tables and return - * true if a valid and present leaf-entry is found. A pointer to the - * leaf-entry is returned in the appropriate level variable - pudpp, - * pmdpp, ptepp. 
- */ -static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr, - pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - *pudpp = NULL; - *pmdpp = NULL; - *ptepp = NULL; - - pudp = stage2_get_pud(mmu, NULL, addr); - if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) - return false; - - if (stage2_pud_huge(kvm, *pudp)) { - *pudpp = pudp; - return true; - } - - pmdp = stage2_pmd_offset(kvm, pudp, addr); - if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) - return false; - - if (pmd_thp_or_huge(*pmdp)) { - *pmdpp = pmdp; - return true; - } - - ptep = pte_offset_kernel(pmdp, addr); - if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) - return false; - - *ptepp = ptep; - return true; -} - -static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz) -{ - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - bool found; - - found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep); - if (!found) - return false; - - if (pudp) - return sz <= PUD_SIZE && kvm_s2pud_exec(pudp); - else if (pmdp) - return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp); - else - return sz == PAGE_SIZE && kvm_s2pte_exec(ptep); -} - -static int stage2_set_pte(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, - unsigned long flags) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, old_pte; - bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; - bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; - - VM_BUG_ON(logging_active && !cache); - - /* Create stage-2 page table mapping - Levels 0 and 1 */ - pud = stage2_get_pud(mmu, cache, addr); - if (!pud) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PUD, then continue - * on to allocate page. - */ - if (logging_active) - stage2_dissolve_pud(mmu, addr, pud); - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pmd = kvm_mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - pmd = stage2_pmd_offset(kvm, pud, addr); - if (!pmd) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PMD, then continue on to - * allocate page. 
- */ - if (logging_active) - stage2_dissolve_pmd(mmu, addr, pmd); - - /* Create stage-2 page mappings - Level 2 */ - if (pmd_none(*pmd)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pte = kvm_mmu_memory_cache_alloc(cache); - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - pte = pte_offset_kernel(pmd, addr); - - if (iomap && pte_present(*pte)) - return -EFAULT; - - /* Create 2nd stage page table mapping - Level 3 */ - old_pte = *pte; - if (pte_present(old_pte)) { - /* Skip page table update if there is no change */ - if (pte_val(old_pte) == pte_val(*new_pte)) - return 0; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL); - } else { - get_page(virt_to_page(pte)); - } - - kvm_set_pte(pte, *new_pte); - return 0; -} + spin_unlock(&kvm->mmu_lock); -#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - if (pte_young(*pte)) { - *pte = pte_mkold(*pte); - return 1; + if (pgt) { + kvm_pgtable_stage2_destroy(pgt); + kfree(pgt); } - return 0; -} -#else -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - return __ptep_test_and_clear_young(pte); -} -#endif - -static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pmd); -} - -static int stage2_pudp_test_and_clear_young(pud_t *pud) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pud); } /** @@ -1468,169 +494,52 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud) * @guest_ipa: The IPA at which to insert the mapping * @pa: The physical address of the device * @size: The size of the mapping + * @writable: Whether or not to create a writable mapping */ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable) { - phys_addr_t addr, end; + phys_addr_t addr; int ret = 0; - unsigned long pfn; struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, }; + struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | + KVM_PGTABLE_PROT_R | + (writable ? 
KVM_PGTABLE_PROT_W : 0); - end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; - pfn = __phys_to_pfn(pa); - - for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); - - if (writable) - pte = kvm_s2pte_mkwrite(pte); + size += offset_in_page(guest_ipa); + guest_ipa &= PAGE_MASK; + for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { ret = kvm_mmu_topup_memory_cache(&cache, kvm_mmu_cache_min_pages(kvm)); if (ret) - goto out; + break; + spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte, - KVM_S2PTE_FLAG_IS_IOMAP); + ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, + &cache); spin_unlock(&kvm->mmu_lock); if (ret) - goto out; + break; - pfn++; + pa += PAGE_SIZE; } -out: kvm_mmu_free_memory_cache(&cache); return ret; } /** - * stage2_wp_ptes - write protect PMD range - * @pmd: pointer to pmd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - if (!kvm_s2pte_readonly(pte)) - kvm_set_s2pte_readonly(pte); - } - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -/** - * stage2_wp_pmds - write protect PUD range - * kvm: kvm instance for the VM - * @pud: pointer to pud entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - if (!kvm_s2pmd_readonly(pmd)) - kvm_set_s2pmd_readonly(pmd); - } else { - stage2_wp_ptes(pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); -} - -/** - * stage2_wp_puds - write protect P4D range - * @p4d: pointer to p4d entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - if (!kvm_s2pud_readonly(pud)) - kvm_set_s2pud_readonly(pud); - } else { - stage2_wp_pmds(mmu, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); -} - -/** - * stage2_wp_p4ds - write protect PGD range - * @pgd: pointer to pgd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d; - phys_addr_t next; - - p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - stage2_wp_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -/** * stage2_wp_range() - write protect stage2 memory region range - * @kvm: The KVM pointer + * @mmu: The KVM stage-2 MMU pointer * @addr: Start address of range * @end: End address of range */ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - phys_addr_t next; - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Release 
kvm_mmu_lock periodically if the memory region is - * large. Otherwise, we may see kernel panics with - * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, - * CONFIG_LOCKDEP. Additionally, holding the lock too long - * will also starve other vCPUs. We have to also make sure - * that the page tables are not freed while we released - * the lock. - */ - cond_resched_lock(&kvm->mmu_lock); - if (!READ_ONCE(mmu->pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_p4ds(mmu, pgd, addr, next); - } while (pgd++, addr = next, addr != end); + stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect); } /** @@ -1833,20 +742,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { - int ret; + int ret = 0; bool write_fault, writable, force_pte = false; - bool exec_fault, needs_exec; + bool exec_fault; + bool device = false; unsigned long mmu_seq; - gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; short vma_shift; + gfn_t gfn; kvm_pfn_t pfn; - pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); - unsigned long vma_pagesize, flags = 0; - struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + unsigned long vma_pagesize; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); @@ -1871,31 +781,41 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, else vma_shift = PAGE_SHIFT; - vma_pagesize = 1ULL << vma_shift; if (logging_active || - (vma->vm_flags & VM_PFNMAP) || - !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { + (vma->vm_flags & VM_PFNMAP)) { force_pte = true; - vma_pagesize = PAGE_SIZE; vma_shift = PAGE_SHIFT; } - /* - * The stage2 has a minimum of 2 level table (For arm64 see - * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can - * use PMD_SIZE huge mappings (even when the PMD is folded into PGD). - * As for PUD huge maps, we must make sure that we have at least - * 3 levels, i.e, PMD is not folded. - */ - if (vma_pagesize == PMD_SIZE || - (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) - gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; + if (vma_shift == PUD_SHIFT && + !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) + vma_shift = PMD_SHIFT; + + if (vma_shift == PMD_SHIFT && + !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + force_pte = true; + vma_shift = PAGE_SHIFT; + } + + vma_pagesize = 1UL << vma_shift; + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + fault_ipa &= ~(vma_pagesize - 1); + + gfn = fault_ipa >> PAGE_SHIFT; mmap_read_unlock(current->mm); - /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); - if (ret) - return ret; + /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. 
+ */ + if (fault_status != FSC_PERM || (logging_active && write_fault)) { + ret = kvm_mmu_topup_memory_cache(memcache, + kvm_mmu_cache_min_pages(kvm)); + if (ret) + return ret; + } mmu_seq = vcpu->kvm->mmu_notifier_seq; /* @@ -1918,28 +838,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; if (kvm_is_device_pfn(pfn)) { - mem_type = PAGE_S2_DEVICE; - flags |= KVM_S2PTE_FLAG_IS_IOMAP; - } else if (logging_active) { - /* - * Faults on pages in a memslot with logging enabled - * should not be mapped with huge pages (it introduces churn - * and performance degradation), so force a pte mapping. - */ - flags |= KVM_S2_FLAG_LOGGING_ACTIVE; - + device = true; + } else if (logging_active && !write_fault) { /* * Only actually map the page as writable if this was a write * fault. */ - if (!write_fault) - writable = false; + writable = false; } - if (exec_fault && is_iomap(flags)) + if (exec_fault && device) return -ENOEXEC; spin_lock(&kvm->mmu_lock); + pgt = vcpu->arch.hw_mmu->pgt; if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; @@ -1950,67 +862,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PAGE_SIZE && !force_pte) vma_pagesize = transparent_hugepage_adjust(memslot, hva, &pfn, &fault_ipa); - if (writable) + if (writable) { + prot |= KVM_PGTABLE_PROT_W; kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); + } - if (fault_status != FSC_PERM && !is_iomap(flags)) + if (fault_status != FSC_PERM && !device) clean_dcache_guest_page(pfn, vma_pagesize); - if (exec_fault) + if (exec_fault) { + prot |= KVM_PGTABLE_PROT_X; invalidate_icache_guest_page(pfn, vma_pagesize); + } - /* - * If we took an execution fault we have made the - * icache/dcache coherent above and should now let the s2 - * mapping be executable. - * - * Write faults (!exec_fault && FSC_PERM) are orthogonal to - * execute permissions, and we preserve whatever we have. - */ - needs_exec = exec_fault || - (fault_status == FSC_PERM && - stage2_is_exec(mmu, fault_ipa, vma_pagesize)); - - /* - * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and - * all we have is a 2-level page table. Trying to map a PUD in - * this case would be fatally wrong. 
- */ - if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) { - pud_t new_pud = kvm_pfn_pud(pfn, mem_type); - - new_pud = kvm_pud_mkhuge(new_pud); - if (writable) - new_pud = kvm_s2pud_mkwrite(new_pud); - - if (needs_exec) - new_pud = kvm_s2pud_mkexec(new_pud); - - ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud); - } else if (vma_pagesize == PMD_SIZE) { - pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); - - new_pmd = kvm_pmd_mkhuge(new_pmd); - - if (writable) - new_pmd = kvm_s2pmd_mkwrite(new_pmd); - - if (needs_exec) - new_pmd = kvm_s2pmd_mkexec(new_pmd); + if (device) + prot |= KVM_PGTABLE_PROT_DEVICE; + else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) + prot |= KVM_PGTABLE_PROT_X; - ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd); + if (fault_status == FSC_PERM && !(logging_active && writable)) { + ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); } else { - pte_t new_pte = kvm_pfn_pte(pfn, mem_type); - - if (writable) { - new_pte = kvm_s2pte_mkwrite(new_pte); - mark_page_dirty(kvm, gfn); - } - - if (needs_exec) - new_pte = kvm_s2pte_mkexec(new_pte); - - ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags); + ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, + __pfn_to_phys(pfn), prot, + memcache); } out_unlock: @@ -2020,46 +896,23 @@ out_unlock: return ret; } -/* - * Resolve the access fault by making the page young again. - * Note that because the faulting entry is guaranteed not to be - * cached in the TLB, we don't need to invalidate anything. - * Only the HW Access Flag updates are supported for Stage 2 (no DBM), - * so there is no need for atomic (pte|pmd)_mkyoung operations. - */ +/* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - kvm_pfn_t pfn; - bool pfn_valid = false; + pte_t pte; + kvm_pte_t kpte; + struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); spin_lock(&vcpu->kvm->mmu_lock); - - if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte)) - goto out; - - if (pud) { /* HugeTLB */ - *pud = kvm_s2pud_mkyoung(*pud); - pfn = kvm_pud_pfn(*pud); - pfn_valid = true; - } else if (pmd) { /* THP, HugeTLB */ - *pmd = pmd_mkyoung(*pmd); - pfn = pmd_pfn(*pmd); - pfn_valid = true; - } else { - *pte = pte_mkyoung(*pte); /* Just a page... 
*/ - pfn = pte_pfn(*pte); - pfn_valid = true; - } - -out: + mmu = vcpu->arch.hw_mmu; + kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); spin_unlock(&vcpu->kvm->mmu_lock); - if (pfn_valid) - kvm_set_pfn_accessed(pfn); + + pte = __pte(kpte); + if (pte_valid(pte)) + kvm_set_pfn_accessed(pte_pfn(pte)); } /** @@ -2230,7 +1083,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_unmap_hva_range(start, end); @@ -2240,28 +1093,27 @@ int kvm_unmap_hva_range(struct kvm *kvm, static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pte_t *pte = (pte_t *)data; + kvm_pfn_t *pfn = (kvm_pfn_t *)data; WARN_ON(size != PAGE_SIZE); + /* - * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE - * flag clear because MMU notifiers will have unmapped a huge PMD before - * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and - * therefore stage2_set_pte() never needs to clear out a huge PMD - * through this calling path. + * The MMU notifiers will have unmapped a huge PMD before calling + * ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore we never need to clear out a huge PMD through this + * calling path and a memcache is not required. */ - stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0); + kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE, + __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL); return 0; } - int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; kvm_pfn_t pfn = pte_pfn(pte); - pte_t stage2_pte; - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_set_spte_hva(hva); @@ -2271,51 +1123,30 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) * just like a translation fault and clean the cache to the PoC. 
*/ clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); - + handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn); return 0; } static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pte_t pte; + kvm_pte_t kpte; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return stage2_pudp_test_and_clear_young(pud); - else if (pmd) - return stage2_pmdp_test_and_clear_young(pmd); - else - return stage2_ptep_test_and_clear_young(pte); + kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa); + pte = __pte(kpte); + return pte_valid(pte) && pte_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return kvm_s2pud_young(*pud); - else if (pmd) - return pmd_young(*pmd); - else - return pte_young(*pte); + return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa); } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_age_hva(start, end); return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); @@ -2323,24 +1154,16 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_test_age_hva(hva); return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, kvm_test_age_hva_handler, NULL); } -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); -} - phys_addr_t kvm_mmu_get_httbr(void) { - if (__kvm_cpu_uses_extended_idmap()) - return virt_to_phys(merged_hyp_pgd); - else - return virt_to_phys(hyp_pgd); + return __pa(hyp_pgtable->pgd); } phys_addr_t kvm_get_idmap_vector(void) @@ -2348,15 +1171,11 @@ phys_addr_t kvm_get_idmap_vector(void) return hyp_idmap_vector; } -static int kvm_map_idmap_text(pgd_t *pgd) +static int kvm_map_idmap_text(void) { - int err; - - /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - hyp_idmap_start, hyp_idmap_end, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP_EXEC); + unsigned long size = hyp_idmap_end - hyp_idmap_start; + int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, + PAGE_HYP_EXEC); if (err) kvm_err("Failed to idmap %lx-%lx\n", hyp_idmap_start, hyp_idmap_end); @@ -2367,6 +1186,7 @@ static int kvm_map_idmap_text(pgd_t *pgd) int kvm_mmu_init(void) { int err; + u32 hyp_va_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); @@ -2380,6 +1200,8 @@ int kvm_mmu_init(void) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); + hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); kvm_debug("HYP VA range: %lx:%lx\n", kern_hyp_va(PAGE_OFFSET), @@ -2397,43 +1219,30 @@ int kvm_mmu_init(void) goto out; } - hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | 
__GFP_ZERO, hyp_pgd_order); - if (!hyp_pgd) { - kvm_err("Hyp mode PGD not allocated\n"); + hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL); + if (!hyp_pgtable) { + kvm_err("Hyp mode page-table not allocated\n"); err = -ENOMEM; goto out; } - if (__kvm_cpu_uses_extended_idmap()) { - boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - hyp_pgd_order); - if (!boot_hyp_pgd) { - kvm_err("Hyp boot PGD not allocated\n"); - err = -ENOMEM; - goto out; - } - - err = kvm_map_idmap_text(boot_hyp_pgd); - if (err) - goto out; + err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits); + if (err) + goto out_free_pgtable; - merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - if (!merged_hyp_pgd) { - kvm_err("Failed to allocate extra HYP pgd\n"); - goto out; - } - __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, - hyp_idmap_start); - } else { - err = kvm_map_idmap_text(hyp_pgd); - if (err) - goto out; - } + err = kvm_map_idmap_text(); + if (err) + goto out_destroy_pgtable; io_map_base = hyp_idmap_start; return 0; + +out_destroy_pgtable: + kvm_pgtable_hyp_destroy(hyp_pgtable); +out_free_pgtable: + kfree(hyp_pgtable); + hyp_pgtable = NULL; out: - free_hyp_pgds(); return err; } @@ -2537,7 +1346,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, spin_lock(&kvm->mmu_lock); if (ret) unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size); - else + else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) stage2_flush_memslot(kvm, memslot); spin_unlock(&kvm->mmu_lock); out: diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 81916e360b1e..2ed5ef8f274b 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); #define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 +static u32 kvm_pmu_event_mask(struct kvm *kvm) +{ + switch (kvm->arch.pmuver) { + case 1: /* ARMv8.0 */ + return GENMASK(9, 0); + case 4: /* ARMv8.1 */ + case 5: /* ARMv8.4 */ + case 6: /* ARMv8.5 */ + return GENMASK(15, 0); + default: /* Shouldn't be here, just for sanity */ + WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver); + return 0; + } +} + /** * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter * @vcpu: The vcpu pointer @@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx) return false; reg = PMEVTYPER0_EL0 + select_idx; - eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT; + eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm); return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN; } @@ -516,7 +531,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) /* PMSWINC only applies to ... SW_INC! 
*/ type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); - type &= ARMV8_PMU_EVTYPE_EVENT; + type &= kvm_pmu_event_mask(vcpu->kvm); if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) continue; @@ -599,11 +614,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) data = __vcpu_sys_reg(vcpu, reg); kvm_pmu_stop_counter(vcpu, pmc); - eventsel = data & ARMV8_PMU_EVTYPE_EVENT; + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) + eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; + else + eventsel = data & kvm_pmu_event_mask(vcpu->kvm); + + /* Software increment event doesn't need to be backed by a perf event */ + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR) + return; - /* Software increment event does't need to be backed by a perf event */ - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && - pmc->idx != ARMV8_PMU_CYCLE_IDX) + /* + * If we have a filter in place and that the event isn't allowed, do + * not install a perf event either. + */ + if (vcpu->kvm->arch.pmu_filter && + !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; memset(&attr, 0, sizeof(struct perf_event_attr)); @@ -615,8 +640,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ - attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ? - ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; + attr.config = eventsel; counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); @@ -700,17 +724,95 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) { - u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK; + u64 reg, mask; + + mask = ARMV8_PMU_EVTYPE_MASK; + mask &= ~ARMV8_PMU_EVTYPE_EVENT; + mask |= kvm_pmu_event_mask(vcpu->kvm); reg = (select_idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx; - __vcpu_sys_reg(vcpu, reg) = event_type; + __vcpu_sys_reg(vcpu, reg) = data & mask; kvm_pmu_update_pmc_chained(vcpu, select_idx); kvm_pmu_create_perf_event(vcpu, select_idx); } +static int kvm_pmu_probe_pmuver(void) +{ + struct perf_event_attr attr = { }; + struct perf_event *event; + struct arm_pmu *pmu; + int pmuver = 0xf; + + /* + * Create a dummy event that only counts user cycles. As we'll never + * leave this function with the event being live, it will never + * count anything. But it allows us to probe some of the PMU + * details. Yes, this is terrible. 
+ */ + attr.type = PERF_TYPE_RAW; + attr.size = sizeof(attr); + attr.pinned = 1; + attr.disabled = 0; + attr.exclude_user = 0; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.exclude_host = 1; + attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; + attr.sample_period = GENMASK(63, 0); + + event = perf_event_create_kernel_counter(&attr, -1, current, + kvm_pmu_perf_overflow, &attr); + + if (IS_ERR(event)) { + pr_err_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return 0xf; + } + + if (event->pmu) { + pmu = to_arm_pmu(event->pmu); + if (pmu->pmuver) + pmuver = pmu->pmuver; + } + + perf_event_disable(event); + perf_event_release_kernel(event); + + return pmuver; +} + +u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) +{ + unsigned long *bmap = vcpu->kvm->arch.pmu_filter; + u64 val, mask = 0; + int base, i; + + if (!pmceid1) { + val = read_sysreg(pmceid0_el0); + base = 0; + } else { + val = read_sysreg(pmceid1_el0); + base = 32; + } + + if (!bmap) + return val; + + for (i = 0; i < 32; i += 8) { + u64 byte; + + byte = bitmap_get_value8(bmap, base + i); + mask |= byte << i; + byte = bitmap_get_value8(bmap, 0x4000 + base + i); + mask |= byte << (32 + i); + } + + return val & mask; +} + bool kvm_arm_support_pmu_v3(void) { /* @@ -756,15 +858,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) { - if (!kvm_arm_support_pmu_v3()) - return -ENODEV; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENXIO; - - if (vcpu->arch.pmu.created) - return -EBUSY; - if (irqchip_in_kernel(vcpu->kvm)) { int ret; @@ -820,6 +913,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { + if (!kvm_arm_support_pmu_v3() || + !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENODEV; + + if (vcpu->arch.pmu.created) + return -EBUSY; + + if (!vcpu->kvm->arch.pmuver) + vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver(); + + if (vcpu->kvm->arch.pmuver == 0xf) + return -ENODEV; + switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; @@ -828,9 +934,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - if (get_user(irq, uaddr)) return -EFAULT; @@ -848,6 +951,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) vcpu->arch.pmu.irq_num = irq; return 0; } + case KVM_ARM_VCPU_PMU_V3_FILTER: { + struct kvm_pmu_event_filter __user *uaddr; + struct kvm_pmu_event_filter filter; + int nr_events; + + nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; + + uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; + + if (copy_from_user(&filter, uaddr, sizeof(filter))) + return -EFAULT; + + if (((u32)filter.base_event + filter.nevents) > nr_events || + (filter.action != KVM_PMU_EVENT_ALLOW && + filter.action != KVM_PMU_EVENT_DENY)) + return -EINVAL; + + mutex_lock(&vcpu->kvm->lock); + + if (!vcpu->kvm->arch.pmu_filter) { + vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL); + if (!vcpu->kvm->arch.pmu_filter) { + mutex_unlock(&vcpu->kvm->lock); + return -ENOMEM; + } + + /* + * The default depends on the first applied filter. + * If it allows events, the default is to deny. + * Conversely, if the first filter denies a set of + * events, the default is to allow. 
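A minimal userspace sketch of how a VMM might program the filter whose default behaviour is described above, issued before KVM_ARM_VCPU_PMU_V3_INIT (the set_attr path returns -EBUSY once the PMU is created). The kvm_pmu_event_filter layout and the KVM_ARM_VCPU_PMU_V3_CTRL attribute group follow the uapi additions made elsewhere in this series; vcpu_fd, the event number 0x11 and the included headers (<sys/ioctl.h>, <err.h>, the kvm uapi headers) are assumptions of the example.

	/*
	 * Allow only CPU_CYCLES (event 0x11); per the default rule above,
	 * the first ALLOW filter implicitly denies every other event.
	 */
	struct kvm_pmu_event_filter filter = {
		.base_event = 0x11,
		.nevents    = 1,
		.action     = KVM_PMU_EVENT_ALLOW,
	};
	struct kvm_device_attr dev_attr = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_FILTER,
		.addr  = (__u64)&filter,
	};

	if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &dev_attr))
		err(1, "KVM_ARM_VCPU_PMU_V3_FILTER");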
+ */ + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events); + else + bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events); + } + + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents); + else + bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents); + + mutex_unlock(&vcpu->kvm->lock); + + return 0; + } case KVM_ARM_VCPU_PMU_V3_INIT: return kvm_arm_pmu_v3_init(vcpu); } @@ -884,6 +1034,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: case KVM_ARM_VCPU_PMU_V3_INIT: + case KVM_ARM_VCPU_PMU_V3_FILTER: if (kvm_arm_support_pmu_v3() && test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) return 0; diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 3c224162b3dd..faf32a44ba04 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -31,9 +31,9 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr) */ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { - struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data); + struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); - if (!kvm_pmu_switch_needed(attr)) + if (!ctx || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -47,7 +47,10 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) */ void kvm_clr_pmu_events(u32 clr) { - struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data); + struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); + + if (!ctx) + return; ctx->pmu_events.events_host &= ~clr; ctx->pmu_events.events_guest &= ~clr; @@ -173,7 +176,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) return; preempt_disable(); - host = this_cpu_ptr(&kvm_host_data); + host = this_cpu_ptr_hyp_sym(kvm_host_data); events_guest = host->pmu_events.events_guest; events_host = host->pmu_events.events_host; @@ -193,7 +196,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) if (!has_vhe()) return; - host = this_cpu_ptr(&kvm_host_data); + host = this_cpu_ptr_hyp_sym(kvm_host_data); events_guest = host->pmu_events.events_guest; events_host = host->pmu_events.events_host; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f6e8b4a75cbb..f32490229a4c 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -335,7 +335,7 @@ u32 get_kvm_ipa_limit(void) int kvm_set_ipa_limit(void) { - unsigned int ipa_max, pa_max, va_max, parange, tgran_2; + unsigned int parange, tgran_2; u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); @@ -372,39 +372,11 @@ int kvm_set_ipa_limit(void) break; } - pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); - - /* Clamp the IPA limit to the PA size supported by the kernel */ - ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max; - /* - * Since our stage2 table is dependent on the stage1 page table code, - * we must always honor the following condition: - * - * Number of levels in Stage1 >= Number of levels in Stage2. - * - * So clamp the ipa limit further down to limit the number of levels. - * Since we can concatenate upto 16 tables at entry level, we could - * go upto 4bits above the maximum VA addressable with the current - * number of levels. - */ - va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; - va_max += 4; - - if (va_max < ipa_max) - ipa_max = va_max; - - /* - * If the final limit is lower than the real physical address - * limit of the CPUs, report the reason. 
- */ - if (ipa_max < pa_max) - pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n", - (va_max < pa_max) ? "Virtual" : "Physical"); - - WARN(ipa_max < KVM_PHYS_SHIFT, - "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); - kvm_ipa_limit = ipa_max; - kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); + kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); + WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, + "KVM IPA Size Limit (%d bits) is smaller than default size\n", + kvm_ipa_limit); + kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); return 0; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9ca270603980..d9117bc56237 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - if (!(p->Op2 & 1)) - pmceid = read_sysreg(pmceid0_el0); - else - pmceid = read_sysreg(pmceid1_el0); + pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1)); p->regval = pmceid; diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index b13a9e3f99dd..f38c40a76251 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v) return 0; } -static const struct seq_operations vgic_debug_seq_ops = { +static const struct seq_operations vgic_debug_sops = { .start = vgic_debug_start, .next = vgic_debug_next, .stop = vgic_debug_stop, .show = vgic_debug_show }; -static int debug_open(struct inode *inode, struct file *file) -{ - int ret; - ret = seq_open(file, &vgic_debug_seq_ops); - if (!ret) { - struct seq_file *seq; - /* seq_open will have modified file->private_data */ - seq = file->private_data; - seq->private = inode->i_private; - } - - return ret; -}; - -static const struct file_operations vgic_debug_fops = { - .owner = THIS_MODULE, - .open = debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release -}; +DEFINE_SEQ_ATTRIBUTE(vgic_debug); void vgic_debug_init(struct kvm *kvm) { diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 76e2d85789ed..9cdf39a94a63 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) if (likely(cpu_if->vgic_sre)) kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if)); + kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) vgic_v3_vmcr_sync(vcpu); - kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if)); + kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 825d337a505a..24f3d0f9996b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -341,7 +341,7 @@ struct kvm_mips_tlb { #define KVM_MIPS_GUEST_TLB_SIZE 64 struct kvm_vcpu_arch { void *guest_ebase; - int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + int (*vcpu_run)(struct kvm_vcpu *vcpu); /* Host registers preserved across guest mode execution */ unsigned long host_stack; @@ -852,7 +852,7 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); 
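The kvm_run argument disappears from the MIPS entry/exit path because struct kvm_run is always reachable as vcpu->run; a tiny sketch of the resulting shape, with a hypothetical handler name and placeholder body, is:

	/*
	 * Sketch of the new convention: kvm_run is no longer threaded through
	 * as a separate argument, the handler re-derives it from the vcpu.
	 */
	static int example_handle_exit(struct kvm_vcpu *vcpu)
	{
		struct kvm_run *run = vcpu->run;	/* was passed in explicitly */

		run->exit_reason = KVM_EXIT_UNKNOWN;	/* placeholder work */
		return RESUME_HOST;
	}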
-extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern int kvm_mips_handle_exit(struct kvm_vcpu *vcpu); /* Building of entry/exception code */ int kvm_mips_entry_setup(void); diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index fd716942e302..832475bf2055 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -205,7 +205,7 @@ static inline void build_set_exc_base(u32 **p, unsigned int reg) * Assemble the start of the vcpu_run function to run a guest VCPU. The function * conforms to the following prototype: * - * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); + * int vcpu_run(struct kvm_vcpu *vcpu); * * The exit from the guest and return to the caller is handled by the code * generated by kvm_mips_build_ret_to_host(). @@ -218,8 +218,7 @@ void *kvm_mips_build_vcpu_run(void *addr) unsigned int i; /* - * A0: run - * A1: vcpu + * A0: vcpu */ /* k0/k1 not being used in host kernel context */ @@ -238,10 +237,10 @@ void *kvm_mips_build_vcpu_run(void *addr) kvm_mips_build_save_scratch(&p, V1, K1); /* VCPU scratch register has pointer to vcpu */ - UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_MTC0(&p, A0, scratch_vcpu[0], scratch_vcpu[1]); /* Offset into vcpu->arch */ - UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); + UASM_i_ADDIU(&p, K1, A0, offsetof(struct kvm_vcpu, arch)); /* * Save the host stack to VCPU, used for exception processing @@ -645,10 +644,7 @@ void *kvm_mips_build_exit(void *addr) /* Now that context has been saved, we can use other registers */ /* Restore vcpu */ - UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); - - /* Restore run (vcpu->run) */ - UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1); + UASM_i_MFC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]); /* * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process @@ -810,7 +806,6 @@ void *kvm_mips_build_exit(void *addr) * with this in the kernel */ uasm_i_move(&p, A0, S0); - uasm_i_move(&p, A1, S1); UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); uasm_i_jalr(&p, RA, T9); UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ); @@ -852,7 +847,7 @@ static void *kvm_mips_build_ret_from_exit(void *addr) * guest, reload k1 */ - uasm_i_move(&p, K1, S1); + uasm_i_move(&p, K1, S0); UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); /* @@ -886,8 +881,8 @@ static void *kvm_mips_build_ret_to_guest(void *addr) { u32 *p = addr; - /* Put the saved pointer to vcpu (s1) back into the scratch register */ - UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); + /* Put the saved pointer to vcpu (s0) back into the scratch register */ + UASM_i_MTC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]); /* Load up the Guest EBASE to minimize the window where BEV is set */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 0c50ac444222..3d6a7f5827b1 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1199,8 +1199,9 @@ static void kvm_mips_set_c0_status(void) /* * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV) */ -int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; u32 cause = vcpu->arch.host_cp0_cause; u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; u32 __user *opc = (u32 __user *) vcpu->arch.pc; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index f8cba51e1054..0788c00d7e94 100644 --- 
a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -1241,7 +1241,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_vcpu *vcpu) */ kvm_mips_suspend_mm(cpu); - r = vcpu->arch.vcpu_run(vcpu->run, vcpu); + r = vcpu->arch.vcpu_run(vcpu); /* We may have migrated while handling guest exits */ cpu = smp_processor_id(); diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index c299e5d6d69c..2ffbe9264a31 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3266,7 +3266,7 @@ static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu) kvm_vz_vcpu_load_tlb(vcpu, cpu); kvm_vz_vcpu_load_wired(vcpu); - r = vcpu->arch.vcpu_run(vcpu->run, vcpu); + r = vcpu->arch.vcpu_run(vcpu); kvm_vz_vcpu_save_wired(vcpu); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 10ded83414de..d67a470e95a3 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -326,6 +326,7 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; + struct kvmppc_xics *xics_device; struct kvmppc_xive *xive; /* Current XIVE device in use */ struct { struct kvmppc_xive *native; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 49db50d1db04..44bf567b6589 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -558,12 +558,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, @@ -879,13 +879,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) #ifdef CONFIG_KVM_XICS /* - * Free the XIVE devices which are not directly freed by the + * Free the XIVE and XICS devices which are not directly freed by the * device 'release' method */ kfree(kvm->arch.xive_devices.native); kvm->arch.xive_devices.native = NULL; kfree(kvm->arch.xive_devices.xics_on_xive); kvm->arch.xive_devices.xics_on_xive = NULL; + kfree(kvm->arch.xics_device); + kvm->arch.xics_device = NULL; #endif /* CONFIG_KVM_XICS */ } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 22a677b18695..bb35490400e9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -347,7 +347,7 @@ static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, return __radix_pte_update(ptep, clr, set); } -void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, +static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, pte_t *ptep, pte_t pte) { radix__set_pte_at(kvm->mm, addr, ptep, pte, 0); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 1a529df0ab44..8da93fdfa59e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -283,7 +283,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *siter; struct mm_struct *mm = kvm->mm; unsigned long npages, size = args->size; - int ret = -ENOMEM; + int ret; if (!args->size || args->page_shift < 12 || args->page_shift > 34 || (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) @@ -489,7 +489,7 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, return ret; } -long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, +static long 
kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, unsigned long entry, unsigned long ua, enum dma_data_direction dir) { diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ac6ac192b8bb..470e7c518a10 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -237,7 +237,7 @@ static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm, return ret; } -extern void iommu_tce_kill_rm(struct iommu_table *tbl, +static void iommu_tce_kill_rm(struct iommu_table *tbl, unsigned long entry, unsigned long pages) { if (tbl->it_ops->tce_kill) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3bd3118c7633..e3b1839fc251 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -111,7 +111,7 @@ module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)"); #ifdef CONFIG_KVM_XICS -static struct kernel_param_ops module_param_ops = { +static const struct kernel_param_ops module_param_ops = { .set = param_set_int, .get = param_get_int, }; @@ -3442,9 +3442,19 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long host_psscr = mfspr(SPRN_PSSCR); unsigned long host_pidr = mfspr(SPRN_PID); + /* + * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0, + * so set HDICE before writing HDEC. + */ + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE); + isync(); + hdec = time_limit - mftb(); - if (hdec < 0) + if (hdec < 0) { + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + isync(); return BOOK3S_INTERRUPT_HV_DECREMENTER; + } mtspr(SPRN_HDEC, hdec); if (vc->tb_offset) { @@ -3565,7 +3575,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, * Virtual-mode guest entry for POWER9 and later when the host and * guest are both using the radix MMU. The LPIDR has already been set. */ -int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, +static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -3579,7 +3589,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, dec = mfspr(SPRN_DEC); tb = mftb(); - if (dec < 512) + if (dec < 0) return BOOK3S_INTERRUPT_HV_DECREMENTER; local_paca->kvm_hstate.dec_expires = dec + tb; if (local_paca->kvm_hstate.dec_expires < time_limit) @@ -5257,6 +5267,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, case KVM_PPC_ALLOCATE_HTAB: { u32 htab_order; + /* If we're a nested hypervisor, we currently only support radix */ + if (kvmhv_on_pseries()) { + r = -EOPNOTSUPP; + break; + } + r = -EFAULT; if (get_user(htab_order, (u32 __user *)argp)) break; diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 59822cba454d..327417d79eac 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -58,13 +58,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* * Put whatever is in the decrementer into the * hypervisor decrementer. + * Because of a hardware deviation in P8 and P9, + * we need to set LPCR[HDICE] before writing HDEC. */ -BEGIN_FTR_SECTION ld r5, HSTATE_KVM_VCORE(r13) ld r6, VCORE_KVM(r5) ld r9, KVM_HOST_LPCR(r6) - andis. r9, r9, LPCR_LD@h -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ori r8, r9, LPCR_HDICE + mtspr SPRN_LPCR, r8 + isync + andis. 
r0, r9, LPCR_LD@h mfspr r8,SPRN_DEC mftb r7 BEGIN_FTR_SECTION diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 6822d23a2da4..33b58549a9aa 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -569,7 +569,7 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) kvmhv_set_nested_ptbl(gp); } -struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) +static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) { struct kvm_nested_guest *gp; long shadow_lpid; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 4d7e5610731a..c2c9c733f359 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -764,7 +764,7 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return ics_rm_eoi(vcpu, irq); } -unsigned long eoi_rc; +static unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) { diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 88fac22fbf09..b1fefa63e125 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -569,7 +569,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) #endif } -void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) +static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) { u32 host_pvr; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 381bf8dea193..5fee5a11550d 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) return -ENXIO; } -static void kvmppc_xics_free(struct kvm_device *dev) +/* + * Called when device fd is closed. kvm->lock is held. + */ +static void kvmppc_xics_release(struct kvm_device *dev) { struct kvmppc_xics *xics = dev->private; int i; struct kvm *kvm = xics->kvm; + struct kvm_vcpu *vcpu; + + pr_devel("Releasing xics device\n"); + + /* + * Since this is the device release function, we know that + * userspace does not have any open fd referring to the + * device. Therefore there can not be any of the device + * attribute set/get functions being executed concurrently, + * and similarly, the connect_vcpu and set/clr_mapped + * functions also cannot be being executed. + */ debugfs_remove(xics->dentry); + /* + * We should clean up the vCPU interrupt presenters first. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + /* + * Take vcpu->mutex to ensure that no one_reg get/set ioctl + * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently. + * Holding the vcpu->mutex also means that execution is + * excluded for the vcpu until the ICP was freed. When the vcpu + * can execute again, vcpu->arch.icp and vcpu->arch.irq_type + * have been cleared and the vcpu will not be going into the + * XICS code anymore. + */ + mutex_lock(&vcpu->mutex); + kvmppc_xics_free_icp(vcpu); + mutex_unlock(&vcpu->mutex); + } + if (kvm) kvm->arch.xics = NULL; - for (i = 0; i <= xics->max_icsid; i++) + for (i = 0; i <= xics->max_icsid; i++) { kfree(xics->ics[i]); - kfree(xics); + xics->ics[i] = NULL; + } + /* + * A reference of the kvmppc_xics pointer is now kept under + * the xics_device pointer of the machine for reuse. It is + * freed when the VM is destroyed for now until we fix all the + * execution paths. 
+ */ kfree(dev); } +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm) +{ + struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device; + struct kvmppc_xics *xics = *kvm_xics_device; + + if (!xics) { + xics = kzalloc(sizeof(*xics), GFP_KERNEL); + *kvm_xics_device = xics; + } else { + memset(xics, 0, sizeof(*xics)); + } + + return xics; +} + static int kvmppc_xics_create(struct kvm_device *dev, u32 type) { struct kvmppc_xics *xics; struct kvm *kvm = dev->kvm; - int ret = 0; - xics = kzalloc(sizeof(*xics), GFP_KERNEL); + pr_devel("Creating xics for partition\n"); + + /* Already there ? */ + if (kvm->arch.xics) + return -EEXIST; + + xics = kvmppc_xics_get_device(kvm); if (!xics) return -ENOMEM; dev->private = xics; xics->dev = dev; xics->kvm = kvm; - - /* Already there ? */ - if (kvm->arch.xics) - ret = -EEXIST; - else - kvm->arch.xics = xics; - - if (ret) { - kfree(xics); - return ret; - } + kvm->arch.xics = xics; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (cpu_has_feature(CPU_FTR_ARCH_206) && @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = { .name = "kvm-xics", .create = kvmppc_xics_create, .init = kvmppc_xics_init, - .destroy = kvmppc_xics_free, + .release = kvmppc_xics_release, .set_attr = xics_set_attr, .get_attr = xics_get_attr, .has_attr = xics_has_attr, @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, return -EPERM; if (xics->kvm != vcpu->kvm) return -EPERM; - if (vcpu->arch.irq_type) + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; r = kvmppc_xics_create_icp(vcpu, xcpu); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index bdea91df1497..d0c2db0e07fa 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1227,17 +1227,7 @@ static int xive_native_debug_show(struct seq_file *m, void *private) return 0; } -static int xive_native_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, xive_native_debug_show, inode->i_private); -} - -static const struct file_operations xive_native_debug_fops = { - .open = xive_native_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(xive_native_debug); static void xive_native_debugfs_init(struct kvmppc_xive *xive) { diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3e1c9f08e302..b1abcb816439 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1747,12 +1747,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -1773,7 +1773,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -ENOTSUPP; + return -EOPNOTSUPP; } void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5303dbc5c9bc..d44858b69353 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -80,13 +80,14 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) -#define 
KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) +#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) +#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -132,7 +133,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 80 +#define KVM_MAX_CPUID_ENTRIES 256 #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 @@ -636,7 +637,7 @@ struct kvm_vcpu_arch { int halt_request; /* real mode on Intel only */ int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + struct kvm_cpuid_entry2 *cpuid_entries; int maxphyaddr; int max_tdp_level; @@ -788,6 +789,21 @@ struct kvm_vcpu_arch { /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; + + /* pv related cpuid info */ + struct { + /* + * value of the eax register in the KVM_CPUID_FEATURES CPUID + * leaf. + */ + u32 features; + + /* + * indicates whether pv emulation should be disabled if features + * are not present in the guest's cpuid + */ + bool enforce; + } pv_cpuid; }; struct kvm_lpage_info { @@ -860,6 +876,13 @@ struct kvm_hv { struct kvm_hv_syndbg hv_syndbg; }; +struct msr_bitmap_range { + u32 flags; + u32 nmsrs; + u32 base; + unsigned long *bitmap; +}; + enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ @@ -961,8 +984,31 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ + u32 user_space_msr_mask; + + struct { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; + } msr_filter; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; + + /* + * Whether the TDP MMU is enabled for this VM. This contains a + * snapshot of the TDP MMU module parameter from when the VM was + * created and remains unchanged for the life of the VM. If this is + * true, TDP MMU handler functions will run for various MMU + * operations. + */ + bool tdp_mmu_enabled; + + /* List of struct tdp_mmu_pages being used as roots */ + struct list_head tdp_mmu_roots; + /* List of struct tdp_mmu_pages not being used as roots */ + struct list_head tdp_mmu_pages; }; struct kvm_vm_stat { @@ -1069,7 +1115,7 @@ struct kvm_x86_ops { void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); + int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1143,7 +1189,12 @@ struct kvm_x86_ops { /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + /* + * Retrieve somewhat arbitrary exit information. 
Intended to be used + * only from within tracepoints to avoid VMREADs when tracing is off. + */ + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *exit_int_info, u32 *exit_int_info_err_code); int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, @@ -1221,12 +1272,13 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); + void (*msr_filter_changed)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { @@ -1238,7 +1290,7 @@ struct kvm_x86_nested_ops { int (*set_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); - bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu); int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, @@ -1612,8 +1664,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); -void kvm_define_shared_msr(unsigned index, u32 msr); -int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index cf13f9e78585..71d630bb5e08 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -3,10 +3,54 @@ #define __SVM_H #include <uapi/asm/svm.h> - +#include <uapi/asm/kvm.h> + +/* + * 32-bit intercept words in the VMCB Control Area, starting + * at Byte offset 000h. 
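The restructuring replaces the three separate intercept fields plus the 64-bit intercept word with a single array of 32-bit words, so an accessor only needs the flat bit number. A sketch of the idea (not a copy of the new svm.h helpers, the function name is made up):

	/*
	 * With all intercept bits numbered contiguously (CR = word 0,
	 * DR = word 1, exceptions = word 2, ...), a flat bit number maps
	 * into intercepts[] as word = nr / 32, bit = nr % 32.
	 * e.g. INTERCEPT_INVPCID = 162 -> intercepts[5], bit 2.
	 */
	static inline void example_set_intercept(struct vmcb_control_area *control,
						 u32 nr)
	{
		WARN_ON_ONCE(nr >= 32 * MAX_INTERCEPT);
		control->intercepts[nr / 32] |= BIT(nr % 32);
	}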
+ */ + +enum intercept_words { + INTERCEPT_CR = 0, + INTERCEPT_DR, + INTERCEPT_EXCEPTION, + INTERCEPT_WORD3, + INTERCEPT_WORD4, + INTERCEPT_WORD5, + MAX_INTERCEPT, +}; enum { - INTERCEPT_INTR, + /* Byte offset 000h (word 0) */ + INTERCEPT_CR0_READ = 0, + INTERCEPT_CR3_READ = 3, + INTERCEPT_CR4_READ = 4, + INTERCEPT_CR8_READ = 8, + INTERCEPT_CR0_WRITE = 16, + INTERCEPT_CR3_WRITE = 16 + 3, + INTERCEPT_CR4_WRITE = 16 + 4, + INTERCEPT_CR8_WRITE = 16 + 8, + /* Byte offset 004h (word 1) */ + INTERCEPT_DR0_READ = 32, + INTERCEPT_DR1_READ, + INTERCEPT_DR2_READ, + INTERCEPT_DR3_READ, + INTERCEPT_DR4_READ, + INTERCEPT_DR5_READ, + INTERCEPT_DR6_READ, + INTERCEPT_DR7_READ, + INTERCEPT_DR0_WRITE = 48, + INTERCEPT_DR1_WRITE, + INTERCEPT_DR2_WRITE, + INTERCEPT_DR3_WRITE, + INTERCEPT_DR4_WRITE, + INTERCEPT_DR5_WRITE, + INTERCEPT_DR6_WRITE, + INTERCEPT_DR7_WRITE, + /* Byte offset 008h (word 2) */ + INTERCEPT_EXCEPTION_OFFSET = 64, + /* Byte offset 00Ch (word 3) */ + INTERCEPT_INTR = 96, INTERCEPT_NMI, INTERCEPT_SMI, INTERCEPT_INIT, @@ -38,7 +82,8 @@ enum { INTERCEPT_TASK_SWITCH, INTERCEPT_FERR_FREEZE, INTERCEPT_SHUTDOWN, - INTERCEPT_VMRUN, + /* Byte offset 010h (word 4) */ + INTERCEPT_VMRUN = 128, INTERCEPT_VMMCALL, INTERCEPT_VMLOAD, INTERCEPT_VMSAVE, @@ -53,15 +98,18 @@ enum { INTERCEPT_MWAIT_COND, INTERCEPT_XSETBV, INTERCEPT_RDPRU, + /* Byte offset 014h (word 5) */ + INTERCEPT_INVLPGB = 160, + INTERCEPT_INVLPGB_ILLEGAL, + INTERCEPT_INVPCID, + INTERCEPT_MCOMMIT, + INTERCEPT_TLBSYNC, }; struct __attribute__ ((__packed__)) vmcb_control_area { - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - u8 reserved_1[40]; + u32 intercepts[MAX_INTERCEPT]; + u32 reserved_1[15 - MAX_INTERCEPT]; u16 pause_filter_thresh; u16 pause_filter_count; u64 iopm_base_pa; @@ -287,32 +335,6 @@ struct vmcb { #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK #define SVM_SELECTOR_CODE_MASK (1 << 3) -#define INTERCEPT_CR0_READ 0 -#define INTERCEPT_CR3_READ 3 -#define INTERCEPT_CR4_READ 4 -#define INTERCEPT_CR8_READ 8 -#define INTERCEPT_CR0_WRITE (16 + 0) -#define INTERCEPT_CR3_WRITE (16 + 3) -#define INTERCEPT_CR4_WRITE (16 + 4) -#define INTERCEPT_CR8_WRITE (16 + 8) - -#define INTERCEPT_DR0_READ 0 -#define INTERCEPT_DR1_READ 1 -#define INTERCEPT_DR2_READ 2 -#define INTERCEPT_DR3_READ 3 -#define INTERCEPT_DR4_READ 4 -#define INTERCEPT_DR5_READ 5 -#define INTERCEPT_DR6_READ 6 -#define INTERCEPT_DR7_READ 7 -#define INTERCEPT_DR0_WRITE (16 + 0) -#define INTERCEPT_DR1_WRITE (16 + 1) -#define INTERCEPT_DR2_WRITE (16 + 2) -#define INTERCEPT_DR3_WRITE (16 + 3) -#define INTERCEPT_DR4_WRITE (16 + 4) -#define INTERCEPT_DR5_WRITE (16 + 5) -#define INTERCEPT_DR6_WRITE (16 + 6) -#define INTERCEPT_DR7_WRITE (16 + 7) - #define SVM_EVTINJ_VEC_MASK 0xff #define SVM_EVTINJ_TYPE_SHIFT 8 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cd7de4b401fe..f8ba5289ecb0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -52,7 +52,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES) #define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT) #define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING) -#define SECONDARY_EXEC_RDTSCP VMCS_CONTROL_BIT(RDTSCP) +#define SECONDARY_EXEC_ENABLE_RDTSCP VMCS_CONTROL_BIT(RDTSCP) #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC) #define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID) #define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING) diff --git 
a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0780f97c1850..89e5f3d1bba8 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -192,6 +192,26 @@ struct kvm_msr_list { __u32 indices[0]; }; +/* Maximum size of any access bitmap in bytes */ +#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600 + +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range { +#define KVM_MSR_FILTER_READ (1 << 0) +#define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ +}; + +#define KVM_MSR_FILTER_MAX_RANGES 16 +struct kvm_msr_filter { +#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; struct kvm_cpuid_entry { __u32 function; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index a7a3403645e5..f1d8307454e0 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -77,6 +77,7 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_RDPRU 0x08e +#define SVM_EXIT_INVPCID 0x0a2 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 @@ -182,6 +183,7 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_INVPCID, "invpcid" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1c0f2560a41c..7f57ede3cb8e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -975,7 +975,7 @@ void arch_haltpoll_disable(unsigned int cpu) if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) return; - /* Enable guest halt poll disables host halt poll */ + /* Disable guest halt poll enables host halt poll */ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); } EXPORT_SYMBOL_GPL(arch_haltpoll_disable); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbd5bd7a945a..f92dfd8ef10d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -66,6 +66,7 @@ config KVM_WERROR default y if X86_64 && !KASAN # We use the dependency on !COMPILE_TEST to not be enabled # blindly in allmodconfig or allyesconfig configurations + depends on KVM depends on (X86_64 && !KASAN) || !COMPILE_TEST depends on EXPERT help diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4a3081e9f4b5..b804444e16d4 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,9 +15,11 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ + mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o -kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o +kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ + vmx/evmcs.o vmx/nested.o vmx/posted_intr.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7456f9ad424b..06a278b3701d 100644 --- a/arch/x86/kvm/cpuid.c 
+++ b/arch/x86/kvm/cpuid.c @@ -54,7 +54,24 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit -static int kvm_check_cpuid(struct kvm_vcpu *vcpu) +static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *e; + int i; + + for (i = 0; i < nent; i++) { + e = &entries[i]; + + if (e->function == function && (e->index == index || + !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) + return e; + } + + return NULL; +} + +static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; @@ -62,7 +79,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, 0); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -107,6 +124,13 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + /* + * save the feature bitmap to avoid cpuid lookup for every PV + * operation + */ + if (best) + vcpu->arch.pv_cpuid.features = best->eax; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (best) @@ -121,8 +145,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; - kvm_x86_ops.vcpu_after_set_cpuid(vcpu); - best = kvm_find_cpuid_entry(vcpu, 1, 0); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) @@ -146,7 +168,9 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - kvm_x86_ops.update_exception_bitmap(vcpu); + + /* Invoke the vendor callback only after the above state is updated. 
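The cached bitmap is what guest_pv_has() (added to cpuid.h further down) consults; a hedged sketch of the intended pattern for a PV MSR handler follows, where the helper name and the choice of KVM_FEATURE_ASYNC_PF are purely illustrative:

	/*
	 * Illustrative only: with pv_cpuid.enforce set, refuse a PV MSR write
	 * when the matching KVM_FEATURE_* bit was not exposed in the guest's
	 * KVM_CPUID_FEATURES leaf. The cached features bitmap makes this a
	 * single test instead of a CPUID-entry walk on every access.
	 */
	static int example_guard_pv_msr(struct kvm_vcpu *vcpu,
					unsigned int kvm_feature)
	{
		if (!guest_pv_has(vcpu, kvm_feature))
			return 1;	/* caller turns this into #GP */

		return 0;
	}

A handler for, say, MSR_KVM_ASYNC_PF_EN could then start with example_guard_pv_msr(vcpu, KVM_FEATURE_ASYNC_PF) before touching any state.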
*/ + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); } static int is_efer_nx(void) @@ -186,7 +210,6 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } -EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, @@ -194,46 +217,53 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry __user *entries) { int r, i; - struct kvm_cpuid_entry *cpuid_entries = NULL; + struct kvm_cpuid_entry *e = NULL; + struct kvm_cpuid_entry2 *e2 = NULL; - r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; + return -E2BIG; + if (cpuid->nent) { - cpuid_entries = vmemdup_user(entries, - array_size(sizeof(struct kvm_cpuid_entry), - cpuid->nent)); - if (IS_ERR(cpuid_entries)) { - r = PTR_ERR(cpuid_entries); - goto out; + e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent)); + if (IS_ERR(e)) + return PTR_ERR(e); + + e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT); + if (!e2) { + r = -ENOMEM; + goto out_free_cpuid; } } for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; + e2[i].function = e[i].function; + e2[i].eax = e[i].eax; + e2[i].ebx = e[i].ebx; + e2[i].ecx = e[i].ecx; + e2[i].edx = e[i].edx; + e2[i].index = 0; + e2[i].flags = 0; + e2[i].padding[0] = 0; + e2[i].padding[1] = 0; + e2[i].padding[2] = 0; } - vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu); + + r = kvm_check_cpuid(e2, cpuid->nent); if (r) { - vcpu->arch.cpuid_nent = 0; - kvfree(cpuid_entries); - goto out; + kvfree(e2); + goto out_free_cpuid; } + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); - kvfree(cpuid_entries); -out: +out_free_cpuid: + kvfree(e); + return r; } @@ -241,26 +271,32 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { + struct kvm_cpuid_entry2 *e2 = NULL; int r; - r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu); + return -E2BIG; + + if (cpuid->nent) { + e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent)); + if (IS_ERR(e2)) + return PTR_ERR(e2); + } + + r = kvm_check_cpuid(e2, cpuid->nent); if (r) { - vcpu->arch.cpuid_nent = 0; - goto out; + kvfree(e2); + return r; } + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); -out: - return r; + + return 0; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, @@ -941,17 +977,8 @@ out_free: struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index) { - struct kvm_cpuid_entry2 *e; - int i; - - for (i = 0; i 
< vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - - if (e->function == function && (e->index == index || - !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) - return e; - } - return NULL; + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, index); } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 3a923ae15f2f..bf8577947ed2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include "x86.h" #include <asm/cpu.h> #include <asm/processor.h> +#include <uapi/asm/kvm_para.h> extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); @@ -34,6 +35,11 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) return vcpu->arch.maxphyaddr; } +static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); +} + struct cpuid_reg { u32 function; u32 index; @@ -308,4 +314,13 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); } +static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, + unsigned int kvm_feature) +{ + if (!vcpu->arch.pv_cpuid.enforce) + return true; + + return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2f6510de6b0c..0d917eb70319 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3606,7 +3606,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt) u64 tsc_aux = 0; if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) - return emulate_gp(ctxt, 0); + return emulate_ud(ctxt); ctxt->dst.val = tsc_aux; return X86EMUL_CONTINUE; } @@ -3701,21 +3701,35 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt) static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) + r = ctxt->ops->set_msr(ctxt, msr_index, msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; + + if (r > 0) return emulate_gp(ctxt, 0); - return X86EMUL_CONTINUE; + return r < 0 ? 
X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; } static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; + + r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; - if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) + if (r) return emulate_gp(ctxt, 0); *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 8c1e8334eff0..5c7c4060b45c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -633,6 +633,11 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && !host) + return 1; trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); @@ -652,6 +657,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && !host) + return 1; + trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cfe83d4ae625..a889563ad02d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 35cca2e0c802..105e7859d1f2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) +{ + kvm_lapic_set_reg(apic, APIC_DFR, val); + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); +} + static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) { return ((id >> 4) << 16) | (1 << (id & 0xf)); @@ -488,6 +494,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) } } +void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) +{ + apic_clear_irr(vec, vcpu->arch.apic); +} +EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); + static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { struct kvm_vcpu *vcpu; @@ -1576,9 +1588,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1593,7 +1602,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { - if (lapic_timer_int_injected(vcpu)) + if 
(lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns && + lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1629,14 +1641,15 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - if (apic->lapic_timer.timer_advance_ns) - __kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } atomic_inc(&apic->lapic_timer.pending); - kvm_set_pending_timer(vcpu); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + if (from_timer_fn) + kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) @@ -1984,10 +1997,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) { - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); - } else + if (!apic_x2apic_mode(apic)) + kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); + else ret = 1; break; @@ -2183,8 +2195,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || - !apic_lvtt_tscdeadline(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -2194,8 +2205,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); @@ -2303,7 +2313,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); - kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU); + kvm_apic_set_dfr(apic, 0xffffffffU); apic_set_spiv(apic, 0xff); kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) @@ -2461,6 +2471,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } +EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 754f29beb83e..4fb86e3a9dd3 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -89,6 +89,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5efc6081ca13..9c4a9c8e43d9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -155,11 +155,6 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } -static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); -} - /* * Check if a given access (described through the I/D, 
W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 71aa3da2a0b7..17587f496ec7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -19,10 +19,12 @@ #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" +#include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "cpuid.h" +#include "spte.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -45,7 +47,6 @@ #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> -#include <asm/e820/api.h> #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> @@ -64,12 +65,12 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); -static struct kernel_param_ops nx_huge_pages_ops = { +static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { +static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { .set = set_nx_huge_pages_recovery_ratio, .get = param_get_uint, }; @@ -104,45 +105,13 @@ enum { AUDIT_POST_SYNC }; -#undef MMU_DEBUG - #ifdef MMU_DEBUG -static bool dbg = 0; +bool dbg = 0; module_param(dbg, bool, 0644); - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) -#else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 - -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. 
- */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - #define PT32_LEVEL_BITS 10 #define PT32_LEVEL_SHIFT(level) \ @@ -156,18 +125,6 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) -#else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#endif -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) - #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) @@ -175,42 +132,11 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask | shadow_me_mask) - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -/* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull - #include <trace/events/kvm.h> -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) - -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 -/* - * Return values of handle_mmio_page_fault and mmu.page_fault: - * RET_PF_RETRY: let CPU fault again on the address. - * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. - * - * For handle_mmio_page_fault only: - * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. - */ -enum { - RET_PF_RETRY = 0, - RET_PF_EMULATE = 1, - RET_PF_INVALID = 2, -}; - struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -242,65 +168,10 @@ struct kvm_shadow_walk_iterator { __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; +struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; -static u64 __read_mostly shadow_nx_mask; -static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ -static u64 __read_mostly shadow_user_mask; -static u64 __read_mostly shadow_accessed_mask; -static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_value; -static u64 __read_mostly shadow_mmio_access_mask; -static u64 __read_mostly shadow_present_mask; -static u64 __read_mostly shadow_me_mask; - -/* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; - * shadow_acc_track_mask is the set of bits to be cleared in non-accessed - * pages. 
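The SPTE_SPECIAL_MASK family removed above encodes four mutually exclusive states in software-available SPTE bits 53:52; the helpers that interpret them (is_mmio_spte(), spte_ad_enabled(), spte_ad_need_write_protect()) are deleted from mmu.c a little further down. A stand-alone decoder of just that two-bit field, mirroring those helpers:

#include <stdio.h>
#include <stdint.h>

#define SPTE_SPECIAL_MASK	 (3ULL << 52)
#define SPTE_AD_ENABLED_MASK	 (0ULL << 52)
#define SPTE_AD_DISABLED_MASK	 (1ULL << 52)
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
#define SPTE_MMIO_MASK		 (3ULL << 52)

static const char *decode_special_bits(uint64_t spte)
{
	switch (spte & SPTE_SPECIAL_MASK) {
	case SPTE_AD_ENABLED_MASK:	return "A/D bits in use";
	case SPTE_AD_DISABLED_MASK:	return "A/D disabled (access-tracking SPTE)";
	case SPTE_AD_WRPROT_ONLY_MASK:	return "A/D in use, dirty tracking via write protection";
	case SPTE_MMIO_MASK:		return "MMIO SPTE";
	}
	return "unreachable";
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		printf("bits 53:52 = %d -> %s\n", i,
		       decode_special_bits((uint64_t)i << 52));
	return 0;
}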
- */ -static u64 __read_mostly shadow_acc_track_mask; - -/* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. - */ -static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | - PT64_EPT_EXECUTABLE_MASK; -static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; - -/* - * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order - * to guard against L1TF attacks. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; - -/* - * The number of high-order 1 bits to use in the mask above. - */ -static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; - -/* - * In some cases, we need to preserve the GFN of a non-present or reserved - * SPTE when we usurp the upper five bits of the physical address space to - * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll - * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask - * left into the reserved bits, i.e. the GFN in the SPTE will be split into - * high and low parts. This mask covers the lower bits of the GFN. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; - -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -static u8 __read_mostly shadow_phys_bits; - static void mmu_spte_set(u64 *sptep, u64 spte); -static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -325,7 +196,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } -static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages) { struct kvm_tlb_range range; @@ -336,143 +207,17 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) -{ - BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); - WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - shadow_mmio_access_mask = access_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); - -static bool is_mmio_spte(u64 spte) -{ - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; -} - -static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) -{ - return sp->role.ad_disabled; -} - -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) -{ - /* - * When using the EPT page-modification log, the GPAs in the log - * would come from L2 rather than L1. Therefore, we need to rely - * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. 
- */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu; -} - -static inline bool spte_ad_enabled(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; -} - -static inline bool spte_ad_need_write_protect(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; -} - -static bool is_nx_huge_page_enabled(void) +bool is_nx_huge_page_enabled(void) { return READ_ONCE(nx_huge_pages); } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - -static inline bool is_access_track_spte(u64 spte) -{ - return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; -} - -/* - * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of - * the memslots generation and is derived as follows: - * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 - * - * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in - * the MMIO generation number, as doing so would require stealing a bit from - * the "real" generation number and thus effectively halve the maximum number - * of MMIO generations that can be handled before encountering a wrap (which - * requires a full MMU zap). The flag is instead explicitly queried when - * checking for MMIO spte cache hits. - */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) - -#define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 -#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ - MMIO_SPTE_GEN_LOW_START) - -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT -#define MMIO_SPTE_GEN_HIGH_END 62 -#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ - MMIO_SPTE_GEN_HIGH_START) - -static u64 generation_mmio_spte_mask(u64 gen) -{ - u64 mask; - - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); - - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; - mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; - return mask; -} - -static u64 get_mmio_spte_generation(u64 spte) -{ - u64 gen; - - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; - gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen; -} - -static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) -{ - - u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); - u64 gpa = gfn << PAGE_SHIFT; - - access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) - << shadow_nonpresent_or_rsvd_mask_len; - - return mask; -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 mask = make_mmio_spte(vcpu, gfn, access); - unsigned int gen = get_mmio_spte_generation(mask); - - access = mask & ACC_ALL; - trace_mark_mmio_spte(sptep, gfn, access, gen); + trace_mark_mmio_spte(sptep, gfn, mask); mmu_spte_set(sptep, mask); } @@ -521,7 +266,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception 
*exception) { /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { exception->error_code |= PFERR_RSVD_MASK; return UNMAPPED_GVA; } @@ -529,90 +274,6 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, return gpa; } -/* - * Sets the shadow PTE masks used by the MMU. - * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) -{ - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); - -static u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} - -static void kvm_mmu_reset_all_pte_masks(void) -{ - u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; - - shadow_phys_bits = kvm_get_shadow_phys_bits(); - - /* - * If the CPU has 46 or less physical address bits, then set an - * appropriate mask to guard against L1TF attacks. Otherwise, it is - * assumed that the CPU is not vulnerable to L1TF. - * - * Some Intel CPUs address the L1 cache using more PA bits than are - * reported by CPUID. Use the PA width of the L1 cache when possible - * to achieve more effective mitigation, e.g. if system RAM overlaps - * the most significant bits of legal physical address space. 
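kvm_get_shadow_phys_bits(), removed from mmu.c above, intentionally re-reads CPUID leaf 0x80000008 instead of trusting boot_cpu_data.x86_phys_bits, since the latter is shrunk when MKTME/SME repurpose upper address bits as key IDs. A user-space approximation of that query using GCC/Clang's <cpuid.h> (illustrative only, x86-specific, not the kernel helper itself):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int phys_bits = 36;	/* conservative fallback if the leaf is absent */

	/* CPUID.80000008H:EAX[7:0] reports MAXPHYADDR. */
	if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		phys_bits = eax & 0xff;

	printf("MAXPHYADDR: %u bits\n", phys_bits);
	return 0;
}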
- */ - shadow_nonpresent_or_rsvd_mask = 0; - low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_has_bug(X86_BUG_L1TF) && - !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= - 52 - shadow_nonpresent_or_rsvd_mask_len)) { - low_phys_bits = boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len; - shadow_nonpresent_or_rsvd_mask = - rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); - } - - shadow_nonpresent_or_rsvd_lower_gfn_mask = - GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); -} - static int is_cpuid_PSE36(void) { return 1; @@ -623,35 +284,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.efer & EFER_NX; } -static int is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte); -} - -static int is_large_pte(u64 pte) -{ - return pte & PT_PAGE_SIZE_MASK; -} - -static int is_last_spte(u64 pte, int level) -{ - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; -} - -static bool is_executable_pte(u64 spte) -{ - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; -} - -static kvm_pfn_t spte_to_pfn(u64 pte) -{ - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -796,12 +428,6 @@ retry: } #endif -static bool spte_can_locklessly_be_made_writable(u64 spte) -{ - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); -} - static bool spte_has_volatile_bits(u64 spte) { if (!is_shadow_present_pte(spte)) @@ -826,21 +452,6 @@ static bool spte_has_volatile_bits(u64 spte) return false; } -static bool is_accessed_spte(u64 spte) -{ - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -976,34 +587,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -static u64 mark_spte_for_access_track(u64 spte) -{ - if (spte_ad_enabled(spte)) - return spte & ~shadow_accessed_mask; - - if (is_access_track_spte(spte)) - return spte; - - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. 
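To make the L1TF mask setup above concrete: with shadow_nonpresent_or_rsvd_mask_len fixed at 5, an affected CPU whose L1D is indexed by, say, 46 physical-address bits (an assumed example value, not taken from the patch) ends up reserving bits 45:41 for the mitigation and bits 40:12 for the low half of the GFN. A stand-alone sketch of the same arithmetic:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

/* Bits l..h set, as GENMASK_ULL() does in include/linux/bits.h. */
static uint64_t genmask_ull(unsigned int h, unsigned int l)
{
	return (~0ULL << l) & (~0ULL >> (63 - h));
}

int main(void)
{
	int cache_bits = 46;	/* hypothetical boot_cpu_data.x86_cache_bits */
	int rsvd_len = 5;	/* shadow_nonpresent_or_rsvd_mask_len */
	int low_phys_bits = cache_bits - rsvd_len;

	uint64_t nonpresent_or_rsvd_mask = genmask_ull(cache_bits - 1, low_phys_bits);
	uint64_t lower_gfn_mask = genmask_ull(low_phys_bits - 1, PAGE_SHIFT);

	printf("nonpresent_or_rsvd mask: %#018llx (bits %d:%d)\n",
	       (unsigned long long)nonpresent_or_rsvd_mask, cache_bits - 1, low_phys_bits);
	printf("lower GFN mask:          %#018llx (bits %d:%d)\n",
	       (unsigned long long)lower_gfn_mask, low_phys_bits - 1, PAGE_SHIFT);
	return 0;
}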
- */ - WARN_ONCE((spte & PT_WRITABLE_MASK) && - !spte_can_locklessly_be_made_writable(spte), - "kvm: Writable SPTE is not locklessly dirty-trackable\n"); - - WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift), - "kvm: Access Tracking saved bit locations are not zero\n"); - - spte |= (spte & shadow_acc_track_saved_bits_mask) << - shadow_acc_track_saved_bits_shift; - spte &= ~shadow_acc_track_mask; - - return spte; -} - /* Restore an acc-track PTE back to a regular PTE */ static u64 restore_acc_track_spte(u64 spte) { @@ -1193,7 +776,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } -static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->lpage_disallowed) return; @@ -1221,7 +804,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { --kvm->stat.nx_lpage_splits; sp->lpage_disallowed = false; @@ -1640,6 +1223,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, true); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1666,6 +1252,9 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, false); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1710,6 +1299,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } + if (kvm->arch.tdp_mmu_enabled) + write_protected |= + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + return write_protected; } @@ -1769,13 +1362,8 @@ restart: pte_list_remove(rmap_head, sptep); goto restart; } else { - new_spte = *sptep & ~PT64_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; - - new_spte = mark_spte_for_access_track(new_spte); + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + *sptep, new_pfn); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1919,12 +1507,26 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + int r; + + r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + + return r; } int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + int r; + + r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + + return r; } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1973,12 +1575,24 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int 
kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + int young = false; + + young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + + return young; } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + int young = false; + + young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + + return young; } #ifdef MMU_DEBUG @@ -2577,13 +2191,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; - - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else - spte |= shadow_accessed_mask; + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); @@ -2615,8 +2223,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *spte) +/* Returns the number of zapped non-leaf child shadow pages. */ +static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; @@ -2630,23 +2239,34 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); - } - return true; - } - if (is_mmio_spte(pte)) + /* + * Recursively zap nested TDP SPs, parentless SPs are + * unlikely to be used again in the near future. This + * avoids retaining a large number of stale nested SPs. + */ + if (tdp_enabled && invalid_list && + child->role.guest_mode && !child->parent_ptes.val) + return kvm_mmu_prepare_zap_page(kvm, child, + invalid_list); + } + } else if (is_mmio_spte(pte)) { mmu_spte_clear_no_track(spte); - - return false; + } + return 0; } -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) +static int kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list) { + int zapped = 0; unsigned i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - mmu_page_zap_pte(kvm, sp, sp->spt + i); + zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); + + return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -2692,7 +2312,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); - kvm_mmu_page_unlink_children(kvm, sp); + *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. 
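kvm_unmap_hva_range(), kvm_set_spte_hva(), kvm_age_hva() and kvm_test_age_hva() above all gain the same shape: run the rmap-based handler first, then fold in the TDP MMU's result when it is enabled, since both MMUs may hold mappings for the range. A schematic, stubbed sketch of that dispatch pattern (not kernel code; the handlers are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static bool tdp_mmu_enabled = true;

/* Stand-ins for the rmap-based and TDP MMU notifier handlers. */
static int legacy_age_hva(unsigned long start, unsigned long end) { return 0; }
static int tdp_mmu_age_hva(unsigned long start, unsigned long end) { return 1; }

static int age_hva(unsigned long start, unsigned long end)
{
	int young = legacy_age_hva(start, end);

	/* Combine results: either MMU may have aged a mapping in the range. */
	if (tdp_mmu_enabled)
		young |= tdp_mmu_age_hva(start, end);

	return young;
}

int main(void)
{
	printf("young = %d\n", age_hva(0x1000, 0x2000));
	return 0;
}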
*/ @@ -2885,8 +2505,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { struct kvm_mmu_page *sp; @@ -2946,132 +2566,42 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return false; } -static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) -{ - if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && - /* - * Some reserved pages, such as those from NVDIMM - * DAX devices, are not for MMIO, and can be mapped - * with cached memory type for better performance. - * However, the above check misconceives those pages - * as MMIO, and results in KVM mapping them with UC - * memory type, which would hurt the performance. - * Therefore, we check the host memory type in addition - * and only treat UC/UC-/WC pages as MMIO. - */ - (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); - - return !e820__mapped_raw_any(pfn_to_hpa(pfn), - pfn_to_hpa(pfn + 1) - 1, - E820_TYPE_RAM); -} - -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) - static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned int pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte = 0; - int ret = 0; + u64 spte; struct kvm_mmu_page *sp; + int ret; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; sp = sptep_to_sp(sptep); - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; - - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ - spte |= shadow_present_mask; - if (!speculative) - spte |= spte_shadow_accessed_mask(spte); - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { - pte_access &= ~ACC_EXEC_MASK; - } + ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, + can_unsync, host_writable, sp_ad_disabled(sp), &spte); - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; - - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; - - if (level > PG_LEVEL_4K) - spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); - - if (host_writable) - spte |= SPTE_HOST_WRITEABLE; - else - pte_access &= ~ACC_WRITE_MASK; - - if (!kvm_is_mmio_pfn(pfn)) - spte |= shadow_me_mask; - - spte |= (u64)pfn << PAGE_SHIFT; - - if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. - * Same reasoning can be applied to dirty page accounting. 
- */ - if (!can_unsync && is_writable_pte(*sptep)) - goto set_pte; - - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); - } - } - - if (pte_access & ACC_WRITE_MASK) { + if (spte & PT_WRITABLE_MASK) kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= spte_shadow_dirty_mask(spte); - } - if (speculative) - spte = mark_spte_for_access_track(spte); - -set_pte: - if (mmu_spte_update(sptep, spte)) + if (*sptep == spte) + ret |= SET_SPTE_SPURIOUS; + else if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; return ret; } static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int write_fault, int level, + unsigned int pte_access, bool write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; int set_spte_ret; - int ret = RET_PF_RETRY; + int ret = RET_PF_FIXED; bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, @@ -3113,6 +2643,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; + /* + * The fault is fully spurious if and only if the new SPTE and old SPTE + * are identical, and emulation is not required. + */ + if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { + WARN_ON_ONCE(!was_rmapped); + return RET_PF_SPURIOUS; + } + pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) @@ -3161,7 +2700,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); put_page(pages[i]); } @@ -3239,8 +2778,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp) +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; @@ -3248,6 +2788,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; + *req_level = PG_LEVEL_4K; + if (unlikely(max_level == PG_LEVEL_4K)) return PG_LEVEL_4K; @@ -3272,7 +2814,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (level == PG_LEVEL_4K) return level; - level = min(level, max_level); + *req_level = level = min(level, max_level); + + /* + * Enforce the iTLB multihit workaround after capturing the requested + * level, which will be used to do precise, accurate accounting. 
+ */ + if (huge_page_disallowed) + return PG_LEVEL_4K; /* * mmu_notifier_retry() was successful and mmu_lock is held, so @@ -3285,14 +2834,12 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, - gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp) { - int level = *levelp; - u64 spte = *it.sptep; + int level = *goal_levelp; - if (it.level == level && level > PG_LEVEL_4K && - is_nx_huge_page_enabled() && + if (cur_level == level && level > PG_LEVEL_4K && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -3302,26 +2849,32 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - + KVM_PAGES_PER_HPAGE(level - 1); *pfnp |= gfn & page_mask; - (*levelp)--; + (*goal_levelp)--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool account_disallowed_nx_lpage) + bool prefault, bool is_tdp) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, ret; + int level, req_level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { @@ -3329,7 +2882,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gfn, &pfn, &level); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(*it.sptep, gfn, it.level, + &pfn, &level); base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) @@ -3341,7 +2896,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (account_disallowed_nx_lpage) + if (is_tdp && huge_page_disallowed && + req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } @@ -3349,6 +2905,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, write, level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + direct_pte_prefetch(vcpu, it.sptep); ++vcpu->stat.pf_fixed; return ret; @@ -3479,21 +3038,19 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Return value: - * - true: let the vcpu to access on the same address again. - * - false: let the real page fault path to fix it. + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. 
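disallowed_hugepage_adjust(), shown a little earlier in this hunk, falls back one mapping level at a time and folds the next nine GFN bits back into the pfn so the 4K leaf still targets the frame the guest touched. The arithmetic of a single demotion step in isolation, with arbitrary example values and KVM_PAGES_PER_HPAGE() expanded to x86's 9-bits-per-level form (the real function additionally checks that the current iterator level matches and that a present, non-large SPTE is already installed):

#include <stdio.h>
#include <stdint.h>

#define KVM_PAGES_PER_HPAGE(level) (1ULL << (((level) - 1) * 9))

static void demote_one_level(uint64_t gfn, uint64_t *pfnp, int *levelp)
{
	uint64_t page_mask = KVM_PAGES_PER_HPAGE(*levelp) -
			     KVM_PAGES_PER_HPAGE(*levelp - 1);

	/* Patch the next 9 address bits back into the pfn, then drop a level. */
	*pfnp |= gfn & page_mask;
	(*levelp)--;
}

int main(void)
{
	uint64_t gfn = 0x12345;	/* faulting GFN (arbitrary) */
	uint64_t pfn = 0xabc00;	/* 2M-aligned host frame backing it (arbitrary) */
	int level = 2;		/* 2M mapping forbidden by the NX workaround */

	demote_one_level(gfn, &pfn, &level);
	printf("level %d, pfn %#llx\n", level, (unsigned long long)pfn);
	return 0;
}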
*/ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 error_code) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool fault_handled = false; + int ret = RET_PF_INVALID; u64 spte = 0ull; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) - return false; + return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3519,7 +3076,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * they are always ACC_ALL. */ if (is_access_allowed(error_code, spte)) { - fault_handled = true; + ret = RET_PF_SPURIOUS; break; } @@ -3562,11 +3119,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte, - new_spte); - if (fault_handled) + if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, + new_spte)) { + ret = RET_PF_FIXED; break; + } if (++retry_count > 4) { printk_once(KERN_WARNING @@ -3577,10 +3134,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, fault_handled); + spte, ret); walk_shadow_page_lockless_end(vcpu); - return fault_handled; + return ret; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, @@ -3592,9 +3149,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + + if (kvm_mmu_put_root(kvm, sp)) { + if (sp->tdp_mmu_page) + kvm_tdp_mmu_free_root(kvm, sp); + else if (sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } *root_hpa = INVALID_PAGE; } @@ -3603,6 +3164,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free) { + struct kvm *kvm = vcpu->kvm; int i; LIST_HEAD(invalid_list); bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; @@ -3620,22 +3182,21 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&vcpu->kvm->mmu_lock); + spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) - mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, &invalid_list); if (free_active_root) { if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, - &invalid_list); + mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); } else { for (i = 0; i < 4; ++i) if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, + mmu_free_root_page(kvm, &mmu->pae_root[i], &invalid_list); mmu->root_hpa = INVALID_PAGE; @@ -3643,8 +3204,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, mmu->root_pgd = 0; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3684,8 +3245,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) hpa_t root; unsigned i; - if 
(shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + if (vcpu->kvm->arch.tdp_mmu_enabled) { + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); + + if (!VALID_PAGE(root)) + return -ENOSPC; + vcpu->arch.mmu->root_hpa = root; + } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, + true); + if (!VALID_PAGE(root)) return -ENOSPC; vcpu->arch.mmu->root_hpa = root; @@ -3910,54 +3479,82 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) return vcpu_match_mmio_gva(vcpu, addr); } -/* return true if reserved bit is detected on spte. */ -static bool -walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) { struct kvm_shadow_walk_iterator iterator; - u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; - struct rsvd_bits_validate *rsvd_check; - int root, leaf; - bool reserved = false; + int leaf = vcpu->arch.mmu->root_level; + u64 spte; - rsvd_check = &vcpu->arch.mmu->shadow_zero_check; walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), - leaf = root = iterator.level; + for (shadow_walk_init(&iterator, vcpu, addr); shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { + leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf - 1] = spte; - leaf--; if (!is_shadow_present_pte(spte)) break; + } + + walk_shadow_page_lockless_end(vcpu); + + return leaf; +} + +/* return true if reserved bit is detected on spte. */ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL]; + struct rsvd_bits_validate *rsvd_check; + int root = vcpu->arch.mmu->root_level; + int leaf; + int level; + bool reserved = false; + + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { + *sptep = 0ull; + return reserved; + } + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes); + else + leaf = get_walk(vcpu, addr, sptes); + + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; + + for (level = root; level >= leaf; level--) { + if (!is_shadow_present_pte(sptes[level - 1])) + break; /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid * adding a Jcc in the loop. 
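The bitwise-OR called out in the comment above is a deliberate micro-optimization: both checks are evaluated and merged without a conditional branch, whereas a logical || would short-circuit and typically add a Jcc per loop iteration. Reduced to its essence, with arbitrary stand-in checks:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Arbitrary stand-ins for the memtype/XWR and reserved-bit checks. */
static bool check_bad_memtype(uint64_t spte) { return (spte >> 50) & 1; }
static bool check_rsvd_bits(uint64_t spte)   { return (spte >> 51) & 1; }

int main(void)
{
	uint64_t spte = 1ULL << 51;
	bool reserved = false;

	/* Bitwise OR: both checks always run, no extra branch. */
	reserved |= check_bad_memtype(spte) | check_rsvd_bits(spte);

	/*
	 * The short-circuiting alternative would be:
	 *   reserved = reserved || check_bad_memtype(spte) || check_rsvd_bits(spte);
	 */
	printf("reserved: %d\n", reserved);
	return 0;
}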
*/ - reserved |= __is_bad_mt_xwr(rsvd_check, spte) | - __is_rsvd_bits_set(rsvd_check, spte, iterator.level); + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) | + __is_rsvd_bits_set(rsvd_check, sptes[level - 1], + level); } - walk_shadow_page_lockless_end(vcpu); - if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", __func__, addr); - while (root > leaf) { + for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[root - 1], root); - root--; - } + sptes[level - 1], level); } - *sptep = spte; + *sptep = sptes[leaf - 1]; + return reserved; } @@ -3969,7 +3566,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (mmio_info_in_cache(vcpu, addr, direct)) return RET_PF_EMULATE; - reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + reserved = get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) return -EINVAL; @@ -4080,8 +3677,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool lpage_disallowed = exec && is_nx_huge_page_enabled(); bool map_writable; gfn_t gfn = gpa >> PAGE_SHIFT; @@ -4092,16 +3687,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; + if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; + } r = mmu_topup_memory_caches(vcpu, false); if (r) return r; - if (lpage_disallowed) - max_level = PG_LEVEL_4K; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4118,8 +3713,13 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, - prefault, is_tdp && lpage_disallowed); + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, + pfn, prefault); + else + r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, + prefault, is_tdp); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -4292,7 +3892,13 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); + /* + * If this is a direct root page, it doesn't have a write flooding + * count. Otherwise, clear the write flooding count. 
+ */ + if (!new_role.direct) + __clear_sp_write_flooding_count( + to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, @@ -5400,7 +5006,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u32 base_role = vcpu->arch.mmu->mmu_role.base.word; entry = *spte; - mmu_page_zap_pte(vcpu->kvm, sp, spte); + mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && !((sp->role.word ^ base_role) & ~role_ign.word) && rmap_can_add(vcpu)) @@ -5450,13 +5056,14 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - WARN_ON(r == RET_PF_INVALID); + if (WARN_ON_ONCE(r == RET_PF_INVALID)) + return -EIO; } - if (r == RET_PF_RETRY) - return 1; if (r < 0) return r; + if (r != RET_PF_EMULATE) + return 1; /* * Before emulating the instruction, check if the error code @@ -5485,18 +5092,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: - /* - * On AMD platforms, under certain conditions insn_len may be zero on #NPF. - * This can happen if a guest gets a page-fault on data access but the HW - * table walker is not able to read the instruction page (e.g instruction - * page is not present in memory). In those cases we simply restart the - * guest, with the exception of AMD Erratum 1096 which is unrecoverable. - */ - if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu)) - return 1; - } - return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } @@ -5682,11 +5277,17 @@ static void free_mmu_pages(struct kvm_mmu *mmu) free_page((unsigned long)mmu->lm_root); } -static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; + mmu->root_hpa = INVALID_PAGE; + mmu->root_pgd = 0; + mmu->translate_gpa = translate_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU @@ -5712,7 +5313,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - uint i; int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; @@ -5726,25 +5326,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.root_mmu.root_pgd = 0; - vcpu->arch.root_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - - vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.guest_mmu.root_pgd = 0; - vcpu->arch.guest_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu); + ret = 
__kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); if (ret) goto fail_allocate_root; @@ -5841,6 +5429,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_reload_remote_mmus(kvm); kvm_zap_obsolete_pages(kvm); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -5860,6 +5452,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + kvm_mmu_init_tdp_mmu(kvm); + node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); @@ -5870,6 +5464,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; kvm_page_track_unregister_notifier(kvm, node); + + kvm_mmu_uninit_tdp_mmu(kvm); } void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) @@ -5877,6 +5473,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; + bool flush; spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -5896,6 +5493,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } + if (kvm->arch.tdp_mmu_enabled) { + flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); + if (flush) + kvm_flush_remote_tlbs(kvm); + } + spin_unlock(&kvm->mmu_lock); } @@ -5914,6 +5517,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); spin_unlock(&kvm->mmu_lock); /* @@ -5977,6 +5582,9 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, spin_lock(&kvm->mmu_lock); slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, kvm_mmu_zap_collapsible_spte, true); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); spin_unlock(&kvm->mmu_lock); } @@ -6002,6 +5610,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); spin_unlock(&kvm->mmu_lock); /* @@ -6023,6 +5633,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); spin_unlock(&kvm->mmu_lock); if (flush) @@ -6037,6 +5649,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); spin_unlock(&kvm->mmu_lock); if (flush) @@ -6062,6 +5676,10 @@ restart: } kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -6357,7 +5975,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? 
DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; - while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + for ( ; to_zap; --to_zap) { + if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + break; + /* * We use a separate list instead of just using active_mmu_pages * because the number of lpage_disallowed pages is expected to @@ -6367,15 +5988,20 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); + if (sp->tdp_mmu_page) + kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, + sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + else { + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + } - if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (to_zap) - cond_resched_lock(&kvm->mmu_lock); + cond_resched_lock(&kvm->mmu_lock); } } + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 3acf3b8eb469..bfc6389edc28 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -3,9 +3,23 @@ #define __KVM_X86_MMU_INTERNAL_H #include <linux/types.h> - +#include <linux/kvm_host.h> #include <asm/kvm_host.h> +#undef MMU_DEBUG + +#ifdef MMU_DEBUG +extern bool dbg; + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) +#define MMU_WARN_ON(x) WARN_ON(x) +#else +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) +#define MMU_WARN_ON(x) do { } while (0) +#endif + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -41,8 +55,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; + + bool tdp_mmu_page; }; +extern struct kmem_cache *mmu_page_header_cache; + static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -55,9 +73,77 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. 
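The reworked recovery loop above still aims to zap 1/nx_huge_pages_recovery_ratio of the currently split NX huge pages per pass (ratio 60 by default, per the module parameter earlier in this file); only the commit/resched structure of the loop changes. The sizing arithmetic on its own, with an assumed example count:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long nx_lpage_splits = 1000;	/* assumed example value */
	unsigned int ratio = 60;		/* nx_huge_pages_recovery_ratio default */
	unsigned long to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;

	printf("zap %lu of %lu split huge pages this pass\n", to_zap, nx_lpage_splits);
	return 0;
}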
+ */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + +bool is_nx_huge_page_enabled(void); +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync); + void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages); + +static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + BUG_ON(!sp->root_count); + lockdep_assert_held(&kvm->mmu_lock); + + ++sp->root_count; +} + +static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + lockdep_assert_held(&kvm->mmu_lock); + --sp->root_count; + + return !sp->root_count; +} + +/* + * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). + * + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_FIXED: The faulting entry has been fixed. + * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE, + RET_PF_INVALID, + RET_PF_FIXED, + RET_PF_SPURIOUS, +}; + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level); +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp); + +bool is_nx_huge_page_enabled(void); + +void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); + +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 9d15bc0c535b..213699b27b44 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -202,8 +202,8 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TRACE_EVENT( mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), - TP_ARGS(sptep, gfn, access, gen), + TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte), + TP_ARGS(sptep, gfn, spte), TP_STRUCT__entry( __field(void *, sptep) @@ -215,8 +215,8 @@ TRACE_EVENT( TP_fast_assign( __entry->sptep = sptep; __entry->gfn = gfn; - __entry->access = access; - __entry->gen = gen; + __entry->access = spte & ACC_ALL; + __entry->gen = get_mmio_spte_generation(spte); ), TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, @@ -244,14 +244,11 @@ TRACE_EVENT( __entry->access) ); -#define __spte_satisfied(__spte) \ - (__entry->retry && is_writable_pte(__entry->__spte)) - TRACE_EVENT( fast_page_fault, TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, - u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), + u64 *sptep, u64 old_spte, int ret), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -260,7 +257,7 @@ TRACE_EVENT( __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) - __field(bool, retry) + __field(int, ret) ), 
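With RET_PF_FIXED and RET_PF_SPURIOUS added to the enum above, callers treat everything other than RET_PF_EMULATE (and genuine errors) as "resume the guest", which is what the kvm_mmu_page_fault() hunk earlier in this patch now does. A compact sketch of that dispatch:

#include <stdio.h>

enum {
	RET_PF_RETRY = 0,
	RET_PF_EMULATE,
	RET_PF_INVALID,
	RET_PF_FIXED,
	RET_PF_SPURIOUS,
};

/* >0: re-enter the guest, 0: emulate the instruction, <0: error. */
static int dispatch_page_fault_result(int r)
{
	if (r == RET_PF_INVALID)
		return -5;	/* i.e. -EIO; the fault handler should never return this */
	if (r < 0)
		return r;
	if (r != RET_PF_EMULATE)
		return 1;
	return 0;
}

int main(void)
{
	static const char *names[] = {
		"RETRY", "EMULATE", "INVALID", "FIXED", "SPURIOUS"
	};

	for (int r = RET_PF_RETRY; r <= RET_PF_SPURIOUS; r++)
		printf("RET_PF_%-8s -> %d\n", names[r], dispatch_page_fault_result(r));
	return 0;
}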
TP_fast_assign( @@ -270,7 +267,7 @@ TRACE_EVENT( __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; - __entry->retry = retry; + __entry->ret = ret; ), TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" @@ -278,7 +275,7 @@ TRACE_EVENT( __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", kvm_mmu_trace_pferr_flags), __entry->sptep, __entry->old_spte, __entry->new_spte, - __spte_satisfied(old_spte), __spte_satisfied(new_spte) + __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED ) ); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4dd6b1e5b8cf..50e268eb8e1a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -550,7 +550,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. */ - mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, true, true); kvm_release_pfn_clean(pfn); @@ -625,15 +625,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * emulate this operation, return 1 to indicate this case. */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, - int write_fault, int max_level, - kvm_pfn_t pfn, bool map_writable, bool prefault, - bool lpage_disallowed) + struct guest_walker *gw, u32 error_code, + int max_level, kvm_pfn_t pfn, bool map_writable, + bool prefault) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write_fault = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, hlevel, ret; + int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -679,7 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -690,10 +694,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. 
*/ - disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, + &pfn, &level); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == hlevel) + if (it.level == level) break; validate_direct_spte(vcpu, it.sptep, direct_access); @@ -704,13 +710,16 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) + if (huge_page_disallowed && req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, it.level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + FNAME(pte_prefetch)(vcpu, gw, it.sptep); ++vcpu->stat.pf_fixed; return ret; @@ -738,7 +747,7 @@ out_gpte_changed: */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault, + struct guest_walker *walker, bool user_fault, bool *write_fault_to_shadow_pgtable) { int level; @@ -776,15 +785,13 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool prefault) { - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; + bool write_fault = error_code & PFERR_WRITE_MASK; + bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); int max_level; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -825,7 +832,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (lpage_disallowed || is_self_change_mapping) + if (is_self_change_mapping) max_level = PG_LEVEL_4K; else max_level = walker.level; @@ -869,8 +876,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, - map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, + map_writable, prefault); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: @@ -895,6 +902,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + u64 old_spte; int level; u64 *sptep; @@ -917,7 +925,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sptep = iterator.sptep; sp = sptep_to_sp(sptep); - if (is_last_spte(*sptep, level)) { + old_spte = *sptep; + if (is_last_spte(old_spte, level)) { pt_element_t gpte; gpa_t pte_gpa; @@ -927,7 +936,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) + mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); + if (is_shadow_present_pte(old_spte)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c 
new file mode 100644 index 000000000000..d9c5665a55e9 --- /dev/null +++ b/arch/x86/kvm/mmu/spte.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * Macros and functions to access KVM PTEs (also known as SPTEs) + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2020 Red Hat, Inc. and/or its affiliates. + */ + + +#include <linux/kvm_host.h> +#include "mmu.h" +#include "mmu_internal.h" +#include "x86.h" +#include "spte.h" + +#include <asm/e820/api.h> + +u64 __read_mostly shadow_nx_mask; +u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +u64 __read_mostly shadow_user_mask; +u64 __read_mostly shadow_accessed_mask; +u64 __read_mostly shadow_dirty_mask; +u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_access_mask; +u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_me_mask; +u64 __read_mostly shadow_acc_track_mask; + +u64 __read_mostly shadow_nonpresent_or_rsvd_mask; +u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +u8 __read_mostly shadow_phys_bits; + +static u64 generation_mmio_spte_mask(u64 gen) +{ + u64 mask; + + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; + return mask; +} + +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) +{ + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; + u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; + + access &= shadow_mmio_access_mask; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; + + return mask; +} + +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) +{ + if (pfn_valid(pfn)) + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && + /* + * Some reserved pages, such as those from NVDIMM + * DAX devices, are not for MMIO, and can be mapped + * with cached memory type for better performance. + * However, the above check misconceives those pages + * as MMIO, and results in KVM mapping them with UC + * memory type, which would hurt the performance. + * Therefore, we check the host memory type in addition + * and only treat UC/UC-/WC pages as MMIO. + */ + (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); + + return !e820__mapped_raw_any(pfn_to_hpa(pfn), + pfn_to_hpa(pfn + 1) - 1, + E820_TYPE_RAM); +} + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte) +{ + u64 spte = 0; + int ret = 0; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; + + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. 
+ */ + spte |= shadow_present_mask; + if (!speculative) + spte |= spte_shadow_accessed_mask(spte); + + if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else + spte |= shadow_nx_mask; + + if (pte_access & ACC_USER_MASK) + spte |= shadow_user_mask; + + if (level > PG_LEVEL_4K) + spte |= PT_PAGE_SIZE_MASK; + if (tdp_enabled) + spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); + + if (host_writable) + spte |= SPTE_HOST_WRITEABLE; + else + pte_access &= ~ACC_WRITE_MASK; + + if (!kvm_is_mmio_pfn(pfn)) + spte |= shadow_me_mask; + + spte |= (u64)pfn << PAGE_SHIFT; + + if (pte_access & ACC_WRITE_MASK) { + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writable_pte(old_spte)) + goto out; + + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + pgprintk("%s: found shadow page for %llx, marking ro\n", + __func__, gfn); + ret |= SET_SPTE_WRITE_PROTECTED_PT; + pte_access &= ~ACC_WRITE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + } + } + + if (pte_access & ACC_WRITE_MASK) + spte |= spte_shadow_dirty_mask(spte); + + if (speculative) + spte = mark_spte_for_access_track(spte); + +out: + *new_spte = spte; + return ret; +} + +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) +{ + u64 spte; + + spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else + spte |= shadow_accessed_mask; + + return spte; +} + +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) +{ + u64 new_spte; + + new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + + new_spte = mark_spte_for_access_track(new_spte); + + return new_spte; +} + +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} + +u64 mark_spte_for_access_track(u64 spte) +{ + if (spte_ad_enabled(spte)) + return spte & ~shadow_accessed_mask; + + if (is_access_track_spte(spte)) + return spte; + + /* + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. 
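make_spte() above composes the final SPTE by OR-ing permission- and configuration-dependent masks onto the page frame number. The standalone sketch below mimics that style of bit assembly with made-up TOY_* mask values; the real masks are configured per vendor through kvm_mmu_set_mask_ptes(), so none of the constants here are KVM's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy masks chosen for illustration only. */
#define TOY_PRESENT    (1ull << 0)
#define TOY_WRITABLE   (1ull << 1)
#define TOY_USER       (1ull << 2)
#define TOY_NX         (1ull << 63)
#define TOY_PAGE_SHIFT 12

static uint64_t toy_make_spte(uint64_t pfn, bool exec, bool user, bool writable)
{
        uint64_t spte = TOY_PRESENT;

        if (!exec)                        /* no execute permission -> set NX */
                spte |= TOY_NX;
        if (user)
                spte |= TOY_USER;
        if (writable)
                spte |= TOY_WRITABLE;

        spte |= pfn << TOY_PAGE_SHIFT;    /* place the physical frame number */
        return spte;
}

int main(void)
{
        printf("%#llx\n",
               (unsigned long long)toy_make_spte(0x1234, false, true, true));
        return 0;
}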
+ */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + + return spte; +} + +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) +{ + BUG_ON((u64)(unsigned)access_mask != access_mask); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + shadow_mmio_access_mask = access_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); + +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask, u64 me_mask) +{ + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); + + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; + shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + shadow_me_mask = me_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); + +void kvm_mmu_reset_all_pte_masks(void) +{ + u8 low_phys_bits; + + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; + + shadow_phys_bits = kvm_get_shadow_phys_bits(); + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. if system RAM overlaps + * the most significant bits of legal physical address space. + */ + shadow_nonpresent_or_rsvd_mask = 0; + low_phys_bits = boot_cpu_data.x86_phys_bits; + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); + } + + shadow_nonpresent_or_rsvd_lower_gfn_mask = + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); +} diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h new file mode 100644 index 000000000000..4ecf40e0b8fe --- /dev/null +++ b/arch/x86/kvm/mmu/spte.h @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef KVM_X86_MMU_SPTE_H +#define KVM_X86_MMU_SPTE_H + +#include "mmu_internal.h" + +#define PT_FIRST_AVAIL_BITS_SHIFT 10 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. 
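kvm_mmu_reset_all_pte_masks() above derives the L1TF guard masks from the CPU's cache/physical address width. The arithmetic can be checked in isolation with the small program below; the 46-bit cache width is only an example input, and GENMASK_ULL() is re-derived locally rather than taken from kernel headers.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
/* GENMASK_ULL(h, l): bits l..h set, as in include/linux/bits.h */
#define GENMASK_ULL(h, l) (((~0ull) << (l)) & (~0ull >> (63 - (h))))

int main(void)
{
        unsigned int cache_bits = 46;   /* example L1-cache PA width */
        unsigned int rsvd_len   = 5;    /* shadow_nonpresent_or_rsvd_mask_len */
        unsigned int low_phys_bits = cache_bits - rsvd_len;

        /* Mirrors rsvd_bits(low_phys_bits, cache_bits - 1) ... */
        uint64_t rsvd_mask = GENMASK_ULL(cache_bits - 1, low_phys_bits);
        /* ... and shadow_nonpresent_or_rsvd_lower_gfn_mask. */
        uint64_t lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);

        printf("nonpresent_or_rsvd mask: %#llx\n", (unsigned long long)rsvd_mask);
        printf("lower gfn mask:          %#llx\n", (unsigned long long)lower_gfn_mask);
        return 0;
}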
+ */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) + +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) + +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + + +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) + +/* + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: + * + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 + * + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. + */ +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) + +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) + +#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_END 62 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) + +extern u64 __read_mostly shadow_nx_mask; +extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +extern u64 __read_mostly shadow_user_mask; +extern u64 __read_mostly shadow_accessed_mask; +extern u64 __read_mostly shadow_dirty_mask; +extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_access_mask; +extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_me_mask; + +/* + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. + */ +extern u64 __read_mostly shadow_acc_track_mask; + +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. 
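The PT64_LEVEL_SHIFT()/PT64_INDEX() macros above carve a guest-physical address into 9-bit table indexes, one per paging level. A standalone sketch of the same arithmetic; the example address is arbitrary.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PT64_LEVEL_BITS 9

static unsigned int pt64_index(uint64_t addr, int level)
{
        unsigned int shift = PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS;

        return (addr >> shift) & ((1u << PT64_LEVEL_BITS) - 1);
}

int main(void)
{
        uint64_t gpa = 0x7f1234567000ull;   /* arbitrary example address */
        int level;

        for (level = 4; level >= 1; level--)
                printf("level %d index %u\n", level, pt64_index(gpa, level));
        return 0;
}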
+ */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + +/* + * In some cases, we need to preserve the GFN of a non-present or reserved + * SPTE when we usurp the upper five bits of the physical address space to + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask + * left into the reserved bits, i.e. the GFN in the SPTE will be split into + * high and low parts. This mask covers the lower bits of the GFN. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +extern u8 __read_mostly shadow_phys_bits; + +static inline bool is_mmio_spte(u64 spte) +{ + return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; +} + +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + +static inline bool is_access_track_spte(u64 spte) +{ + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; +} + +static inline int is_shadow_present_pte(u64 pte) +{ + return (pte != 0) && !is_mmio_spte(pte); +} + +static inline int is_large_pte(u64 pte) +{ + return pte & PT_PAGE_SIZE_MASK; +} + +static inline int is_last_spte(u64 pte, int level) +{ + if (level == PG_LEVEL_4K) + return 1; + if (is_large_pte(pte)) + return 1; + return 0; +} + +static inline bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + +static inline kvm_pfn_t spte_to_pfn(u64 pte) +{ + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static inline bool is_accessed_spte(u64 spte) +{ + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); +} + +static inline bool is_dirty_spte(u64 spte) +{ + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? 
spte & dirty_mask : spte & PT_WRITABLE_MASK; +} + +static inline bool spte_can_locklessly_be_made_writable(u64 spte) +{ + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); +} + +static inline u64 get_mmio_spte_generation(u64 spte) +{ + u64 gen; + + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen; +} + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte); +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); +u64 mark_spte_for_access_track(u64 spte); +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); + +void kvm_mmu_reset_all_pte_masks(void); + +#endif diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c new file mode 100644 index 000000000000..87b7e16911db --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu_internal.h" +#include "tdp_iter.h" +#include "spte.h" + +/* + * Recalculates the pointer to the SPTE for the current GFN and level and + * reread the SPTE. + */ +static void tdp_iter_refresh_sptep(struct tdp_iter *iter) +{ + iter->sptep = iter->pt_path[iter->level - 1] + + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + iter->old_spte = READ_ONCE(*iter->sptep); +} + +static gfn_t round_gfn_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); +} + +/* + * Sets a TDP iterator to walk a pre-order traversal of the paging structure + * rooted at root_pt, starting with the walk to translate goal_gfn. + */ +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn) +{ + WARN_ON(root_level < 1); + WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + + iter->goal_gfn = goal_gfn; + iter->root_level = root_level; + iter->min_level = min_level; + iter->level = root_level; + iter->pt_path[iter->level - 1] = root_pt; + + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* + * Given an SPTE and its level, returns a pointer containing the host virtual + * address of the child page table referenced by the SPTE. Returns null if + * there is no such entry. + */ +u64 *spte_to_child_pt(u64 spte, int level) +{ + /* + * There's no child entry if this entry isn't present or is a + * last-level entry. + */ + if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) + return NULL; + + return __va(spte_to_pfn(spte) << PAGE_SHIFT); +} + +/* + * Steps down one level in the paging structure towards the goal GFN. Returns + * true if the iterator was able to step down a level, false otherwise. + */ +static bool try_step_down(struct tdp_iter *iter) +{ + u64 *child_pt; + + if (iter->level == iter->min_level) + return false; + + /* + * Reread the SPTE before stepping down to avoid traversing into page + * tables that are no longer linked from this entry. 
+ */ + iter->old_spte = READ_ONCE(*iter->sptep); + + child_pt = spte_to_child_pt(iter->old_spte, iter->level); + if (!child_pt) + return false; + + iter->level--; + iter->pt_path[iter->level - 1] = child_pt; + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Steps to the next entry in the current page table, at the current page table + * level. The next entry could point to a page backing guest memory or another + * page table, or it could be non-present. Returns true if the iterator was + * able to step to the next entry in the page table, false if the iterator was + * already at the end of the current page table. + */ +static bool try_step_side(struct tdp_iter *iter) +{ + /* + * Check if the iterator is already at the end of the current page + * table. + */ + if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (PT64_ENT_PER_PAGE - 1)) + return false; + + iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); + iter->goal_gfn = iter->gfn; + iter->sptep++; + iter->old_spte = READ_ONCE(*iter->sptep); + + return true; +} + +/* + * Tries to traverse back up a level in the paging structure so that the walk + * can continue from the next entry in the parent page table. Returns true on a + * successful step up, false if already in the root page. + */ +static bool try_step_up(struct tdp_iter *iter) +{ + if (iter->level == iter->root_level) + return false; + + iter->level++; + iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Step to the next SPTE in a pre-order traversal of the paging structure. + * To get to the next SPTE, the iterator either steps down towards the goal + * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a + * highter GFN. + * + * The basic algorithm is as follows: + * 1. If the current SPTE is a non-last-level SPTE, step down into the page + * table it points to. + * 2. If the iterator cannot step down, it will try to step to the next SPTE + * in the current page of the paging structure. + * 3. If the iterator cannot step to the next entry in the current page, it will + * try to step up to the parent paging structure page. In this case, that + * SPTE will have already been visited, and so the iterator must also step + * to the side again. + */ +void tdp_iter_next(struct tdp_iter *iter) +{ + if (try_step_down(iter)) + return; + + do { + if (try_step_side(iter)) + return; + } while (try_step_up(iter)); + iter->valid = false; +} + +/* + * Restart the walk over the paging structure from the root, starting from the + * highest gfn the iterator had previously reached. Assumes that the entire + * paging structure, except the root page, may have been completely torn down + * and rebuilt. 
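The three-step algorithm documented above for tdp_iter_next() (step down if possible, else step sideways, else step up and then sideways) is an iterative pre-order traversal. The toy program below reproduces that visiting order over a synthetic three-level structure with four entries per table; the has_child() rule is invented purely to keep the output short and does not model real SPTEs.

#include <stdbool.h>
#include <stdio.h>

#define ENTRIES    4
#define ROOT_LEVEL 3

static bool has_child(int level, int idx)
{
        return level > 1 && idx < 2;   /* only slots 0 and 1 have child tables */
}

int main(void)
{
        int idx[ROOT_LEVEL + 1] = { 0 };
        int level = ROOT_LEVEL;

        for (;;) {
                printf("visit level %d index %d\n", level, idx[level]);

                if (has_child(level, idx[level])) {     /* 1. try_step_down */
                        level--;
                        idx[level] = 0;
                        continue;
                }
                while (idx[level] == ENTRIES - 1) {     /* 3. try_step_up */
                        if (level == ROOT_LEVEL)
                                return 0;               /* walk complete */
                        level++;
                }
                idx[level]++;                           /* 2. try_step_side */
        }
}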
+ */ +void tdp_iter_refresh_walk(struct tdp_iter *iter) +{ + gfn_t goal_gfn = iter->goal_gfn; + + if (iter->gfn > goal_gfn) + goal_gfn = iter->gfn; + + tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], + iter->root_level, iter->min_level, goal_gfn); +} + +u64 *tdp_iter_root_pt(struct tdp_iter *iter) +{ + return iter->pt_path[iter->root_level - 1]; +} + diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h new file mode 100644 index 000000000000..47170d0dc98e --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_ITER_H +#define __KVM_X86_MMU_TDP_ITER_H + +#include <linux/kvm_host.h> + +#include "mmu.h" + +/* + * A TDP iterator performs a pre-order walk over a TDP paging structure. + */ +struct tdp_iter { + /* + * The iterator will traverse the paging structure towards the mapping + * for this GFN. + */ + gfn_t goal_gfn; + /* Pointers to the page tables traversed to reach the current SPTE */ + u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + /* A pointer to the current SPTE */ + u64 *sptep; + /* The lowest GFN mapped by the current SPTE */ + gfn_t gfn; + /* The level of the root page given to the iterator */ + int root_level; + /* The lowest level the iterator should traverse to */ + int min_level; + /* The iterator's current level within the paging structure */ + int level; + /* A snapshot of the value at sptep */ + u64 old_spte; + /* + * Whether the iterator has a valid state. This will be false if the + * iterator walks off the end of the paging structure. + */ + bool valid; +}; + +/* + * Iterates over every SPTE mapping the GFN range [start, end) in a + * preorder traversal. + */ +#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \ + for (tdp_iter_start(&iter, root, root_level, min_level, start); \ + iter.valid && iter.gfn < end; \ + tdp_iter_next(&iter)) + +#define for_each_tdp_pte(iter, root, root_level, start, end) \ + for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) + +u64 *spte_to_child_pt(u64 pte, int level); + +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn); +void tdp_iter_next(struct tdp_iter *iter); +void tdp_iter_refresh_walk(struct tdp_iter *iter); +u64 *tdp_iter_root_pt(struct tdp_iter *iter); + +#endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c new file mode 100644 index 000000000000..e246d71b8ea2 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -0,0 +1,1157 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu.h" +#include "mmu_internal.h" +#include "mmutrace.h" +#include "tdp_iter.h" +#include "tdp_mmu.h" +#include "spte.h" + +#ifdef CONFIG_X86_64 +static bool __read_mostly tdp_mmu_enabled = false; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +#endif + +static bool is_tdp_mmu_enabled(void) +{ +#ifdef CONFIG_X86_64 + return tdp_enabled && READ_ONCE(tdp_mmu_enabled); +#else + return false; +#endif /* CONFIG_X86_64 */ +} + +/* Initializes the TDP MMU for the VM, if enabled. */ +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +{ + if (!is_tdp_mmu_enabled()) + return; + + /* This should not be changed for the lifetime of the VM. 
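The iterator above works in units of KVM_PAGES_PER_HPAGE(level), the number of 4KiB pages covered by one entry at that level, and aligns gfn down to that granularity (round_gfn_for_level()). The alignment is plain power-of-two rounding, sketched here with example numbers only.

#include <stdint.h>
#include <stdio.h>

/* Pages (4KiB units) covered by one entry at a given level: 1, 512, 512^2, ... */
static uint64_t pages_per_hpage(int level)
{
        return 1ull << ((level - 1) * 9);
}

static uint64_t round_gfn_for_level(uint64_t gfn, int level)
{
        return gfn & -pages_per_hpage(level);   /* round down to level alignment */
}

int main(void)
{
        uint64_t gfn = 0x123456;
        int level;

        for (level = 1; level <= 4; level++)
                printf("level %d -> base gfn %#llx\n", level,
                       (unsigned long long)round_gfn_for_level(gfn, level));
        return 0;
}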
*/ + kvm->arch.tdp_mmu_enabled = true; + + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); +} + +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) +{ + if (!kvm->arch.tdp_mmu_enabled) + return; + + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); +} + +#define for_each_tdp_mmu_root(_kvm, _root) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +{ + struct kvm_mmu_page *sp; + + sp = to_shadow_page(hpa); + + return sp->tdp_mmu_page && sp->root_count; +} + +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield); + +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + + lockdep_assert_held(&kvm->mmu_lock); + + WARN_ON(root->root_count); + WARN_ON(!root->tdp_mmu_page); + + list_del(&root->link); + + zap_gfn_range(kvm, root, 0, max_gfn, false); + + free_page((unsigned long)root->spt); + kmem_cache_free(mmu_page_header_cache, root); +} + +static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, + int level) +{ + union kvm_mmu_page_role role; + + role = vcpu->arch.mmu->mmu_role.base; + role.level = level; + role.direct = true; + role.gpte_is_8_bytes = true; + role.access = ACC_ALL; + + return role; +} + +static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + sp->role.word = page_role_for_level(vcpu, level).word; + sp->gfn = gfn; + sp->tdp_mmu_page = true; + + return sp; +} + +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *root; + + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + + spin_lock(&kvm->mmu_lock); + + /* Check for an existing root before allocating a new one. */ + for_each_tdp_mmu_root(kvm, root) { + if (root->role.word == role.word) { + kvm_mmu_get_root(kvm, root); + spin_unlock(&kvm->mmu_lock); + return root; + } + } + + root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); + root->root_count = 1; + + list_add(&root->link, &kvm->arch.tdp_mmu_roots); + + spin_unlock(&kvm->mmu_lock); + + return root; +} + +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *root; + + root = get_tdp_mmu_vcpu_root(vcpu); + if (!root) + return INVALID_PAGE; + + return __pa(root->spt); +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level); + +static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 
1 : 0; +} + +static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) + return; + + if (is_accessed_spte(old_spte) && + (!is_accessed_spte(new_spte) || pfn_changed)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); +} + +static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed; + struct kvm_memory_slot *slot; + + if (level > PG_LEVEL_4K) + return; + + pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if ((!is_writable_pte(old_spte) || pfn_changed) && + is_writable_pte(new_spte)) { + slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); + mark_page_dirty_in_slot(slot, gfn); + } +} + +/** + * handle_changed_spte - handle bookkeeping associated with an SPTE change + * @kvm: kvm instance + * @as_id: the address space of the paging structure the SPTE was a part of + * @gfn: the base GFN that was mapped by the SPTE + * @old_spte: The value of the SPTE before the change + * @new_spte: The value of the SPTE after the change + * @level: the level of the PT the SPTE is part of in the paging structure + * + * Handle bookkeeping that might result from the modification of a SPTE. + * This function must be called for all TDP SPTE modifications. + */ +static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); + bool was_leaf = was_present && is_last_spte(old_spte, level); + bool is_leaf = is_present && is_last_spte(new_spte, level); + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + u64 *pt; + struct kvm_mmu_page *sp; + u64 old_child_spte; + int i; + + WARN_ON(level > PT64_ROOT_MAX_LEVEL); + WARN_ON(level < PG_LEVEL_4K); + WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level)); + + /* + * If this warning were to trigger it would indicate that there was a + * missing MMU notifier or a race with some notifier handler. + * A present, leaf SPTE should never be directly replaced with another + * present leaf SPTE pointing to a differnt PFN. A notifier handler + * should be zapping the SPTE before the main MM's page table is + * changed, or the SPTE should be zeroed, and the TLBs flushed by the + * thread before replacement. + */ + if (was_leaf && is_leaf && pfn_changed) { + pr_err("Invalid SPTE change: cannot replace a present leaf\n" + "SPTE with another present leaf SPTE mapping a\n" + "different PFN!\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + + /* + * Crash the host to prevent error propagation and guest data + * courruption. + */ + BUG(); + } + + if (old_spte == new_spte) + return; + + /* + * The only times a SPTE should be changed from a non-present to + * non-present state is when an MMIO entry is installed/modified/ + * removed. In that case, there is nothing to do here. + */ + if (!was_present && !is_present) { + /* + * If this change does not involve a MMIO SPTE, it is + * unexpected. Log the change, though it should not impact the + * guest since both the former and current SPTEs are nonpresent. + */ + if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + pr_err("Unexpected SPTE change! 
Nonpresent SPTEs\n" + "should not be replaced with another,\n" + "different nonpresent SPTE, unless one or both\n" + "are MMIO SPTEs.\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + return; + } + + + if (was_leaf && is_dirty_spte(old_spte) && + (!is_dirty_spte(new_spte) || pfn_changed)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + /* + * Recursively handle child PTs if the change removed a subtree from + * the paging structure. + */ + if (was_present && !was_leaf && (pfn_changed || !is_present)) { + pt = spte_to_child_pt(old_spte, level); + sp = sptep_to_sp(pt); + + list_del(&sp->link); + + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + old_child_spte = READ_ONCE(*(pt + i)); + WRITE_ONCE(*(pt + i), 0); + handle_changed_spte(kvm, as_id, + gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), + old_child_spte, 0, level - 1); + } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + + free_page((unsigned long)pt); + kmem_cache_free(mmu_page_header_cache, sp); + } +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); + handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); +} + +static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte, bool record_acc_track, + bool record_dirty_log) +{ + u64 *root_pt = tdp_iter_root_pt(iter); + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); + + WRITE_ONCE(*iter->sptep, new_spte); + + __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level); + if (record_acc_track) + handle_changed_spte_acc_track(iter->old_spte, new_spte, + iter->level); + if (record_dirty_log) + handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + iter->old_spte, new_spte, + iter->level); +} + +static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); +} + +static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); +} + +static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); +} + +#define tdp_root_for_each_pte(_iter, _root, _start, _end) \ + for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) + +#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ + tdp_root_for_each_pte(_iter, _root, _start, _end) \ + if (!is_shadow_present_pte(_iter.old_spte) || \ + !is_last_spte(_iter.old_spte, _iter.level)) \ + continue; \ + else + +#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ + for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ + _mmu->shadow_root_level, _start, _end) + +/* + * Flush the TLB if the process should drop kvm->mmu_lock. + * Return whether the caller still needs to flush the tlb. 
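__handle_changed_spte() above keys its bookkeeping off five booleans: whether the old and new SPTEs are present, whether each is a leaf, and whether the PFN changed. The classifier below restates those rules outside the kernel; the booleans stand in for is_shadow_present_pte()/is_last_spte() and a PFN comparison on real SPTE values, so it is an approximation of the decision logic, not the function itself.

#include <stdbool.h>
#include <stdio.h>

static const char *classify(bool was_present, bool is_present,
                            bool was_leaf, bool is_leaf, bool pfn_changed)
{
        if (!was_present && !is_present)
                return "nonpresent -> nonpresent (MMIO install/update only)";
        if (was_leaf && is_leaf && pfn_changed)
                return "BUG: leaf replaced by leaf with a different PFN";
        if (was_present && !was_leaf && (pfn_changed || !is_present))
                return "subtree removed: recurse into the old child table";
        return "ordinary update (accessed/dirty bookkeeping only)";
}

int main(void)
{
        printf("%s\n", classify(false, false, false, false, false));
        printf("%s\n", classify(true,  true,  true,  true,  true));
        printf("%s\n", classify(true,  false, false, false, true));
        return 0;
}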
+ */ +static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_flush_remote_tlbs(kvm); + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + return false; + } else { + return true; + } +} + +static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + } +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + * If can_yield is true, will release the MMU lock and reschedule if the + * scheduler needs the CPU or there is contention on the MMU lock. If this + * function cannot yield, it will not release the MMU lock or reschedule and + * the caller must ensure it does not supply too large a GFN range, or the + * operation can cause a soft lockup. + */ +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield) +{ + struct tdp_iter iter; + bool flush_needed = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + /* + * If this is a non-last-level SPTE that covers a larger range + * than should be zapped, continue, and zap the mappings at a + * lower level. + */ + if ((iter.gfn < start || + iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && + !is_last_spte(iter.old_spte, iter.level)) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + if (can_yield) + flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + else + flush_needed = true; + } + return flush_needed; +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + */ +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +{ + struct kvm_mmu_page *root; + bool flush = false; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + flush |= zap_gfn_range(kvm, root, start, end, true); + + kvm_mmu_put_root(kvm, root); + } + + return flush; +} + +void kvm_tdp_mmu_zap_all(struct kvm *kvm) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + bool flush; + + flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Installs a last-level SPTE to handle a TDP page fault. 
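zap_gfn_range() above only zaps a non-leaf SPTE outright when everything it maps lies inside the requested [start, end) range; otherwise the walk keeps descending and zaps lower-level entries individually. That containment test is simple GFN arithmetic, sketched below with a 2MiB (level 2) entry as the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t pages_per_hpage(int level)
{
        return 1ull << ((level - 1) * 9);
}

/* A non-leaf entry may be zapped wholesale only if its entire range fits. */
static bool can_zap_whole_entry(uint64_t entry_gfn, int level,
                                uint64_t start, uint64_t end)
{
        return entry_gfn >= start &&
               entry_gfn + pages_per_hpage(level) <= end;
}

int main(void)
{
        /* 2MiB entry (level 2) at gfn 0x200, zap request for [0x100, 0x300) */
        printf("%d\n", can_zap_whole_entry(0x200, 2, 0x100, 0x300));   /* 0 */
        /* same entry, zap request for [0x0, 0x1000) */
        printf("%d\n", can_zap_whole_entry(0x200, 2, 0x0, 0x1000));    /* 1 */
        return 0;
}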
+ * (NPT/EPT violation/misconfiguration) + */ +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + int map_writable, + struct tdp_iter *iter, + kvm_pfn_t pfn, bool prefault) +{ + u64 new_spte; + int ret = 0; + int make_spte_ret = 0; + + if (unlikely(is_noslot_pfn(pfn))) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); + } else + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); + + if (new_spte == iter->old_spte) + ret = RET_PF_SPURIOUS; + else + tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + + /* + * If the page fault was caused by a write but the page is write + * protected, emulation is needed. If the emulation was skipped, + * the vCPU would have the same fault again. + */ + if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + if (write) + ret = RET_PF_EMULATE; + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ + if (unlikely(is_mmio_spte(new_spte))) + ret = RET_PF_EMULATE; + + trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + if (!prefault) + vcpu->stat.pf_fixed++; + + return ret; +} + +/* + * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing + * page tables and SPTEs to translate the faulting guest physical address. + */ +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault) +{ + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; + struct kvm_mmu *mmu = vcpu->arch.mmu; + struct tdp_iter iter; + struct kvm_mmu_page *sp; + u64 *child_pt; + u64 new_spte; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + int level; + int req_level; + + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); + + trace_kvm_mmu_spte_requested(gpa, level, pfn); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(iter.old_spte, gfn, + iter.level, &pfn, &level); + + if (iter.level == level) + break; + + /* + * If there is an SPTE mapping a large page at a higher level + * than the target, that SPTE must be cleared and replaced + * with a non-leaf SPTE. + */ + if (is_shadow_present_pte(iter.old_spte) && + is_large_pte(iter.old_spte)) { + tdp_mmu_set_spte(vcpu->kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, + KVM_PAGES_PER_HPAGE(iter.level)); + + /* + * The iter must explicitly re-read the spte here + * because the new value informs the !present + * path below. 
+ */ + iter.old_spte = READ_ONCE(*iter.sptep); + } + + if (!is_shadow_present_pte(iter.old_spte)) { + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); + child_pt = sp->spt; + clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + + trace_kvm_mmu_get_page(sp, true); + if (huge_page_disallowed && req_level >= iter.level) + account_huge_nx_page(vcpu->kvm, sp); + + tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + } + } + + if (WARN_ON(iter.level != level)) + return RET_PF_RETRY; + + ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, + pfn, prefault); + + return ret; +} + +static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end, unsigned long data, + int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long data)) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + struct kvm_mmu_page *root; + int ret = 0; + int as_id; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + as_id = kvm_mmu_page_as_id(root); + slots = __kvm_memslots(kvm, as_id); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, memslot, root, gfn_start, + gfn_end, data); + } + + kvm_mmu_put_root(kvm, root); + } + + return ret; +} + +static int zap_gfn_range_hva_wrapper(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long unused) +{ + return zap_gfn_range(kvm, root, start, end, false); +} + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + zap_gfn_range_hva_wrapper); +} + +/* + * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero + * if any of the GFNs in the range have been accessed. + */ +static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, gfn_t end, + unsigned long unused) +{ + struct tdp_iter iter; + int young = 0; + u64 new_spte = 0; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * If we have a non-accessed entry we don't need to change the + * pte. + */ + if (!is_accessed_spte(iter.old_spte)) + continue; + + new_spte = iter.old_spte; + + if (spte_ad_enabled(new_spte)) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)&new_spte); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. 
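kvm_tdp_mmu_handle_hva_range() above clips the HVA range against each memslot and converts the result to a GFN range, rounding the exclusive end up by PAGE_SIZE - 1 first. The same arithmetic on a made-up memslot; struct toy_memslot and the numbers are invented for the demo.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ull << PAGE_SHIFT)

struct toy_memslot {
        uint64_t userspace_addr;   /* HVA where the slot is mapped */
        uint64_t base_gfn;         /* first guest frame of the slot */
        uint64_t npages;
};

static uint64_t hva_to_gfn(uint64_t hva, const struct toy_memslot *slot)
{
        return slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
        struct toy_memslot slot = { 0x7f0000000000ull, 0x100000, 0x40000 };
        uint64_t start = 0x7f0000123000ull, end = 0x7f0000188000ull;

        /* Clamp the HVA range to the slot, then convert to GFNs. */
        uint64_t hva_start = start > slot.userspace_addr ? start : slot.userspace_addr;
        uint64_t hva_lim = slot.userspace_addr + (slot.npages << PAGE_SHIFT);
        uint64_t hva_end = end < hva_lim ? end : hva_lim;

        if (hva_start < hva_end)
                printf("gfn range [%#llx, %#llx)\n",
                       (unsigned long long)hva_to_gfn(hva_start, &slot),
                       (unsigned long long)hva_to_gfn(hva_end + PAGE_SIZE - 1, &slot));
        return 0;
}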
+ */ + if (is_writable_pte(new_spte)) + kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + + new_spte = mark_spte_for_access_track(new_spte); + } + new_spte &= ~shadow_dirty_mask; + + tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); + young = 1; + } + + return young; +} + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + age_gfn_range); +} + +static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long unused2) +{ + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) + if (is_accessed_spte(iter.old_spte)) + return 1; + + return 0; +} + +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, + test_age_gfn); +} + +/* + * Handle the changed_pte MMU notifier for the TDP MMU. + * data is a pointer to the new pte_t mapping the HVA specified by the MMU + * notifier. + * Returns non-zero if a flush is needed before releasing the MMU lock. + */ +static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long data) +{ + struct tdp_iter iter; + pte_t *ptep = (pte_t *)data; + kvm_pfn_t new_pfn; + u64 new_spte; + int need_flush = 0; + + WARN_ON(pte_huge(*ptep)); + + new_pfn = pte_pfn(*ptep); + + tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { + if (iter.level != PG_LEVEL_4K) + continue; + + if (!is_shadow_present_pte(iter.old_spte)) + break; + + tdp_mmu_set_spte(kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); + + if (!pte_write(*ptep)) { + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + iter.old_spte, new_pfn); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + } + + need_flush = 1; + } + + if (need_flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + + return 0; +} + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, + (unsigned long)host_ptep, + set_tdp_spte); +} + +/* + * Remove write access from all the SPTEs mapping GFNs [start, end). If + * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, int min_level) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; + + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Remove write access from all the SPTEs mapping GFNs in the memslot. Will + * only affect leaf SPTEs down to min_level. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. 
+ */ +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages, min_level); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + if (spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 
+ */ +static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn, unsigned long mask, bool wrprot) +{ + struct tdp_iter iter; + u64 new_spte; + + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + gfn + BITS_PER_LONG) { + if (!mask) + break; + + if (iter.level > PG_LEVEL_4K || + !(mask & (1UL << (iter.gfn - gfn)))) + continue; + + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + + mask &= ~(1UL << (iter.gfn - gfn)); + } +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); + } +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + new_spte = iter.old_spte | shadow_dirty_mask; + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + + return spte_set; +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + return spte_set; +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. 
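clear_dirty_pt_masked() above consumes one 64-bit word of the dirty bitmap: each set bit names one 4KiB GFN relative to the base GFN, and bits are dropped from the mask as they are handled. The walk over the set bits can be sketched as follows, using the GCC/Clang __builtin_ctzl() builtin for "find lowest set bit"; the base GFN and mask are example values.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* One bit per 4KiB page, starting at base_gfn, as passed to
         * kvm_tdp_mmu_clear_dirty_pt_masked() by the dirty-log code. */
        uint64_t base_gfn = 0x1000;
        unsigned long mask = 0x0000000000000a05ul;   /* example dirty bitmap word */

        while (mask) {
                unsigned int bit = __builtin_ctzl(mask);   /* lowest set bit */

                printf("clear dirty state for gfn %#llx\n",
                       (unsigned long long)(base_gfn + bit));
                mask &= mask - 1;                          /* consume that bit */
        }
        return 0;
}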
+ */ +static void zap_collapsible_spte_range(struct kvm *kvm, + struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + kvm_pfn_t pfn; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + is_last_spte(iter.old_spte, iter.level)) + continue; + + pfn = spte_to_pfn(iter.old_spte); + if (kvm_is_reserved_pfn(pfn) || + !PageTransCompoundMap(pfn_to_page(pfn))) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + } + + if (spte_set) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + zap_collapsible_spte_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + if (!is_writable_pte(iter.old_spte)) + break; + + new_spte = iter.old_spte & + ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + } + + return spte_set; +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + spte_set |= write_protect_gfn(kvm, root, gfn); + } + return spte_set; +} + +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. 
+ */ +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + int leaf = vcpu->arch.mmu->shadow_root_level; + gfn_t gfn = addr >> PAGE_SHIFT; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + leaf = iter.level; + sptes[leaf - 1] = iter.old_spte; + } + + return leaf; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h new file mode 100644 index 000000000000..556e065503f6 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_MMU_H +#define __KVM_X86_MMU_TDP_MMU_H + +#include <linux/kvm_host.h> + +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); + +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +void kvm_tdp_mmu_zap_all(struct kvm *kvm); + +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault); + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep); + +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level); +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + struct kvm_memory_slot *slot); +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot); +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); + +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); + +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes); +#endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ac830cd50830..8c550999ace0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -153,20 +153,18 @@ int avic_vm_init(struct kvm *kvm) return 0; /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT); + p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!p_page) goto free_avic; kvm_svm->avic_physical_id_table_page = p_page; - clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT); + l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!l_page) goto free_avic; kvm_svm->avic_logical_id_table_page = l_page; - clear_page(page_address(l_page)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); again: @@ -868,6 +866,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * - Tell IOMMU to use legacy mode for this interrupt. * - Retrieve ga_tag of prior interrupt remapping data. 
*/ + pi.prev_ga_tag = 0; pi.is_guest_mode = false; ret = irq_set_vcpu_affinity(host_irq, &pi); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 598a769f1961..9e4c226dbf7d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -98,6 +98,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h, *g; + unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -108,42 +109,37 @@ void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested.ctl; - svm->nested.host_intercept_exceptions = h->intercept_exceptions; - - c->intercept_cr = h->intercept_cr; - c->intercept_dr = h->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions; - c->intercept = h->intercept; + for (i = 0; i < MAX_INTERCEPT; i++) + c->intercepts[i] = h->intercepts[i]; if (g->int_ctl & V_INTR_MASKING_MASK) { /* We only want the cr8 intercept bits of L1 */ - c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); - c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); + vmcb_clr_intercept(c, INTERCEPT_CR8_READ); + vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); /* * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not * affect any interrupt we may want to inject; therefore, * interrupt window vmexits are irrelevant to L0. */ - c->intercept &= ~(1ULL << INTERCEPT_VINTR); + vmcb_clr_intercept(c, INTERCEPT_VINTR); } /* We don't want to see VMMCALLs from a nested guest */ - c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); + vmcb_clr_intercept(c, INTERCEPT_VMMCALL); - c->intercept_cr |= g->intercept_cr; - c->intercept_dr |= g->intercept_dr; - c->intercept_exceptions |= g->intercept_exceptions; - c->intercept |= g->intercept; + for (i = 0; i < MAX_INTERCEPT; i++) + c->intercepts[i] |= g->intercepts[i]; } static void copy_vmcb_control_area(struct vmcb_control_area *dst, struct vmcb_control_area *from) { - dst->intercept_cr = from->intercept_cr; - dst->intercept_dr = from->intercept_dr; - dst->intercept_exceptions = from->intercept_exceptions; - dst->intercept = from->intercept; + unsigned int i; + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; @@ -176,7 +172,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -200,9 +196,23 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } +static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (!nested_svm_vmrun_msrpm(svm)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + + return true; +} + static bool nested_vmcb_check_controls(struct vmcb_control_area *control) { - if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0) return false; if (control->asid == 0) @@ -215,41 +225,39 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb) +static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) { - bool 
nested_vmcb_lma; - if ((vmcb->save.efer & EFER_SVME) == 0) + bool vmcb12_lma; + + if ((vmcb12->save.efer & EFER_SVME) == 0) return false; - if (((vmcb->save.cr0 & X86_CR0_CD) == 0) && - (vmcb->save.cr0 & X86_CR0_NW)) + if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW)) return false; - if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7)) + if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7)) return false; - nested_vmcb_lma = - (vmcb->save.efer & EFER_LME) && - (vmcb->save.cr0 & X86_CR0_PG); + vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); - if (!nested_vmcb_lma) { - if (vmcb->save.cr4 & X86_CR4_PAE) { - if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) + if (!vmcb12_lma) { + if (vmcb12->save.cr4 & X86_CR4_PAE) { + if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) return false; } else { - if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) + if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) return false; } } else { - if (!(vmcb->save.cr4 & X86_CR4_PAE) || - !(vmcb->save.cr0 & X86_CR0_PE) || - (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) + if (!(vmcb12->save.cr4 & X86_CR4_PAE) || + !(vmcb12->save.cr0 & X86_CR0_PE) || + (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK)) return false; } - if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4)) + if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; - return nested_vmcb_check_controls(&vmcb->control); + return nested_vmcb_check_controls(&vmcb12->control); } static void load_nested_vmcb_control(struct vcpu_svm *svm, @@ -296,7 +304,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) * EXIT_INT_INFO. */ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, - struct vmcb *nested_vmcb) + struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 exit_int_info = 0; @@ -308,7 +316,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, if (vcpu->arch.exception.has_error_code) { exit_int_info |= SVM_EVTINJ_VALID_ERR; - nested_vmcb->control.exit_int_info_err = + vmcb12->control.exit_int_info_err = vcpu->arch.exception.error_code; } @@ -325,7 +333,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, exit_int_info |= SVM_EVTINJ_TYPE_INTR; } - nested_vmcb->control.exit_int_info = exit_int_info; + vmcb12->control.exit_int_info = exit_int_info; } static inline bool nested_npt_enabled(struct vcpu_svm *svm) @@ -364,31 +372,31 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) +static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { /* Load the nested guest state */ - svm->vmcb->save.es = nested_vmcb->save.es; - svm->vmcb->save.cs = nested_vmcb->save.cs; - svm->vmcb->save.ss = nested_vmcb->save.ss; - svm->vmcb->save.ds = nested_vmcb->save.ds; - svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; - svm->vmcb->save.idtr = nested_vmcb->save.idtr; - kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); - svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); - svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); - svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; - kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); - kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); - kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); + svm->vmcb->save.es = vmcb12->save.es; + svm->vmcb->save.cs = vmcb12->save.cs; + 
svm->vmcb->save.ss = vmcb12->save.ss; + svm->vmcb->save.ds = vmcb12->save.ds; + svm->vmcb->save.gdtr = vmcb12->save.gdtr; + svm->vmcb->save.idtr = vmcb12->save.idtr; + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags); + svm_set_efer(&svm->vcpu, vmcb12->save.efer); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); + svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; + kvm_rax_write(&svm->vcpu, vmcb12->save.rax); + kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); + kvm_rip_write(&svm->vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = nested_vmcb->save.rax; - svm->vmcb->save.rsp = nested_vmcb->save.rsp; - svm->vmcb->save.rip = nested_vmcb->save.rip; - svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; - svm->vmcb->save.cpl = nested_vmcb->save.cpl; + svm->vmcb->save.rax = vmcb12->save.rax; + svm->vmcb->save.rsp = vmcb12->save.rsp; + svm->vmcb->save.rip = vmcb12->save.rip; + svm->vmcb->save.dr7 = vmcb12->save.dr7; + svm->vcpu.arch.dr6 = vmcb12->save.dr6; + svm->vmcb->save.cpl = vmcb12->save.cpl; } static void nested_prepare_vmcb_control(struct vcpu_svm *svm) @@ -426,17 +434,17 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb) +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, + struct vmcb *vmcb12) { int ret; - svm->nested.vmcb = vmcb_gpa; - load_nested_vmcb_control(svm, &nested_vmcb->control); - nested_prepare_vmcb_save(svm, nested_vmcb); + svm->nested.vmcb12_gpa = vmcb12_gpa; + load_nested_vmcb_control(svm, &vmcb12->control); + nested_prepare_vmcb_save(svm, vmcb12); nested_prepare_vmcb_control(svm); - ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); if (ret) return ret; @@ -449,19 +457,19 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, int nested_svm_vmrun(struct vcpu_svm *svm) { int ret; - struct vmcb *nested_vmcb; + struct vmcb *vmcb12; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - u64 vmcb_gpa; + u64 vmcb12_gpa; if (is_smm(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } - vmcb_gpa = svm->vmcb->save.rax; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); + vmcb12_gpa = svm->vmcb->save.rax; + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { kvm_inject_gp(&svm->vcpu, 0); return 1; @@ -471,26 +479,31 @@ int nested_svm_vmrun(struct vcpu_svm *svm) ret = kvm_skip_emulated_instruction(&svm->vcpu); - nested_vmcb = map.hva; + vmcb12 = map.hva; + + if (WARN_ON_ONCE(!svm->nested.initialized)) + return -EINVAL; - if (!nested_vmcb_checks(svm, nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; + if (!nested_vmcb_checks(svm, vmcb12)) { + vmcb12->control.exit_code = SVM_EXIT_ERR; + vmcb12->control.exit_code_hi = 0; + vmcb12->control.exit_info_1 = 0; + vmcb12->control.exit_info_2 = 0; goto out; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, + 
vmcb12->save.rip, + vmcb12->control.int_ctl, + vmcb12->control.event_inj, + vmcb12->control.nested_ctl); - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); + trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, + vmcb12->control.intercepts[INTERCEPT_CR] >> 16, + vmcb12->control.intercepts[INTERCEPT_EXCEPTION], + vmcb12->control.intercepts[INTERCEPT_WORD3], + vmcb12->control.intercepts[INTERCEPT_WORD4], + vmcb12->control.intercepts[INTERCEPT_WORD5]); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); @@ -522,7 +535,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb)) + if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) @@ -563,23 +576,23 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { int rc; - struct vmcb *nested_vmcb; + struct vmcb *vmcb12; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) kvm_inject_gp(&svm->vcpu, 0); return 1; } - nested_vmcb = map.hva; + vmcb12 = map.hva; /* Exit Guest-Mode */ leave_guest_mode(&svm->vcpu); - svm->nested.vmcb = 0; + svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); /* in case we halted in L2 */ @@ -587,45 +600,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Give the current vmcb to the guest */ - nested_vmcb->save.es = vmcb->save.es; - nested_vmcb->save.cs = vmcb->save.cs; - nested_vmcb->save.ss = vmcb->save.ss; - nested_vmcb->save.ds = vmcb->save.ds; - nested_vmcb->save.gdtr = vmcb->save.gdtr; - nested_vmcb->save.idtr = vmcb->save.idtr; - nested_vmcb->save.efer = svm->vcpu.arch.efer; - nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); - nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); - nested_vmcb->save.cr2 = vmcb->save.cr2; - nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); - nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu); - nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu); - nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu); - nested_vmcb->save.dr7 = vmcb->save.dr7; - nested_vmcb->save.dr6 = svm->vcpu.arch.dr6; - nested_vmcb->save.cpl = vmcb->save.cpl; - - nested_vmcb->control.int_state = vmcb->control.int_state; - nested_vmcb->control.exit_code = vmcb->control.exit_code; - nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; - nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; - nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; - - if (nested_vmcb->control.exit_code != SVM_EXIT_ERR) - nested_vmcb_save_pending_event(svm, nested_vmcb); + vmcb12->save.es = vmcb->save.es; + vmcb12->save.cs = vmcb->save.cs; + vmcb12->save.ss = vmcb->save.ss; + vmcb12->save.ds = vmcb->save.ds; + vmcb12->save.gdtr = vmcb->save.gdtr; + vmcb12->save.idtr = vmcb->save.idtr; + vmcb12->save.efer = svm->vcpu.arch.efer; + vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu); + vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu); + vmcb12->save.cr2 = vmcb->save.cr2; + vmcb12->save.cr4 = svm->vcpu.arch.cr4; + vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu); + 
vmcb12->save.rip = kvm_rip_read(&svm->vcpu); + vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu); + vmcb12->save.rax = kvm_rax_read(&svm->vcpu); + vmcb12->save.dr7 = vmcb->save.dr7; + vmcb12->save.dr6 = svm->vcpu.arch.dr6; + vmcb12->save.cpl = vmcb->save.cpl; + + vmcb12->control.int_state = vmcb->control.int_state; + vmcb12->control.exit_code = vmcb->control.exit_code; + vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; + vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; + vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; + + if (vmcb12->control.exit_code != SVM_EXIT_ERR) + nested_vmcb_save_pending_event(svm, vmcb12); if (svm->nrips_enabled) - nested_vmcb->control.next_rip = vmcb->control.next_rip; + vmcb12->control.next_rip = vmcb->control.next_rip; - nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl; - nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl; - nested_vmcb->control.event_inj = svm->nested.ctl.event_inj; - nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; + vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; + vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; + vmcb12->control.event_inj = svm->nested.ctl.event_inj; + vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - nested_vmcb->control.pause_filter_count = + vmcb12->control.pause_filter_count = svm->vmcb->control.pause_filter_count; - nested_vmcb->control.pause_filter_thresh = + vmcb12->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; /* Restore the original control entries */ @@ -659,11 +672,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); - trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, - nested_vmcb->control.exit_info_1, - nested_vmcb->control.exit_info_2, - nested_vmcb->control.exit_int_info, - nested_vmcb->control.exit_int_info_err, + trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, + vmcb12->control.exit_info_1, + vmcb12->control.exit_info_2, + vmcb12->control.exit_int_info, + vmcb12->control.exit_int_info_err, KVM_ISA_SVM); kvm_vcpu_unmap(&svm->vcpu, &map, true); @@ -688,6 +701,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } +int svm_allocate_nested(struct vcpu_svm *svm) +{ + struct page *hsave_page; + + if (svm->nested.initialized) + return 0; + + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!hsave_page) + return -ENOMEM; + svm->nested.hsave = page_address(hsave_page); + + svm->nested.msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->nested.msrpm) + goto err_free_hsave; + svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); + + svm->nested.initialized = true; + return 0; + +err_free_hsave: + __free_page(hsave_page); + return -ENOMEM; +} + +void svm_free_nested(struct vcpu_svm *svm) +{ + if (!svm->nested.initialized) + return; + + svm_vcpu_free_msrpm(svm->nested.msrpm); + svm->nested.msrpm = NULL; + + __free_page(virt_to_page(svm->nested.hsave)); + svm->nested.hsave = NULL; + + svm->nested.initialized = false; +} + /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. 
*/ @@ -702,6 +754,8 @@ void svm_leave_nested(struct vcpu_svm *svm) copy_vmcb_control_area(&vmcb->control, &hsave->control); nested_svm_uninit_mmu_context(&svm->vcpu); } + + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -709,7 +763,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -736,7 +790,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -767,14 +821,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.ctl.intercept_cr & bit) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.ctl.intercept_dr & bit) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } @@ -792,8 +844,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - if (svm->nested.ctl.intercept & exit_bits) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -833,7 +884,7 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm) { unsigned int nr = svm->vcpu.arch.exception.nr; - return (svm->nested.ctl.intercept_exceptions & (1 << nr)); + return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr)); } static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) @@ -901,7 +952,7 @@ static void nested_svm_intr(struct vcpu_svm *svm) static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static void nested_svm_init(struct vcpu_svm *svm) @@ -982,7 +1033,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm) case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits) + if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] & + excp_bits) return NESTED_EXIT_HOST; else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && svm->vcpu.arch.apf.host_apf_flags) @@ -1020,7 +1072,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, /* First fill in the header and copy it out. 
*/ if (is_guest_mode(vcpu)) { - kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb; + kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa; kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; @@ -1094,7 +1146,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { svm_leave_nested(svm); - goto out_set_gif; + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + return 0; } if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa)) @@ -1143,16 +1196,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); hsave->save = *save; - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; + svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; load_nested_vmcb_control(svm, ctl); nested_prepare_vmcb_control(svm); - if (!nested_svm_vmrun_msrpm(svm)) - return -EINVAL; - -out_set_gif: - svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); - + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: kfree(save); @@ -1163,6 +1211,7 @@ out_free: struct kvm_x86_nested_ops svm_nested_ops = { .check_events = svm_check_nested_events, + .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 3c9a45efdd4d..c0b14106258a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -447,10 +447,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) } /* - * The LAUNCH_UPDATE command will perform in-place encryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in + * place; the cache may contain the data that was written unencrypted. */ sev_clflush_pages(inpages, npages); @@ -806,10 +804,9 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) } /* - * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify + * the pages; flush the destination too so that future accesses do not + * see stale data. */ sev_clflush_pages(src_p, 1); sev_clflush_pages(dst_p, 1); @@ -857,7 +854,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) struct kvm_sev_launch_secret params; struct page **pages; void *blob, *hdr; - unsigned long n; + unsigned long n, i; int ret, offset; if (!sev_guest(kvm)) @@ -871,6 +868,12 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) return PTR_ERR(pages); /* + * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in + * place; the cache may contain the data that was written unencrypted. + */ + sev_clflush_pages(pages, n); + + /* * The secret must be copied into contiguous memory region, lets verify * that userspace memory pages are contiguous before we issue command. 
*/ @@ -915,6 +918,11 @@ e_free_blob: e_free: kfree(data); e_unpin_memory: + /* content of memory is updated, mark pages dirty */ + for (i = 0; i < n; i++) { + set_page_dirty_lock(pages[i]); + mark_page_accessed(pages[i]); + } sev_unpin_memory(kvm, pages, n); return ret; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9709c98d0d6c..2f32fd09e259 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -91,7 +91,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ -} direct_access_msrs[] = { +} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, #ifdef CONFIG_X86_64 @@ -263,9 +263,10 @@ static int get_max_npt_level(void) #endif } -void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_svm *svm = to_svm(vcpu); + u64 old_efer = vcpu->arch.efer; vcpu->arch.efer = efer; if (!npt_enabled) { @@ -276,13 +277,32 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; } - if (!(efer & EFER_SVME)) { - svm_leave_nested(svm); - svm_set_gif(svm, true); + if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { + if (!(efer & EFER_SVME)) { + svm_leave_nested(svm); + svm_set_gif(svm, true); + + /* + * Free the nested guest state, unless we are in SMM. + * In this case we will return to the nested guest + * as soon as we leave SMM. + */ + if (!is_smm(&svm->vcpu)) + svm_free_nested(svm); + + } else { + int ret = svm_allocate_nested(svm); + + if (ret) { + vcpu->arch.efer = old_efer; + return ret; + } + } } svm->vmcb->save.efer = efer | EFER_SVME; vmcb_mark_dirty(svm->vmcb, VMCB_CR); + return 0; } static int is_external_interrupt(u32 info) @@ -553,18 +573,44 @@ free_cpu_data: } -static bool valid_msr_intercept(u32 index) +static int direct_access_msr_slot(u32 msr) { - int i; + u32 i; for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == index) - return true; + if (direct_access_msrs[i].index == msr) + return i; - return false; + return -ENOENT; +} + +static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, + int write) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int slot = direct_access_msr_slot(msr); + + if (slot == -ENOENT) + return; + + /* Set the shadow bitmaps to the desired intercept states */ + if (read) + set_bit(slot, svm->shadow_msr_intercept.read); + else + clear_bit(slot, svm->shadow_msr_intercept.read); + + if (write) + set_bit(slot, svm->shadow_msr_intercept.write); + else + clear_bit(slot, svm->shadow_msr_intercept.write); } -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) +static bool valid_msr_intercept(u32 index) +{ + return direct_access_msr_slot(index) != -ENOENT; +} + +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { u8 bit_write; unsigned long tmp; @@ -583,8 +629,8 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) return !!test_bit(bit_write, &tmp); } -static void set_msr_interception(u32 *msrpm, unsigned msr, - int read, int write) +static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, + u32 msr, int read, int write) { u8 bit_read, bit_write; unsigned long tmp; @@ -596,6 +642,13 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, */ WARN_ON(!valid_msr_intercept(msr)); + /* Enforce non allowed MSRs to trap */ + if 
(read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + read = 0; + + if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + write = 0; + offset = svm_msrpm_offset(msr); bit_read = 2 * (msr & 0x0f); bit_write = 2 * (msr & 0x0f) + 1; @@ -609,17 +662,60 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, msrpm[offset] = tmp; } -static void svm_vcpu_init_msrpm(u32 *msrpm) +static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write) { - int i; + set_shadow_msr_intercept(vcpu, msr, read, write); + set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); +} + +u32 *svm_vcpu_alloc_msrpm(void) +{ + struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + u32 *msrpm; + + if (!pages) + return NULL; + msrpm = page_address(pages); memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + return msrpm; +} + +void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +{ + int i; + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { if (!direct_access_msrs[i].always) continue; + set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); + } +} - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + +void svm_vcpu_free_msrpm(u32 *msrpm) +{ + __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); +} + +static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 i; + + /* + * Set intercept permissions for all direct access MSRs again. They + * will automatically get filtered through the MSR filter, so we are + * back in sync after this. + */ + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + u32 msr = direct_access_msrs[i].index; + u32 read = test_bit(i, svm->shadow_msr_intercept.read); + u32 write = test_bit(i, svm->shadow_msr_intercept.write); + + set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); } } @@ -666,26 +762,26 @@ static void init_msrpm_offsets(void) } } -static void svm_enable_lbrv(struct vcpu_svm *svm) +static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -static void svm_disable_lbrv(struct vcpu_svm *svm) +static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } void disable_nmi_singlestep(struct vcpu_svm *svm) 
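/*
 * Illustrative sketch, not taken from the patch above: the MSR permission
 * bitmap layout that msr_write_intercepted() and set_msr_interception_bitmap()
 * in the preceding hunk rely on. Each u32 chunk of the msrpm covers 16 MSRs
 * with two bits per MSR; bit 2 * (msr & 0x0f) gates reads and the bit above it
 * gates writes, a set bit meaning the access is intercepted. The chunk index
 * is assumed to come from svm_msrpm_offset(), which is outside this hunk, so
 * a query equivalent to msr_write_intercepted(vcpu, msr) would be
 * msrpm_is_intercepted(msrpm, svm_msrpm_offset(msr), msr, true).
 */
static inline bool msrpm_is_intercepted(const u32 *msrpm, u32 offset, u32 msr,
					bool write)
{
	unsigned long tmp = msrpm[offset];
	u8 bit = 2 * (msr & 0x0f) + (write ? 1 : 0);

	return !!test_bit(bit, &tmp);
}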
@@ -813,6 +909,9 @@ static __init void svm_set_cpu_caps(void) if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* Enable INVPCID feature */ + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); } static __init int svm_hardware_setup(void) @@ -985,6 +1084,21 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) return svm->vmcb->control.tsc_offset; } +static void svm_check_invpcid(struct vcpu_svm *svm) +{ + /* + * Intercept INVPCID instruction only if shadow page table is + * enabled. Interception is not required with nested page table + * enabled. + */ + if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { + if (!npt_enabled) + svm_set_intercept(svm, INTERCEPT_INVPCID); + else + svm_clr_intercept(svm, INTERCEPT_INVPCID); + } +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -992,14 +1106,14 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vcpu.arch.hflags = 0; - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR3_READ); - set_cr_intercept(svm, INTERCEPT_CR4_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); - set_cr_intercept(svm, INTERCEPT_CR3_WRITE); - set_cr_intercept(svm, INTERCEPT_CR4_WRITE); + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR3_READ); + svm_set_intercept(svm, INTERCEPT_CR4_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); + svm_set_intercept(svm, INTERCEPT_CR3_WRITE); + svm_set_intercept(svm, INTERCEPT_CR4_WRITE); if (!kvm_vcpu_apicv_active(&svm->vcpu)) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_set_intercept(svm, INTERCEPT_CR8_WRITE); set_dr_intercepts(svm); @@ -1094,15 +1208,15 @@ static void init_vmcb(struct vcpu_svm *svm) control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; svm_clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); - clr_cr_intercept(svm, INTERCEPT_CR3_READ); - clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR3_READ); + svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = svm->vcpu.arch.pat; save->cr3 = 0; save->cr4 = 0; } svm->asid_generation = 0; - svm->nested.vmcb = 0; + svm->nested.vmcb12_gpa = 0; svm->vcpu.arch.hflags = 0; if (!kvm_pause_in_guest(svm->vcpu.kvm)) { @@ -1114,6 +1228,8 @@ static void init_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_PAUSE); } + svm_check_invpcid(svm); + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); @@ -1171,35 +1287,20 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; - struct page *page; - struct page *msrpm_pages; - struct page *hsave_page; - struct page *nested_msrpm_pages; + struct page *vmcb_page; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); svm = to_svm(vcpu); err = -ENOMEM; - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) + vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmcb_page) goto out; - msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) - goto free_page1; - - nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!nested_msrpm_pages) - goto free_page2; - - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!hsave_page) - goto free_page3; - err = avic_init_vcpu(svm); if (err) - goto free_page4; + goto error_free_vmcb_page; /* We initialize this flag to true to make sure that the is_running * bit would be set 
the first time the vcpu is loaded. @@ -1207,18 +1308,14 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) svm->avic_is_running = true; - svm->nested.hsave = page_address(hsave_page); - clear_page(svm->nested.hsave); - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + svm->msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->msrpm) + goto error_free_vmcb_page; - svm->nested.msrpm = page_address(nested_msrpm_pages); - svm_vcpu_init_msrpm(svm->nested.msrpm); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->vmcb = page_address(page); - clear_page(svm->vmcb); - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); + svm->vmcb = page_address(vmcb_page); + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); svm->asid_generation = 0; init_vmcb(svm); @@ -1227,14 +1324,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) return 0; -free_page4: - __free_page(hsave_page); -free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); -free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); -free_page1: - __free_page(page); +error_free_vmcb_page: + __free_page(vmcb_page); out: return err; } @@ -1258,10 +1349,10 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) */ svm_clear_current_vmcb(svm->vmcb); + svm_free_nested(svm); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm->nested.hsave)); - __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1549,11 +1640,11 @@ static void update_cr0_intercept(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_CR); if (gcr0 == *hcr0) { - clr_cr_intercept(svm, INTERCEPT_CR0_READ); - clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR0_READ); + svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); } else { - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); } } @@ -2224,12 +2315,9 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, { unsigned long cr0 = svm->vcpu.arch.cr0; bool ret = false; - u64 intercept; - - intercept = svm->nested.ctl.intercept; if (!is_guest_mode(&svm->vcpu) || - (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) + (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; cr0 &= ~SVM_CR0_SELECTIVE_MASK; @@ -2267,6 +2355,7 @@ static int cr_interception(struct vcpu_svm *svm) if (cr >= 16) { /* mov to cr */ cr -= 16; val = kvm_register_read(&svm->vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: if (!check_selective_cr0_intercepted(svm, val)) @@ -2312,6 +2401,7 @@ static int cr_interception(struct vcpu_svm *svm) return 1; } kvm_register_write(&svm->vcpu, reg, val); + trace_kvm_cr_read(cr, val); } return kvm_complete_insn_gp(&svm->vcpu, err); } @@ -2562,7 +2652,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. 
*/ - set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && @@ -2577,7 +2667,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -2641,9 +2731,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb->save.dbgctl = data; vmcb_mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) - svm_enable_lbrv(svm); + svm_enable_lbrv(vcpu); else - svm_disable_lbrv(svm); + svm_disable_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: svm->nested.hsave_msr = data; @@ -2739,6 +2829,33 @@ static int mwait_interception(struct vcpu_svm *svm) return nop_interception(svm); } +static int invpcid_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + unsigned long type; + gva_t gva; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* + * For an INVPCID intercept: + * EXITINFO1 provides the linear address of the memory operand. + * EXITINFO2 provides the contents of the register operand. + */ + type = svm->vmcb->control.exit_info_2; + gva = svm->vmcb->control.exit_info_1; + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_handle_invpcid(vcpu, type, gva); +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -2801,6 +2918,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_RDPRU] = rdpru_interception, + [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, @@ -2819,12 +2937,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) } pr_err("VMCB Control Area:\n"); - pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); - pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); - pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); - pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); - pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); - pr_err("%-20s%016llx\n", "intercepts:", control->intercept); + pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); + pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); + pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); + pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); + pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); + pr_err("%-20s%08x %08x\n", "intercepts:", + control->intercepts[INTERCEPT_WORD3], + control->intercepts[INTERCEPT_WORD4]); pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); pr_err("%-20s%d\n", "pause filter threshold:", control->pause_filter_thresh); @@ -2923,12 +3043,19 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 
*info2, + u32 *intr_info, u32 *error_code) { struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; *info1 = control->exit_info_1; *info2 = control->exit_info_2; + *intr_info = control->exit_int_info; + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->exit_int_info_err; + else + *error_code = 0; } static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) @@ -2939,7 +3066,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) + if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; @@ -2947,12 +3074,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, - svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, - svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err, - KVM_ISA_SVM); + trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); @@ -3062,13 +3184,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) if (nested_svm_virtualize_tpr(vcpu)) return; - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); if (irr == -1) return; if (tpr >= irr) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_set_intercept(svm, INTERCEPT_CR8_WRITE); } bool svm_nmi_blocked(struct kvm_vcpu *vcpu) @@ -3256,7 +3378,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) if (nested_svm_virtualize_tpr(vcpu)) return; - if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { + if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; kvm_set_cr8(vcpu, cr8); } @@ -3353,8 +3475,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -3419,7 +3540,6 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { - fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -3460,9 +3580,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -3542,8 +3660,11 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); - exit_fastpath = svm_exit_handlers_fastpath(vcpu); - return exit_fastpath; + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + return svm_exit_handlers_fastpath(vcpu); } static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, @@ -3629,6 +3750,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); + /* Check again if 
INVPCID interception if required */ + svm_check_invpcid(svm); + if (!kvm_vcpu_apicv_active(vcpu)) return; @@ -3743,7 +3867,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, break; case SVM_EXIT_WRITE_CR0: { unsigned long cr0, val; - u64 intercept; if (info->intercept == x86_intercept_cr_write) icpt_info.exit_code += info->modrm_reg; @@ -3752,9 +3875,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - intercept = svm->nested.ctl.intercept; - - if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) + if (!(vmcb_is_intercept(&svm->nested.ctl, + INTERCEPT_SELECTIVE_CR0))) break; cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; @@ -3889,7 +4011,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) /* FED8h - SVM Guest */ put_smstate(u64, smstate, 0x7ed8, 1); /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb); + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -3911,7 +4033,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); - u64 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); + u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); if (guest) { if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) @@ -3921,10 +4043,13 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 1; if (kvm_vcpu_map(&svm->vcpu, - gpa_to_gfn(vmcb), &map) == -EINVAL) + gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + return 1; + + if (svm_allocate_nested(svm)) return 1; - ret = enter_svm_guest_mode(svm, vmcb, map.hva); + ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva); kvm_vcpu_unmap(&svm->vcpu, &map, true); } } @@ -3945,19 +4070,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) } } -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { - unsigned long cr4 = kvm_read_cr4(vcpu); - bool smep = cr4 & X86_CR4_SMEP; - bool smap = cr4 & X86_CR4_SMAP; - bool is_user = svm_get_cpl(vcpu) == 3; - - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. - */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; + bool smep, smap, is_user; + unsigned long cr4; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -3999,6 +4115,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) * instruction pointer so we will not able to workaround it. Lets * print the error and request to kill the guest. */ + if (likely(!insn || insn_len)) + return true; + + /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. + */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + + cr4 = kvm_read_cr4(vcpu); + smep = cr4 & X86_CR4_SMEP; + smap = cr4 & X86_CR4_SMAP; + is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { if (!sev_guest(vcpu->kvm)) return true; @@ -4022,7 +4152,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) * if an INIT signal is pending. 
*/ return !gif_set(svm) || - (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); + (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); } static void svm_vm_destroy(struct kvm *kvm) @@ -4160,9 +4290,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + .can_emulate_instruction = svm_can_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, + + .msr_filter_changed = svm_msr_filter_changed, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a798e1731709..1d853fe4c778 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -31,6 +31,7 @@ static const u32 host_save_user_msrs[] = { #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) +#define MAX_DIRECT_ACCESS_MSRS 15 #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -85,8 +86,7 @@ struct svm_nested_state { struct vmcb *hsave; u64 hsave_msr; u64 vm_cr_msr; - u64 vmcb; - u32 host_intercept_exceptions; + u64 vmcb12_gpa; /* These are the merged vectors */ u32 *msrpm; @@ -97,6 +97,8 @@ struct svm_nested_state { /* cache for control fields of the guest */ struct vmcb_control_area ctl; + + bool initialized; }; struct vcpu_svm { @@ -158,6 +160,12 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; + + /* Save desired MSR intercept (read: pass-through) state */ + struct { + DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); + DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); + } shadow_msr_intercept; }; struct svm_cpu_data { @@ -214,51 +222,44 @@ static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) return svm->vmcb; } -static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) +static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr |= (1U << bit); - - recalc_intercepts(svm); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __set_bit(bit, (unsigned long *)&control->intercepts); } -static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) +static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr &= ~(1U << bit); - - recalc_intercepts(svm); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __clear_bit(bit, (unsigned long *)&control->intercepts); } -static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) +static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - return vmcb->control.intercept_cr & (1U << bit); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, (unsigned long *)&control->intercepts); } static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) - | (1 << INTERCEPT_DR1_READ) - | (1 << INTERCEPT_DR2_READ) - | (1 << INTERCEPT_DR3_READ) - | (1 << INTERCEPT_DR4_READ) - | (1 << INTERCEPT_DR5_READ) - | (1 << INTERCEPT_DR6_READ) - | (1 << INTERCEPT_DR7_READ) - | (1 << INTERCEPT_DR0_WRITE) - | (1 << INTERCEPT_DR1_WRITE) - | (1 << INTERCEPT_DR2_WRITE) - | (1 << INTERCEPT_DR3_WRITE) - | (1 << INTERCEPT_DR4_WRITE) - | (1 << INTERCEPT_DR5_WRITE) - | (1 << 
INTERCEPT_DR6_WRITE) - | (1 << INTERCEPT_DR7_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } @@ -267,25 +268,27 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr = 0; + vmcb->control.intercepts[INTERCEPT_DR] = 0; recalc_intercepts(svm); } -static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) +static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_exceptions |= (1U << bit); + WARN_ON_ONCE(bit >= 32); + vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); recalc_intercepts(svm); } -static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_exceptions &= ~(1U << bit); + WARN_ON_ONCE(bit >= 32); + vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); recalc_intercepts(svm); } @@ -294,7 +297,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept |= (1ULL << bit); + vmcb_set_intercept(&vmcb->control, bit); recalc_intercepts(svm); } @@ -303,14 +306,14 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept &= ~(1ULL << bit); + vmcb_clr_intercept(&vmcb->control, bit); recalc_intercepts(svm); } static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) { - return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; + return vmcb_is_intercept(&svm->vmcb->control, bit); } static inline bool vgif_enabled(struct vcpu_svm *svm) @@ -345,11 +348,15 @@ static inline bool gif_set(struct vcpu_svm *svm) /* svm.c */ #define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U #define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U -#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U +#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U #define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); -void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); +u32 *svm_vcpu_alloc_msrpm(void); +void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); +void svm_vcpu_free_msrpm(u32 *msrpm); + +int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -374,22 +381,24 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) 
static inline bool nested_exit_on_smi(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); } static inline bool nested_exit_on_intr(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); } static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb); void svm_leave_nested(struct vcpu_svm *svm); +void svm_free_nested(struct vcpu_svm *svm); +int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b66432b015d2..aef960f90f26 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -15,18 +15,20 @@ * Tracepoint for guest mode entry. */ TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) + __field( unsigned long, rip ) ), TP_fast_assign( - __entry->vcpu_id = vcpu_id; + __entry->vcpu_id = vcpu->vcpu_id; + __entry->rip = kvm_rip_read(vcpu); ), - TP_printk("vcpu %u", __entry->vcpu_id) + TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) ); /* @@ -233,36 +235,45 @@ TRACE_EVENT(kvm_apic, (isa == KVM_ISA_VMX) ? \ __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" +#define TRACE_EVENT_KVM_EXIT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \ + TP_ARGS(exit_reason, vcpu, isa), \ + \ + TP_STRUCT__entry( \ + __field( unsigned int, exit_reason ) \ + __field( unsigned long, guest_rip ) \ + __field( u32, isa ) \ + __field( u64, info1 ) \ + __field( u64, info2 ) \ + __field( u32, intr_info ) \ + __field( u32, error_code ) \ + __field( unsigned int, vcpu_id ) \ + ), \ + \ + TP_fast_assign( \ + __entry->exit_reason = exit_reason; \ + __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->isa = isa; \ + __entry->vcpu_id = vcpu->vcpu_id; \ + kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \ + &__entry->info2, \ + &__entry->intr_info, \ + &__entry->error_code); \ + ), \ + \ + TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \ + "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \ + __entry->vcpu_id, \ + kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \ + __entry->guest_rip, __entry->info1, __entry->info2, \ + __entry->intr_info, __entry->error_code) \ +) + /* * Tracepoint for kvm guest exit: */ -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), - TP_ARGS(exit_reason, vcpu, isa), - - TP_STRUCT__entry( - __field( unsigned int, exit_reason ) - __field( unsigned long, guest_rip ) - __field( u32, isa ) - __field( u64, info1 ) - __field( u64, info2 ) - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->exit_reason = exit_reason; - __entry->guest_rip = kvm_rip_read(vcpu); - __entry->isa = isa; - __entry->vcpu_id = vcpu->vcpu_id; - kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, - &__entry->info2); - ), - - TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx", - __entry->vcpu_id, - 
kvm_print_exit_reason(__entry->exit_reason, __entry->isa), - __entry->guest_rip, __entry->info1, __entry->info2) -); +TRACE_EVENT_KVM_EXIT(kvm_exit); /* * Tracepoint for kvm interrupt injection: @@ -544,63 +555,38 @@ TRACE_EVENT(kvm_nested_vmrun, ); TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), - TP_ARGS(cr_read, cr_write, exceptions, intercept), + TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, + __u32 intercept1, __u32 intercept2, __u32 intercept3), + TP_ARGS(cr_read, cr_write, exceptions, intercept1, + intercept2, intercept3), TP_STRUCT__entry( __field( __u16, cr_read ) __field( __u16, cr_write ) __field( __u32, exceptions ) - __field( __u64, intercept ) + __field( __u32, intercept1 ) + __field( __u32, intercept2 ) + __field( __u32, intercept3 ) ), TP_fast_assign( __entry->cr_read = cr_read; __entry->cr_write = cr_write; __entry->exceptions = exceptions; - __entry->intercept = intercept; + __entry->intercept1 = intercept1; + __entry->intercept2 = intercept2; + __entry->intercept3 = intercept3; ), - TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", - __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept) + TP_printk("cr_read: %04x cr_write: %04x excp: %08x " + "intercepts: %08x %08x %08x", + __entry->cr_read, __entry->cr_write, __entry->exceptions, + __entry->intercept1, __entry->intercept2, __entry->intercept3) ); /* * Tracepoint for #VMEXIT while nested */ -TRACE_EVENT(kvm_nested_vmexit, - TP_PROTO(__u64 rip, __u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - __entry->rip, - kvm_print_exit_reason(__entry->exit_code, __entry->isa), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); +TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); /* * Tracepoint for #VMEXIT reinjected to the guest diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 4bbd8b448d22..3a1861403d73 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -151,7 +151,7 @@ static inline bool vmx_umip_emulated(void) static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_ENABLE_RDTSCP; } static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) @@ -196,7 +196,7 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vmx_rdrand_supported(void) +static inline bool cpu_has_vmx_rdrand(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDRAND_EXITING; @@ -233,7 +233,7 @@ static inline bool cpu_has_vmx_encls_vmexit(void) SECONDARY_EXEC_ENCLS_EXITING; } -static inline bool 
vmx_rdseed_supported(void) +static inline bool cpu_has_vmx_rdseed(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDSEED_EXITING; @@ -244,13 +244,13 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } -static inline bool vmx_xsaves_supported(void) +static inline bool cpu_has_vmx_xsaves(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_XSAVES; } -static inline bool vmx_waitpkg_supported(void) +static inline bool cpu_has_vmx_waitpkg(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 19e2265956ba..89af692deb7e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -233,6 +233,44 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs = NULL; } +static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, + struct loaded_vmcs *prev) +{ + struct vmcs_host_state *dest, *src; + + if (unlikely(!vmx->guest_state_loaded)) + return; + + src = &prev->host_state; + dest = &vmx->loaded_vmcs->host_state; + + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + dest->ldt_sel = src->ldt_sel; +#ifdef CONFIG_X86_64 + dest->ds_sel = src->ds_sel; + dest->es_sel = src->es_sel; +#endif +} + +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *prev; + int cpu; + + if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) + return; + + cpu = get_cpu(); + prev = vmx->loaded_vmcs; + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_sync_vmcs_host_state(vmx, prev); + put_cpu(); + + vmx_register_cache_reset(vcpu); +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -241,10 +279,13 @@ static void free_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; @@ -277,44 +318,6 @@ static void free_nested(struct kvm_vcpu *vcpu) free_loaded_vmcs(&vmx->nested.vmcs02); } -static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, - struct loaded_vmcs *prev) -{ - struct vmcs_host_state *dest, *src; - - if (unlikely(!vmx->guest_state_loaded)) - return; - - src = &prev->host_state; - dest = &vmx->loaded_vmcs->host_state; - - vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); - dest->ldt_sel = src->ldt_sel; -#ifdef CONFIG_X86_64 - dest->ds_sel = src->ds_sel; - dest->es_sel = src->es_sel; -#endif -} - -static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct loaded_vmcs *prev; - int cpu; - - if (vmx->loaded_vmcs == vmcs) - return; - - cpu = get_cpu(); - prev = vmx->loaded_vmcs; - vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); - vmx_sync_vmcs_host_state(vmx, prev); - put_cpu(); - - vmx_register_cache_reset(vcpu); -} - /* * Ensure that the current vmcs of the logical processor is the * vmcs01 of the vcpu before calling free_nested(). 
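The trace.h hunk above folds kvm_exit and kvm_nested_vmexit into a single TRACE_EVENT_KVM_EXIT() generator so both tracepoints always record the same fields, including the newly added intr_info and error_code. The sketch below shows the same define-once, instantiate-twice idea in plain C, with made-up names and printf() standing in for the tracing machinery:

#include <stdio.h>

/* One body stamps out both reporting functions, so their argument lists
 * and output format cannot drift apart. Purely illustrative. */
#define DEFINE_EXIT_REPORT(fn) \
static void fn(unsigned int vcpu_id, unsigned int reason, unsigned long long rip) \
{ printf(#fn ": vcpu %u reason %u rip 0x%llx\n", vcpu_id, reason, rip); }

DEFINE_EXIT_REPORT(report_exit)        /* stands in for kvm_exit          */
DEFINE_EXIT_REPORT(report_nested_exit) /* stands in for kvm_nested_vmexit */

int main(void)
{
	report_exit(0, 12, 0xffffffff81000000ULL);
	report_nested_exit(0, 30, 0x401000ULL);
	return 0;
}

Generating both events from one macro means a new field is added in exactly one place, which is how the nested-vmexit tracepoint picks up intr_info and error_code in this series without duplicating the definition.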
@@ -323,8 +326,6 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); vmx_leave_nested(vcpu); - vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); - free_nested(vcpu); vcpu_put(vcpu); } @@ -938,11 +939,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * VM-exit in L0, use the more accurate value. */ if (msr_index == MSR_IA32_TSC) { - int index = vmx_find_msr_index(&vmx->msr_autostore.guest, - MSR_IA32_TSC); + int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, + MSR_IA32_TSC); - if (index >= 0) { - u64 val = vmx->msr_autostore.guest.val[index].value; + if (i >= 0) { + u64 val = vmx->msr_autostore.guest.val[i].value; *data = kvm_read_l1_tsc(vcpu, val); return true; @@ -1031,16 +1032,16 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msrs *autostore = &vmx->msr_autostore.guest; bool in_vmcs12_store_list; - int msr_autostore_index; + int msr_autostore_slot; bool in_autostore_list; int last; - msr_autostore_index = vmx_find_msr_index(autostore, msr_index); - in_autostore_list = msr_autostore_index >= 0; + msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); + in_autostore_list = msr_autostore_slot >= 0; in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); if (in_vmcs12_store_list && !in_autostore_list) { - if (autostore->nr == NR_LOADSTORE_MSRS) { + if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by @@ -1057,7 +1058,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, autostore->val[last].index = msr_index; } else if (!in_vmcs12_store_list && in_autostore_list) { last = --autostore->nr; - autostore->val[msr_autostore_index] = autostore->val[last]; + autostore->val[msr_autostore_slot] = autostore->val[last]; } } @@ -2286,7 +2287,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -2314,6 +2315,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + secondary_exec_controls_set(vmx, exec_control); } @@ -2408,6 +2412,8 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + + vmx->segment_cache.bitmask = 0; } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & @@ -2571,7 +2577,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. 
*/ - if (vmx->emulation_required) { + if (CC(!vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -3344,8 +3350,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - if (unlikely(!nested_get_vmcs12_pages(vcpu))) + if (unlikely(!nested_get_vmcs12_pages(vcpu))) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_KVM_INTERNAL_ERROR; + } if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); @@ -3387,7 +3395,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * to nested_get_vmcs12_pages before the next VM-entry. The MSRs * have already been set at vmentry time and should not be reset. */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } /* @@ -3468,11 +3476,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (evmptrld_status == EVMPTRLD_ERROR) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; - } else if (evmptrld_status == EVMPTRLD_VMFAIL) { + } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { return nested_vmx_failInvalid(vcpu); } - if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) + if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -3483,7 +3491,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * rather than RFLAGS.ZF, and no error number is stored to the * VM-instruction error field. */ - if (vmcs12->hdr.shadow_vmcs) + if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); if (vmx->nested.hv_evmcs) { @@ -3504,10 +3512,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ - if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) + if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); - if (vmcs12->launch_state == launch) + if (CC(vmcs12->launch_state == launch)) return nested_vmx_fail(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); @@ -3528,6 +3536,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; + /* Emulate processing of posted interrupts on VM-Enter. */ + if (nested_cpu_has_posted_intr(vmcs12) && + kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); + } + /* Hide L1D cache contents from the nested guest. 
*/ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -4257,7 +4273,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) { - struct shared_msr_entry *efer_msr; + struct vmx_uret_msr *efer_msr; unsigned int i; if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) @@ -4271,7 +4287,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) return vmx->msr_autoload.guest.val[i].value; } - efer_msr = find_msr_entry(vmx, MSR_EFER); + efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); if (efer_msr) return efer_msr->data; @@ -4696,7 +4712,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); if (r != X86EMUL_CONTINUE) { - *ret = vmx_handle_memory_failure(vcpu, r, &e); + *ret = kvm_handle_memory_failure(vcpu, r, &e); return -EINVAL; } @@ -4760,7 +4776,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); } return 0; @@ -5003,7 +5019,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* _system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); } return nested_vmx_succeed(vcpu); @@ -5076,7 +5092,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); } field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); @@ -5238,7 +5254,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, sizeof(gpa_t), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); return nested_vmx_succeed(vcpu); } @@ -5291,7 +5307,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); /* * Nested EPT roots are always held through guest_mmu, @@ -5373,7 +5389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) return nested_vmx_fail(vcpu, @@ -5918,13 +5934,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - exit_intr_info = vmx_get_intr_info(vcpu); - exit_qual = vmx_get_exit_qual(vcpu); - - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, - vmx->idt_vectoring_info, exit_intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) @@ -5940,14 +5950,14 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) * need to be synthesized by querying the in-kernel LAPIC, but external * interrupts are never reflected to L1 so it's a non-issue. 
*/ - if ((exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + exit_intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(exit_intr_info)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); } + exit_qual = vmx_get_exit_qual(vcpu); reflect_vmexit: nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); @@ -6182,7 +6192,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages(). */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } else { return -EINVAL; } @@ -6318,7 +6328,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | @@ -6337,7 +6348,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); @@ -6391,7 +6403,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -6561,7 +6573,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .hv_timer_pending = nested_vmx_preemption_timer_pending, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, - .get_vmcs12_pages = nested_get_vmcs12_pages, + .get_nested_state_pages = nested_get_vmcs12_pages, .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c new file mode 100644 index 000000000000..e4e7adff818c --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm_host.h> + +#include <asm/irq_remapping.h> +#include <asm/cpu.h> + +#include "lapic.h" +#include "posted_intr.h" +#include "trace.h" +#include "vmx.h" + +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + +static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. 
+ */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + + /* The full case. */ + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + new.sn = 0; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + +after_clear_sn: + + /* + * Clear SN before reading the bitmap. The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. + */ + smp_mb__after_atomic(); + + if (!pi_is_pir_empty(pi_desc)) + pi_set_on(pi_desc); +} + +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); +} + +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. 
+ * + */ +int pi_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return 0; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } + + do { + old.control = new.control = pi_desc->control; + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); +} + +void pi_post_block(struct kvm_vcpu *vcpu) +{ + if (vcpu->pre_pcpu == -1) + return; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); +} + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
+ */ +void pi_wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +void __init pi_init(int cpu) +{ + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); +} + + +/* + * pi_update_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, + bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = 0; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(kvm->vcpus[0])) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. + */ + + kvm_set_msi_irq(kvm, e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. 
+ */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + + continue; + } + + vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else + ret = irq_set_vcpu_affinity(host_irq, NULL); + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h new file mode 100644 index 000000000000..e53b97f82097 --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_POSTED_INTR_H +#define __KVM_X86_VMX_POSTED_INTR_H + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); +int pi_pre_block(struct kvm_vcpu *vcpu); +void pi_post_block(struct kvm_vcpu *vcpu); +void pi_wakeup_handler(void); +void __init pi_init(int cpu); +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); +int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, + bool set); + +#endif /* __KVM_X86_VMX_POSTED_INTR_H */
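posted_intr.h above packs ON, SN, NV and NDST into one 64-bit control word precisely so that vmx_vcpu_pi_load() and pi_pre_block() can retarget a descriptor with a single cmpxchg64() loop. Below is a user-space approximation of that read-modify-compare-exchange pattern using C11 atomics; the field layout and names are simplified stand-ins, not the kernel's pi_desc:

#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

/* Illustrative layout only; the real descriptor is defined in posted_intr.h. */
union pi_control {
	struct {
		uint64_t on    : 1;  /* outstanding notification           */
		uint64_t sn    : 1;  /* suppress notification              */
		uint64_t rsvd1 : 14;
		uint64_t nv    : 8;  /* notification vector                */
		uint64_t rsvd2 : 8;
		uint64_t ndst  : 32; /* notification destination (APIC id) */
	};
	uint64_t raw;
};

/*
 * Point the descriptor at a new destination/vector without a lock,
 * mirroring the do { ... } while (cmpxchg64(...) != old.control) loops
 * in posted_intr.c.
 */
static void pi_retarget(_Atomic uint64_t *control, uint32_t dest, uint8_t vector)
{
	union pi_control old, new;

	old.raw = atomic_load(control);
	do {
		new.raw = old.raw;
		new.ndst = dest;
		new.nv = vector;
		new.sn = 0;	/* stop suppressing notifications */
	} while (!atomic_compare_exchange_weak(control, &old.raw, new.raw));
}

int main(void)
{
	_Atomic uint64_t control = 0;

	pi_retarget(&control, 3, 0xf2);
	printf("control = %#llx\n", (unsigned long long)atomic_load(&control));
	return 0;
}

Whoever loses the race simply rereads the word and retries, which is why the wakeup and migration paths in the new file can update NV, NDST and SN without taking a lock.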
\ No newline at end of file diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7a3675fddec2..1472c6c376f7 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -138,6 +138,13 @@ static inline bool is_external_intr(u32 intr_info) return is_intr_type(intr_info, INTR_TYPE_EXT_INTR); } +static inline bool is_exception_with_error_code(u32 intr_info) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK; + + return (intr_info & mask) == mask; +} + enum vmcs_field_width { VMCS_FIELD_WIDTH_U16 = 0, VMCS_FIELD_WIDTH_U64 = 1, diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 799db084a336..90ad7a6246e3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -4,6 +4,7 @@ #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> +#include <asm/segment.h> #define WORD_SIZE (BITS_PER_LONG / 8) @@ -294,3 +295,36 @@ SYM_FUNC_START(vmread_error_trampoline) ret SYM_FUNC_END(vmread_error_trampoline) + +SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + CALL_NOSPEC _ASM_ARG1 + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + mov %_ASM_BP, %_ASM_SP + pop %_ASM_BP + ret +SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f0a9954c49db..ab6d2d1525ec 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -56,7 +56,6 @@ #include "lapic.h" #include "mmu.h" #include "nested.h" -#include "ops.h" #include "pmu.h" #include "trace.h" #include "vmcs.h" @@ -149,8 +148,25 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ RTIT_STATUS_BYTECNT)) -#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ - (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) +/* + * List of MSRs that can be directly passed to the guest. + * In addition to these x2apic and PT MSRs are handled specially. 
+ */ +static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { + MSR_IA32_SPEC_CTRL, + MSR_IA32_PRED_CMD, + MSR_IA32_TSC, + MSR_FS_BASE, + MSR_GS_BASE, + MSR_KERNEL_GS_BASE, + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_ESP, + MSR_IA32_SYSENTER_EIP, + MSR_CORE_C1_RES, + MSR_CORE_C3_RESIDENCY, + MSR_CORE_C6_RESIDENCY, + MSR_CORE_C7_RESIDENCY, +}; /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: @@ -344,9 +360,8 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); -static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); void vmx_vmexit(void); @@ -401,13 +416,6 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs); */ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -/* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. - */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); - static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -450,9 +458,9 @@ static unsigned long host_idt_base; * will emulate SYSCALL in legacy mode if the vendor string in guest * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To * support this emulation, IA32_STAR must always be included in - * vmx_msr_index[], even in i386 builds. + * vmx_uret_msrs_list[], even in i386 builds. */ -const u32 vmx_msr_index[] = { +static const u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif @@ -626,36 +634,71 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } -static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +static int possible_passthrough_msr_slot(u32 msr) +{ + u32 i; + + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) + if (vmx_possible_passthrough_msrs[i] == msr) + return i; + + return -ENOENT; +} + +static bool is_valid_passthrough_msr(u32 msr) +{ + bool r; + + switch (msr) { + case 0x800 ... 0x8ff: + /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ + return true; + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + /* PT MSRs. 
These are handled in pt_update_intercept_for_msr() */ + return true; + } + + r = possible_passthrough_msr_slot(msr) != -ENOENT; + + WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + + return r; +} + +static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - for (i = 0; i < vmx->nmsrs; ++i) - if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) + for (i = 0; i < vmx->nr_uret_msrs; ++i) + if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr) return i; return -1; } -struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __find_msr_index(vmx, msr); + i = __vmx_find_uret_msr(vmx, msr); if (i >= 0) - return &vmx->guest_msrs[i]; + return &vmx->guest_uret_msrs[i]; return NULL; } -static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data) +static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, + struct vmx_uret_msr *msr, u64 data) { int ret = 0; u64 old_msr_data = msr->data; msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask); preempt_enable(); if (ret) msr->data = old_msr_data; @@ -840,7 +883,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } -int vmx_find_msr_index(struct vmx_msrs *m, u32 msr) +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; @@ -874,7 +917,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = vmx_find_msr_index(&m->guest, msr); + i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; @@ -882,7 +925,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: - i = vmx_find_msr_index(&m->host, msr); + i = vmx_find_loadstore_msr_slot(&m->host, msr); if (i < 0) return; @@ -941,12 +984,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - i = vmx_find_msr_index(&m->guest, msr); + i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (!entry_only) - j = vmx_find_msr_index(&m->host, msr); + j = vmx_find_loadstore_msr_slot(&m->host, msr); - if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) || - (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) { + if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || + (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; @@ -969,10 +1012,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->host.val[j].value = host_val; } -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +static bool update_transition_efer(struct vcpu_vmx *vmx) { u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; + int i; /* Shadow paging assumes NX to be available. 
*/ if (!enable_ept) @@ -1004,17 +1048,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) else clear_atomic_switch_msr(vmx, MSR_EFER); return false; - } else { - clear_atomic_switch_msr(vmx, MSR_EFER); + } + + i = __vmx_find_uret_msr(vmx, MSR_EFER); + if (i < 0) + return false; - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; + clear_atomic_switch_msr(vmx, MSR_EFER); - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; - return true; - } + vmx->guest_uret_msrs[i].data = guest_efer; + vmx->guest_uret_msrs[i].mask = ~ignore_bits; + + return true; } #ifdef CONFIG_X86_32 @@ -1052,6 +1100,12 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } +static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) +{ + /* The base must be 128-byte aligned and a legal physical address. */ + return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f); +} + static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; @@ -1156,12 +1210,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA. */ - if (!vmx->guest_msrs_ready) { - vmx->guest_msrs_ready = true; - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); + if (!vmx->guest_uret_msrs_loaded) { + vmx->guest_uret_msrs_loaded = true; + for (i = 0; i < vmx->nr_active_uret_msrs; ++i) + kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot, + vmx->guest_uret_msrs[i].data, + vmx->guest_uret_msrs[i].mask); } @@ -1245,7 +1299,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif load_fixmap_gdt(raw_smp_processor_id()); vmx->guest_state_loaded = false; - vmx->guest_msrs_ready = false; + vmx->guest_uret_msrs_loaded = false; } #ifdef CONFIG_X86_64 @@ -1268,62 +1322,6 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) } #endif -static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. - */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) - return; - - /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; - } - - /* The full case. */ - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - -after_clear_sn: - - /* - * Clear SN before reading the bitmap. 
The VT-d firmware - * writes the bitmap and reads SN atomically (5.2.3 in the - * spec), so it doesn't really have a memory barrier that - * pairs with this, but we cannot do that and we need one. - */ - smp_mb__after_atomic(); - - if (!pi_is_pir_empty(pi_desc)) - pi_set_on(pi_desc); -} - void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy) { @@ -1407,20 +1405,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx->host_debugctlmsr = get_debugctlmsr(); } -static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -1430,7 +1414,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static bool emulation_required(struct kvm_vcpu *vcpu) { - return emulate_invalid_guest_state && !guest_state_valid(vcpu); + return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -1456,7 +1440,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long old_rflags; - if (enable_unrestricted_guest) { + if (is_unrestricted_guest(vcpu)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); vmx->rflags = rflags; vmcs_writel(GUEST_RFLAGS, rflags); @@ -1576,6 +1560,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +{ + return true; +} + static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip, orig_rip; @@ -1614,33 +1603,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) } /* - * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns - * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value - * indicates whether exit to userspace is needed. - */ -int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, - struct x86_exception *e) -{ - if (r == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_emulated_page_fault(vcpu, e); - return 1; - } - - /* - * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED - * while handling a VMX instruction KVM could've handled the request - * correctly by exiting to userspace and performing I/O but there - * doesn't seem to be a real use-case behind such requests, just return - * KVM_EXIT_INTERNAL_ERROR for now. - */ - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - - return 0; -} - -/* * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. */ @@ -1723,16 +1685,19 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -/* - * Swap MSR entry in host/guest MSR entry array. 
- */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) { - struct shared_msr_entry tmp; + struct vmx_uret_msr tmp; + int from, to; - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; + from = __vmx_find_uret_msr(vmx, msr); + if (from < 0) + return; + to = vmx->nr_active_uret_msrs++; + + tmp = vmx->guest_uret_msrs[to]; + vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from]; + vmx->guest_uret_msrs[from] = tmp; } /* @@ -1742,38 +1707,26 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs, index; - - save_nmsrs = 0; + vmx->guest_uret_msrs_loaded = false; + vmx->nr_active_uret_msrs = 0; #ifdef CONFIG_X86_64 /* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set. */ if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { - index = __find_msr_index(vmx, MSR_STAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); + vmx_setup_uret_msr(vmx, MSR_STAR); + vmx_setup_uret_msr(vmx, MSR_LSTAR); + vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK); } #endif - index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - - vmx->save_nmsrs = save_nmsrs; - vmx->guest_msrs_ready = false; + if (update_transition_efer(vmx)) + vmx_setup_uret_msr(vmx, MSR_EFER); + + if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) + vmx_setup_uret_msr(vmx, MSR_TSC_AUX); + + vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); @@ -1843,7 +1796,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; u32 index; switch (msr_info->index) { @@ -1864,7 +1817,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; - goto find_shared_msr; + goto find_uret_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; @@ -1971,10 +1924,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; - goto find_shared_msr; + goto find_uret_msr; default: - find_shared_msr: - msr = find_msr_entry(vmx, msr_info->index); + find_uret_msr: + msr = vmx_find_uret_msr(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; @@ -2003,7 +1956,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; int ret = 0; u32 msr_index = msr_info->index; u64 
data = msr_info->data; @@ -2097,7 +2050,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; @@ -2107,7 +2060,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; - goto find_shared_msr; + goto find_uret_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2133,8 +2086,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, - MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) @@ -2184,7 +2136,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if (!pt_can_write_msr(vmx)) @@ -2209,7 +2161,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; - if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK) + if (!pt_output_base_valid(vcpu, data)) return 1; vmx->pt_desc.guest.output_base = data; break; @@ -2244,13 +2196,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; - goto find_shared_msr; + goto find_uret_msr; default: - find_shared_msr: - msr = find_msr_entry(vmx, msr_index); + find_uret_msr: + msr = vmx_find_uret_msr(vmx, msr_index); if (msr) - ret = vmx_set_guest_msr(vmx, msr, data); + ret = vmx_set_guest_uret_msr(vmx, msr, data); else ret = kvm_set_msr_common(vcpu, msr_info); } @@ -2282,7 +2234,8 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) + if (is_unrestricted_guest(vcpu) || + (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -2463,7 +2416,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -2877,13 +2830,14 @@ static void enter_rmode(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); + /* Nothing to do if hardware doesn't support EFER. 
*/ if (!msr) - return; + return 0; vcpu->arch.efer = efer; if (efer & EFER_LMA) { @@ -2895,6 +2849,7 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } setup_msrs(vmx); + return 0; } #ifdef CONFIG_X86_64 @@ -3048,7 +3003,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) unsigned long hw_cr0; hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; @@ -3069,7 +3024,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept && !enable_unrestricted_guest) + if (enable_ept && !is_unrestricted_guest(vcpu)) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -3149,7 +3104,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long hw_cr4; hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; @@ -3184,7 +3139,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); - if (!enable_unrestricted_guest) { + if (!is_unrestricted_guest(vcpu)) { if (enable_ept) { if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; @@ -3324,7 +3279,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) * tree. Newer qemu binaries with that qemu fix would not need this * kvm hack. */ - if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); @@ -3513,11 +3468,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) * not. 
* We assume that registers are always usable */ -static bool guest_state_valid(struct kvm_vcpu *vcpu) +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) { - if (enable_unrestricted_guest) - return true; - /* real mode guest state checks */ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) @@ -3703,11 +3655,52 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) +static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __clear_bit(msr, msr_bitmap + 0x000 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); +} + +static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __clear_bit(msr, msr_bitmap + 0x800 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); +} + +static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr) { int f = sizeof(unsigned long); + if (msr <= 0x1fff) + __set_bit(msr, msr_bitmap + 0x000 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); +} + +static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __set_bit(msr, msr_bitmap + 0x800 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); +} + +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + if (!cpu_has_vmx_msr_bitmap()) return; @@ -3715,36 +3708,44 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit evmcs_touch_msr_bitmap(); /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __clear_bit(msr, msr_bitmap + 0x000 / f); + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filters change. 
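The vmx_{clear,set}_msr_bitmap_{read,write} helpers introduced above hard-code the VMX MSR-bitmap layout: read bitmaps at offsets 0x000 (MSRs 0x0-0x1fff) and 0x400 (MSRs 0xc0000000-0xc0001fff), write bitmaps at 0x800 and 0xc00, one bit per MSR. A minimal standalone sketch of that address calculation (plain userspace C, not part of the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Locate the intercept bit for an MSR in the 4 KiB VMX MSR bitmap:
 * read-low 0x000, read-high 0x400, write-low 0x800, write-high 0xc00.
 */
static bool msr_bitmap_locate(uint32_t msr, bool write,
                              uint32_t *byte_off, uint32_t *bit)
{
        uint32_t base;

        if (msr <= 0x1fff)
                base = write ? 0x800 : 0x000;
        else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                base = write ? 0xc00 : 0x400;
        else
                return false;   /* not controllable via the bitmap */

        *byte_off = base + ((msr & 0x1fff) / 8);
        *bit = (msr & 0x1fff) % 8;
        return true;
}

int main(void)
{
        uint32_t off, bit;

        if (msr_bitmap_locate(0xc0000080 /* MSR_EFER */, true, &off, &bit))
                printf("EFER write intercept: byte 0x%x, bit %u\n", off, bit);
        return 0;
}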
+ */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + clear_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + clear_bit(idx, vmx->shadow_msr_intercept.write); + } + } - if (type & MSR_TYPE_W) - /* write-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); + if ((type & MSR_TYPE_R) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { + vmx_set_msr_bitmap_read(msr_bitmap, msr); + type &= ~MSR_TYPE_R; + } - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __clear_bit(msr, msr_bitmap + 0x400 / f); + if ((type & MSR_TYPE_W) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { + vmx_set_msr_bitmap_write(msr_bitmap, msr); + type &= ~MSR_TYPE_W; + } - if (type & MSR_TYPE_W) - /* write-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); + if (type & MSR_TYPE_R) + vmx_clear_msr_bitmap_read(msr_bitmap, msr); - } + if (type & MSR_TYPE_W) + vmx_clear_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { - int f = sizeof(unsigned long); + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; if (!cpu_has_vmx_msr_bitmap()) return; @@ -3753,39 +3754,34 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm evmcs_touch_msr_bitmap(); /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filter changes. 
+ */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + set_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + set_bit(idx, vmx->shadow_msr_intercept.write); + } + } - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); + if (type & MSR_TYPE_R) + vmx_set_msr_bitmap_read(msr_bitmap, msr); - } + if (type & MSR_TYPE_W) + vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type, bool value) +static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value) { if (value) - vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + vmx_enable_intercept_for_msr(vcpu, msr, type); else - vmx_disable_intercept_for_msr(msr_bitmap, msr, type); + vmx_disable_intercept_for_msr(vcpu, msr, type); } static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) @@ -3803,35 +3799,47 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, - u8 mode) +static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { + unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + unsigned long read_intercept; int msr; + read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + unsigned int read_idx = msr / BITS_PER_LONG; + unsigned int write_idx = read_idx + (0x800 / sizeof(long)); + + msr_bitmap[read_idx] = read_intercept; + msr_bitmap[write_idx] = ~0ul; } +} - if (mode & MSR_BITMAP_MODE_X2APIC) { - /* - * TPR reads and writes can be virtualized even if virtual interrupt - * delivery is not in use. - */ - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); - if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); - } +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +{ + if (!cpu_has_vmx_msr_bitmap()) + return; + + vmx_reset_x2apic_msrs(vcpu, mode); + + /* + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. 
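vmx_reset_x2apic_msrs() walks MSRs 0x800-0x8ff because that is the architectural x2APIC MSR range; each APIC register offset maps to an MSR via X2APIC_MSR(), which appears later in the vmx.h hunk. A small illustrative program (APIC_BASE_MSR and the register offsets are the usual apicdef.h values, assumed here rather than shown in this diff):

#include <stdio.h>

#define APIC_BASE_MSR   0x800
#define X2APIC_MSR(r)   (APIC_BASE_MSR + ((r) >> 4))

int main(void)
{
        /* APIC register offsets as used by the kernel's MMIO layout */
        printf("TPR   -> MSR 0x%x\n", X2APIC_MSR(0x80));   /* 0x808 */
        printf("EOI   -> MSR 0x%x\n", X2APIC_MSR(0xb0));   /* 0x80b */
        printf("TMCCT -> MSR 0x%x\n", X2APIC_MSR(0x390));  /* 0x839 */
        return 0;
}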
+ */ + vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, + !(mode & MSR_BITMAP_MODE_X2APIC)); + + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { + vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; u8 mode = vmx_msr_bitmap_mode(vcpu); u8 changed = mode ^ vmx->msr_bitmap_mode; @@ -3839,30 +3847,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) return; if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + vmx_update_msr_bitmap_x2apic(vcpu, mode); vmx->msr_bitmap_mode = mode; } -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + struct vcpu_vmx *vmx = to_vmx(vcpu); bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); u32 i; - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, - MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); for (i = 0; i < vmx->pt_desc.addr_range; i++) { - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } } @@ -3886,6 +3888,29 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) return ((rvi & 0xf0) > (vppr & 0xf0)); } +static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 i; + + /* + * Set intercept permissions for all potentially passed through MSRs + * again. They will automatically get filtered through the MSR filter, + * so we are back in sync after this. 
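The shadow_msr_intercept bitmaps record the intercept state KVM wants for each possible passthrough MSR, so vmx_msr_filter_changed() can simply replay that intent and let the filter be re-applied on each call. A much-simplified userspace model of the replay step (the MSR list, bit widths and set_intercept() stub are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define NR_PT_MSRS 3    /* the patch tracks up to 13 possible passthrough MSRs */

static const uint32_t pt_msrs[NR_PT_MSRS] = {
        0x10 /* IA32_TSC */, 0xc0000100 /* FS_BASE */, 0xc0000101 /* GS_BASE */
};

/* desired (pre-filter) intercept state, one bit per possible passthrough MSR */
static uint16_t shadow_read, shadow_write;

/* stand-in for vmx_set_intercept_for_msr(), which also re-checks the filter */
static void set_intercept(uint32_t msr, char rw, int intercept)
{
        printf("MSR 0x%x %c-intercept = %d\n", msr, rw, intercept);
}

/* replay the recorded intent so bitmap, shadow state and filter line up again */
static void msr_filter_changed(void)
{
        for (int i = 0; i < NR_PT_MSRS; i++) {
                set_intercept(pt_msrs[i], 'r', (shadow_read >> i) & 1);
                set_intercept(pt_msrs[i], 'w', (shadow_write >> i) & 1);
        }
}

int main(void)
{
        shadow_read = 0x1;      /* request read interception of pt_msrs[0] */
        msr_filter_changed();
        return 0;
}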
+ */ + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { + u32 msr = vmx_possible_passthrough_msrs[i]; + bool read = test_bit(i, vmx->shadow_msr_intercept.read); + bool write = test_bit(i, vmx->shadow_msr_intercept.write); + + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + } + + pt_update_intercept_for_msr(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -4043,13 +4068,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS; + struct kvm_vcpu *vcpu = &vmx->vcpu; + + vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & + ~vcpu->arch.cr4_guest_rsvd_bits; if (!enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) - vmx->vcpu.arch.cr4_guest_owned_bits &= - ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + vcpu->arch.cr4_guest_owned_bits &= + ~get_vmcs12(vcpu)->cr4_guest_host_mask; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) @@ -4114,6 +4142,61 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +/* + * Adjust a single secondary execution control bit to intercept/allow an + * instruction in the guest. This is usually done based on whether or not a + * feature has been exposed to the guest in order to correctly emulate faults. + */ +static inline void +vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, + u32 control, bool enabled, bool exiting) +{ + /* + * If the control is for an opt-in feature, clear the control if the + * feature is not exposed to the guest, i.e. not enabled. If the + * control is opt-out, i.e. an exiting control, clear the control if + * the feature _is_ exposed to the guest, i.e. exiting/interception is + * disabled for the associated instruction. Note, the caller is + * responsible presetting exec_control to set all supported bits. + */ + if (enabled == exiting) + *exec_control &= ~control; + + /* + * Update the nested MSR settings so that a nested VMM can/can't set + * controls for features that are/aren't exposed to the guest. + */ + if (nested) { + if (enabled) + vmx->nested.msrs.secondary_ctls_high |= control; + else + vmx->nested.msrs.secondary_ctls_high &= ~control; + } +} + +/* + * Wrapper macro for the common case of adjusting a secondary execution control + * based on a single guest CPUID bit, with a dedicated feature bit. This also + * verifies that the control is actually supported by KVM and hardware. + */ +#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ +({ \ + bool __enabled; \ + \ + if (cpu_has_vmx_##name()) { \ + __enabled = guest_cpuid_has(&(vmx)->vcpu, \ + X86_FEATURE_##feat_name); \ + vmx_adjust_secondary_exec_control(vmx, exec_control, \ + SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ + } \ +}) + +/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. 
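vmx_adjust_secondary_exec_control() boils both cases down to one test: clear the control whenever enabled == exiting. A tiny standalone demonstration of that rule (the control bit values below are made up for the example; the real SECONDARY_EXEC_* values differ):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Opt-in control (exiting == false): keep the bit only if the feature is
 * exposed to the guest. Opt-out/_EXITING control: keep the bit only if the
 * feature is *not* exposed. Both collapse to clearing on enabled == exiting.
 */
static void adjust_ctrl(uint32_t *exec_control, uint32_t control,
                        bool enabled, bool exiting)
{
        if (enabled == exiting)
                *exec_control &= ~control;
}

int main(void)
{
        uint32_t ctls = 0x3;    /* bit0: "ENABLE_RDTSCP", bit1: "RDRAND_EXITING" */

        adjust_ctrl(&ctls, 0x1, /*enabled=*/false, /*exiting=*/false); /* hide RDTSCP */
        adjust_ctrl(&ctls, 0x2, /*enabled=*/true,  /*exiting=*/true);  /* let RDRAND run */
        printf("controls = 0x%x\n", ctls);      /* 0x0 */
        return 0;
}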
*/ +#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) + +#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { @@ -4154,7 +4237,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - if (vmx_xsaves_supported()) { + if (cpu_has_vmx_xsaves()) { /* Exposing XSAVES only when XSAVE is exposed */ bool xsaves_enabled = boot_cpu_has(X86_FEATURE_XSAVE) && @@ -4163,101 +4246,29 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vcpu->arch.xsaves_enabled = xsaves_enabled; - if (!xsaves_enabled) - exec_control &= ~SECONDARY_EXEC_XSAVES; - - if (nested) { - if (xsaves_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_XSAVES; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_XSAVES; - } - } - - if (cpu_has_vmx_rdtscp()) { - bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); - if (!rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; - - if (nested) { - if (rdtscp_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDTSCP; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; - } - } - - if (cpu_has_vmx_invpcid()) { - /* Exposing INVPCID only when PCID is exposed */ - bool invpcid_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && - guest_cpuid_has(vcpu, X86_FEATURE_PCID); - - if (!invpcid_enabled) { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); - } - - if (nested) { - if (invpcid_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_INVPCID; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_INVPCID; - } - } - - if (vmx_rdrand_supported()) { - bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); - if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; - - if (nested) { - if (rdrand_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDRAND_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND_EXITING; - } + vmx_adjust_secondary_exec_control(vmx, &exec_control, + SECONDARY_EXEC_XSAVES, + xsaves_enabled, false); } - if (vmx_rdseed_supported()) { - bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); - if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; + vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); - if (nested) { - if (rdseed_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDSEED_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED_EXITING; - } - } + /* + * Expose INVPCID if and only if PCID is also exposed to the guest. + * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF + * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect + * behavior from the guest perspective (it would expect #GP or #PF). 
+ */ + if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) + guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); + vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); - if (vmx_waitpkg_supported()) { - bool waitpkg_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); - if (!waitpkg_enabled) - exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); - if (nested) { - if (waitpkg_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - } - } + vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, + ENABLE_USR_WAIT_PAUSE, false); vmx->secondary_exec_control = exec_control; } @@ -4350,7 +4361,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - if (vmx_xsaves_supported()) + if (cpu_has_vmx_xsaves()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { @@ -5154,7 +5165,8 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction(vcpu, 0); + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_skip_emulated_instruction(vcpu); } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -5337,7 +5349,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. */ - if (unlikely(allow_smaller_maxphyaddr && kvm_mmu_is_illegal_gpa(vcpu, gpa))) + if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); @@ -5448,25 +5460,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
- */ -static void wakeup_handler(void) -{ - struct kvm_vcpu *vcpu; - int cpu = smp_processor_id(); - - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); - } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); -} - static void vmx_enable_tdp(void) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -5530,16 +5523,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; unsigned long type; - bool pcid_enabled; gva_t gva; - struct x86_exception e; - unsigned i; - unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; } operand; - int r; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5562,68 +5550,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) sizeof(operand), &gva)) return 1; - r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); - if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); - - if (operand.pcid >> 12 != 0) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - - switch (type) { - case INVPCID_TYPE_INDIV_ADDR: - if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_SINGLE_CTXT: - if (!pcid_enabled && (operand.pcid != 0)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (kvm_get_active_pcid(vcpu) == operand.pcid) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) - == operand.pcid) - roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); - /* - * If neither the current cr3 nor any of the prev_roots use the - * given PCID, then nothing needs to be done here because a - * resync will happen anyway before switching to any other CR3. - */ - - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_ALL_NON_GLOBAL: - /* - * Currently, KVM doesn't mark global entries in the shadow - * page tables, so a non-global flush just degenerates to a - * global flush. If needed, we could optimize this later by - * keeping track of global entries in shadow page tables. 
- */ - - fallthrough; - case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_mmu_unload(vcpu); - return kvm_skip_emulated_instruction(vcpu); - - default: - BUG(); /* We have already checked above that type <= 3 */ - } + return kvm_handle_invpcid(vcpu, type, gva); } static int handle_pml_full(struct kvm_vcpu *vcpu) @@ -5752,10 +5679,24 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *intr_info, u32 *error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + *info1 = vmx_get_exit_qual(vcpu); - *info2 = vmx_get_intr_info(vcpu); + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + *info2 = vmx->idt_vectoring_info; + *intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(*intr_info)) + *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + else + *error_code = 0; + } else { + *info2 = 0; + *intr_info = 0; + *error_code = 0; + } } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@ -6389,14 +6330,6 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } -static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - return pi_test_on(pi_desc) || - (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); -} - static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -6416,70 +6349,43 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } +void vmx_do_interrupt_nmi_irqoff(unsigned long entry); + +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) +{ + unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; + gate_desc *desc = (gate_desc *)host_idt_base + vector; + + kvm_before_interrupt(vcpu); + vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); + kvm_after_interrupt(vcpu); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ - if (is_page_fault(intr_info)) { + if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* Handle machine checks before interrupts are enabled */ - } else if (is_machine_check(intr_info)) { + else if (is_machine_check(intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - } else if (is_nmi(intr_info)) { - kvm_before_interrupt(&vmx->vcpu); - asm("int $2"); - kvm_after_interrupt(&vmx->vcpu); - } + else if (is_nmi(intr_info)) + handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { - unsigned int vector; - unsigned long entry; -#ifdef CONFIG_X86_64 - unsigned long tmp; -#endif - gate_desc *desc; u32 intr_info = vmx_get_intr_info(vcpu); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - vector = intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)host_idt_base + vector; - entry = gate_offset(desc); - - kvm_before_interrupt(vcpu); - - asm volatile( -#ifdef CONFIG_X86_64 - "mov %%rsp, %[sp]\n\t" - "and $-16, %%rsp\n\t" - "push %[ss]\n\t" - "push %[sp]\n\t" -#endif - "pushf\n\t" - "push %[cs]\n\t" - CALL_NOSPEC - : -#ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), 
-#endif - ASM_CALL_CONSTRAINT - : - [thunk_target]"r"(entry), -#ifdef CONFIG_X86_64 - [ss]"i"(__KERNEL_DS), -#endif - [cs]"i"(__KERNEL_CS) - ); - - kvm_after_interrupt(vcpu); + handle_interrupt_nmi_irqoff(vcpu, intr_info); } -STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { @@ -6806,9 +6712,7 @@ reenter_guest: if (enable_preemption_timer) vmx_update_hv_timer(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -6952,20 +6856,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS); + BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { + u32 index = vmx_uret_msrs_list[i]; u32 data_low, data_high; - int j = vmx->nmsrs; + int j = vmx->nr_uret_msrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; + vmx->guest_uret_msrs[j].slot = i; + vmx->guest_uret_msrs[j].data = 0; switch (index) { case MSR_IA32_TSX_CTRL: /* @@ -6973,32 +6877,36 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) * let's avoid changing CPUID bits under the host * kernel's feet. */ - vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; break; default: - vmx->guest_msrs[j].mask = -1ull; + vmx->guest_uret_msrs[j].mask = -1ull; break; } - ++vmx->nmsrs; + ++vmx->nr_uret_msrs; } err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) goto free_pml; + /* The MSR bitmap starts with all ones */ + bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(vcpu->kvm)) { - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, 
MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } vmx->msr_bitmap_mode = 0; @@ -7022,8 +6930,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) } if (nested) - nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, - vmx_capability.ept); + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); @@ -7343,13 +7250,18 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { - struct shared_msr_entry *msr; - msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL); + struct vmx_uret_msr *msr; + msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); - vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); + vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } + + set_cr4_guest_host_mask(vmx); + + /* Refresh #PF interception to account for MAXPHYADDR changes. */ + update_exception_bitmap(vcpu); } static __init void vmx_set_cpu_caps(void) @@ -7373,14 +7285,14 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0xD.1 */ supported_xss = 0; - if (!vmx_xsaves_supported()) + if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); - if (vmx_waitpkg_supported()) + if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } @@ -7436,7 +7348,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, * Because it is marked as EmulateOnUD, we need to intercept it here. */ case x86_intercept_rdtscp: - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; @@ -7568,107 +7480,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - do { - old.control = new.control = pi_desc->control; - WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the VCPU is blocked\n"); - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - vcpu->pre_pcpu = -1; - } -} - -/* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. 
- * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * - */ -static int pi_pre_block(struct kvm_vcpu *vcpu) -{ - unsigned int dest; - struct pi_desc old, new; - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - } - - do { - old.control = new.control = pi_desc->control; - - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); - - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. - */ - dest = cpu_physical_id(vcpu->pre_pcpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'wakeup vector' */ - new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - /* We should not block the vCPU if an interrupt is posted for it. 
*/ - if (pi_test_on(pi_desc) == 1) - __pi_post_block(vcpu); - - local_irq_enable(); - return (vcpu->pre_pcpu == -1); -} - static int vmx_pre_block(struct kvm_vcpu *vcpu) { if (pi_pre_block(vcpu)) @@ -7680,17 +7491,6 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) return 0; } -static void pi_post_block(struct kvm_vcpu *vcpu) -{ - if (vcpu->pre_pcpu == -1) - return; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - __pi_post_block(vcpu); - local_irq_enable(); -} - static void vmx_post_block(struct kvm_vcpu *vcpu) { if (kvm_x86_ops.set_hv_timer) @@ -7699,100 +7499,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) pi_post_block(vcpu); } -/* - * vmx_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; - int idx, ret = 0; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) - return 0; - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - * - * In addition, we can only inject generic interrupts using - * the PI mechanism, refuse to route others through it. - */ - - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - - continue; - } - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); - - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7850,11 +7556,6 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) /* RSM will cause a vmexit anyway. 
*/ } -static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) -{ - return false; -} - static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon; @@ -7961,7 +7662,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, - .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, @@ -7995,7 +7696,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = vmx_update_pi_irte, + .update_pi_irte = pi_update_irte, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, @@ -8009,9 +7710,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, + + .msr_filter_changed = vmx_msr_filter_changed, }; static __init int hardware_setup(void) @@ -8023,8 +7726,8 @@ static __init int hardware_setup(void) store_idt(&dt); host_idt_base = dt.address; - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) + kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]); if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; @@ -8161,7 +7864,7 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); kvm_mce_cap_supported |= MCG_LMCE_P; @@ -8300,8 +8003,8 @@ static int __init vmx_init(void) for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + + pi_init(cpu); } #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a0e47720f60c..f6f66e5c6510 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -9,8 +9,9 @@ #include "capabilities.h" #include "kvm_cache_regs.h" -#include "ops.h" +#include "posted_intr.h" #include "vmcs.h" +#include "vmx_ops.h" #include "cpuid.h" extern const u32 vmx_msr_index[]; @@ -22,20 +23,20 @@ extern const u32 vmx_msr_index[]; #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) #ifdef CONFIG_X86_64 -#define NR_SHARED_MSRS 7 +#define MAX_NR_USER_RETURN_MSRS 7 #else -#define NR_SHARED_MSRS 4 +#define MAX_NR_USER_RETURN_MSRS 4 #endif -#define NR_LOADSTORE_MSRS 8 +#define MAX_NR_LOADSTORE_MSRS 8 struct vmx_msrs { unsigned int nr; - struct vmx_msr_entry val[NR_LOADSTORE_MSRS]; + struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS]; }; -struct shared_msr_entry { - unsigned index; +struct vmx_uret_msr { + unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. 
*/ u64 data; u64 mask; }; @@ -49,29 +50,6 @@ enum segment_cache_field { SEG_FIELD_NR = 4 }; -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - #define RTIT_ADDR_RANGE 4 struct pt_ctx { @@ -218,10 +196,10 @@ struct vcpu_vmx { u32 idt_vectoring_info; ulong rflags; - struct shared_msr_entry guest_msrs[NR_SHARED_MSRS]; - int nmsrs; - int save_nmsrs; - bool guest_msrs_ready; + struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; + int nr_uret_msrs; + int nr_active_uret_msrs; + bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; @@ -301,6 +279,13 @@ struct vcpu_vmx { u64 ept_pointer; struct pt_desc pt_desc; + + /* Save desired MSR intercept (read: pass-through) state */ +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 + struct { + DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + } shadow_msr_intercept; }; enum ept_pointers_status { @@ -334,7 +319,7 @@ unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); -void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); @@ -343,6 +328,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, int root_level); + void update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); @@ -350,75 +336,12 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); -int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); -int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, - struct x86_exception *e); +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int 
pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) -{ - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_on(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; @@ -499,11 +422,6 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -558,6 +476,19 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; } +static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) +{ + return enable_unrestricted_guest && (!is_guest_mode(vcpu) || + (secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_UNRESTRICTED_GUEST)); +} + +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu); +static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) +{ + return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu); +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 692b0c31c9c8..692b0c31c9c8 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce856e0ece84..397f599b20e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -71,6 +71,7 @@ #include <asm/irq_remapping.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> +#include <asm/tlbflush.h> #include <asm/intel_pt.h> #include <asm/emulate_prefix.h> #include <clocksource/hyperv_timer.h> @@ -161,24 +162,29 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); -#define KVM_NR_SHARED_MSRS 16 +/* + * Restoring the host value for MSRs that are only consumed when running in + * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU + * returns to userspace, i.e. the kernel can run with the guest's value. 
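The renamed user-return MSR machinery keeps the guest value loaded while the kernel runs and defers restoring the host value to a user-return notifier. A compact model of kvm_set_user_return_msr()/kvm_on_user_return() with the MSR write stubbed out (MSR_TSC_AUX is used purely as an example):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct urmsr {
        uint32_t msr;
        uint64_t host, curr;
};

static void wrmsr_stub(uint32_t msr, uint64_t val)
{
        printf("wrmsr(0x%x, 0x%llx)\n", msr, (unsigned long long)val);
}

/* write the guest value now; remember that a restore is needed later */
static bool set_guest_value(struct urmsr *m, uint64_t value, uint64_t mask)
{
        value = (value & mask) | (m->host & ~mask);
        if (value == m->curr)
                return false;           /* nothing to do, no notifier needed */
        wrmsr_stub(m->msr, value);
        m->curr = value;
        return true;                    /* caller registers the return notifier */
}

/* runs when the CPU is about to return to userspace */
static void on_user_return(struct urmsr *m)
{
        if (m->curr != m->host) {
                wrmsr_stub(m->msr, m->host);
                m->curr = m->host;
        }
}

int main(void)
{
        struct urmsr tsc_aux = { .msr = 0xc0000103, .host = 1, .curr = 1 };

        if (set_guest_value(&tsc_aux, 7, ~0ull))
                on_user_return(&tsc_aux);       /* deferred until exit to userspace */
        return 0;
}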
+ */ +#define KVM_MAX_NR_USER_RETURN_MSRS 16 -struct kvm_shared_msrs_global { +struct kvm_user_return_msrs_global { int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; }; -struct kvm_shared_msrs { +struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; - struct kvm_shared_msr_values { + struct kvm_user_return_msr_values { u64 host; u64 curr; - } values[KVM_NR_SHARED_MSRS]; + } values[KVM_MAX_NR_USER_RETURN_MSRS]; }; -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static struct kvm_shared_msrs __percpu *shared_msrs; +static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; +static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ @@ -266,7 +272,7 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, } else { vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", op, msr, data); - return 1; + return -ENOENT; } } @@ -293,9 +299,9 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; + struct kvm_user_return_msrs *msrs + = container_of(urn, struct kvm_user_return_msrs, urn); + struct kvm_user_return_msr_values *values; unsigned long flags; /* @@ -303,73 +309,73 @@ static void kvm_on_user_return(struct user_return_notifier *urn) * interrupted and executed through kvm_arch_hardware_disable() */ local_irq_save(flags); - if (locals->registered) { - locals->registered = false; + if (msrs->registered) { + msrs->registered = false; user_return_notifier_unregister(urn); } local_irq_restore(flags); - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; + for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { + values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); + wrmsrl(user_return_msrs_global.msrs[slot], values->host); values->curr = values->host; } } } -void kvm_define_shared_msr(unsigned slot, u32 msr) +void kvm_define_user_return_msr(unsigned slot, u32 msr) { - BUG_ON(slot >= KVM_NR_SHARED_MSRS); - shared_msrs_global.msrs[slot] = msr; - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; + BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); + user_return_msrs_global.msrs[slot] = msr; + if (slot >= user_return_msrs_global.nr) + user_return_msrs_global.nr = slot + 1; } -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); +EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); -static void kvm_shared_msr_cpu_online(void) +static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); u64 value; int i; - for (i = 0; i < shared_msrs_global.nr; ++i) { - rdmsrl_safe(shared_msrs_global.msrs[i], &value); - smsr->values[i].host = value; - smsr->values[i].curr = value; + for (i = 0; i < user_return_msrs_global.nr; ++i) { + rdmsrl_safe(user_return_msrs_global.msrs[i], &value); + msrs->values[i].host = value; + msrs->values[i].curr = value; } } -int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = 
smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); int err; - value = (value & mask) | (smsr->values[slot].host & ~mask); - if (value == smsr->values[slot].curr) + value = (value & mask) | (msrs->values[slot].host & ~mask); + if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); if (err) return 1; - smsr->values[slot].curr = value; - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; + msrs->values[slot].curr = value; + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); +EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); - if (smsr->registered) - kvm_on_user_return(&smsr->urn); + if (msrs->registered) + kvm_on_user_return(&msrs->urn); } u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) @@ -1452,6 +1458,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; u64 efer = msr_info->data; + int r; if (efer & efer_reserved_bits) return 1; @@ -1468,7 +1475,11 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops.set_efer(vcpu, efer); + r = kvm_x86_ops.set_efer(vcpu, efer); + if (r) { + WARN_ON(r > 0); + return r; + } /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) @@ -1483,6 +1494,40 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) +{ + struct kvm *kvm = vcpu->kvm; + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + u32 count = kvm->arch.msr_filter.count; + u32 i; + bool r = kvm->arch.msr_filter.default_allow; + int idx; + + /* MSR filtering not set up or x2APIC enabled, allow everything */ + if (!count || (index >= 0x800 && index <= 0x8ff)) + return true; + + /* Prevent collision with set_msr_filter */ + idx = srcu_read_lock(&kvm->srcu); + + for (i = 0; i < count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + r = !!test_bit(index - start, bitmap); + break; + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_msr_allowed); + /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. 
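kvm_msr_allowed() walks the userspace-supplied ranges and lets the per-range bitmap decide, falling back to the filter's default; x2APIC MSRs and the no-filter case are always allowed. A self-contained sketch with the same shape (the names and flag values here are local to the example):

#include <stdbool.h>
#include <stdint.h>

#define FILTER_READ     (1u << 0)
#define FILTER_WRITE    (1u << 1)

struct msr_range {
        uint32_t base, nmsrs, flags;
        const unsigned long *bitmap;    /* one bit per MSR in the range */
};

static bool msr_allowed(uint32_t index, uint32_t type, bool default_allow,
                        const struct msr_range *ranges, uint32_t count)
{
        if (!count || (index >= 0x800 && index <= 0x8ff))
                return true;

        for (uint32_t i = 0; i < count; i++) {
                const struct msr_range *r = &ranges[i];

                if (index >= r->base && index < r->base + r->nmsrs &&
                    (r->flags & type))
                        return (r->bitmap[(index - r->base) / (8 * sizeof(long))] >>
                                ((index - r->base) % (8 * sizeof(long)))) & 1;
        }
        return default_allow;
}

int main(void)
{
        static const unsigned long bits[1] = { 0x1 };   /* only base+0 allowed */
        struct msr_range r = { .base = 0x10, .nmsrs = 2,
                               .flags = FILTER_READ, .bitmap = bits };

        return !(msr_allowed(0x10, FILTER_READ, false, &r, 1) &&
                 !msr_allowed(0x11, FILTER_READ, false, &r, 1));
}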
@@ -1494,6 +1539,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return -EPERM; + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: @@ -1550,6 +1598,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return -EPERM; + msr.index = index; msr.host_initiated = host_initiated; @@ -1585,12 +1636,91 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); +static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +{ + if (vcpu->run->msr.error) { + kvm_inject_gp(vcpu, 0); + return 1; + } else if (is_read) { + kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); + kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); + } + + return kvm_skip_emulated_instruction(vcpu); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, true); +} + +static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, false); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case -ENOENT: + return KVM_MSR_EXIT_REASON_UNKNOWN; + case -EPERM: + return KVM_MSR_EXIT_REASON_FILTER; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + +static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r); +} + +static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_wrmsr, r); +} + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; + int r; + + r = kvm_get_msr(vcpu, ecx, &data); - if (kvm_get_msr(vcpu, ecx, &data)) { + /* MSR read failed? See if we should ask user space */ + if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR read failed? Inject a #GP */ + if (r) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; @@ -1608,8 +1738,21 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); + int r; - if (kvm_set_msr(vcpu, ecx, data)) { + r = kvm_set_msr(vcpu, ecx, data); + + /* MSR write failed? See if we should ask user space */ + if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) + /* Bounce to user space */ + return 0; + + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; + + /* MSR write failed? 
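When an MSR access is bounced to userspace, the VMM sees KVM_EXIT_X86_RDMSR or KVM_EXIT_X86_WRMSR, fills in run->msr.data and run->msr.error, and re-enters the vCPU. A fragment of what that vcpu-loop handling might look like; the read-as-zero and deny-all-writes policy is purely illustrative:

#include <linux/kvm.h>

/* called from the VMM's vcpu loop after ioctl(vcpu_fd, KVM_RUN, 0) returns */
static void handle_msr_exit(struct kvm_run *run)
{
        switch (run->exit_reason) {
        case KVM_EXIT_X86_RDMSR:
                /* example policy: read as zero instead of faulting the guest */
                run->msr.data = 0;
                run->msr.error = 0;
                break;
        case KVM_EXIT_X86_WRMSR:
                /* a non-zero error makes KVM inject #GP on the next KVM_RUN */
                run->msr.error = 1;
                break;
        }
}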
Inject a #GP */ + if (r > 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -1775,12 +1918,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; @@ -1788,6 +1925,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) struct pvclock_wall_clock wc; u64 wall_nsec; + kvm->arch.wall_clock = wall_clock; + if (!wall_clock) return; @@ -1820,6 +1959,34 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, + bool old_msr, bool host_initiated) +{ + struct kvm_arch *ka = &vcpu->kvm->arch; + + if (vcpu->vcpu_id == 0 && !host_initiated) { + if (ka->boot_vcpu_runs_old_kvmclock && old_msr) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + ka->boot_vcpu_runs_old_kvmclock = old_msr; + } + + vcpu->arch.time = system_time; + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + + /* we verify if the enable bit is set... */ + vcpu->arch.pv_time_enabled = false; + if (!(system_time & 1)) + return; + + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) + vcpu->arch.pv_time_enabled = true; + + return; +} + static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { do_shl32_div32(dividend, divisor); @@ -1979,12 +2146,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) #endif } -static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) -{ - u64 curr_offset = vcpu->arch.l1_tsc_offset; - vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; -} - /* * Multiply tsc by a fixed point number represented by ratio. * @@ -2046,14 +2207,13 @@ static inline bool kvm_check_tsc_unstable(void) return check_tsc_unstable(); } -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; bool matched; bool already_matched; - u64 data = msr->data; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -2062,7 +2222,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - if (data == 0 && msr->host_initiated) { + if (data == 0) { /* * detection of vcpu initialization -- need to sync * with other vCPUs. 
This particularly helps to keep @@ -2132,9 +2292,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) - update_ia32_tsc_adjust_msr(vcpu, offset); - kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -2149,8 +2306,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } -EXPORT_SYMBOL_GPL(kvm_write_tsc); - static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { @@ -2696,24 +2851,19 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; u8 *page; - int r; - r = -E2BIG; if (page_num >= blob_size) - goto out; - r = -ENOMEM; + return 1; + page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; + if (IS_ERR(page)) + return PTR_ERR(page); + + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { + kfree(page); + return 1; } - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; + return 0; } static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) @@ -2731,6 +2881,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) + return 1; + if (!lapic_in_kernel(vcpu)) return data ? 1 : 0; @@ -2808,10 +2966,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. 
*/ - trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb_guest(vcpu); + if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb_guest(vcpu); + } vcpu->arch.st.preempted = 0; @@ -2945,7 +3105,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_ia32_power_ctl = data; break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); + if (msr_info->host_initiated) { + kvm_synchronize_tsc(vcpu, data); + } else { + u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + adjust_tsc_offset_guest(vcpu, adj); + vcpu->arch.ia32_tsc_adjust_msr += adj; + } break; case MSR_IA32_XSS: if (!msr_info->host_initiated && @@ -2966,53 +3132,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + + kvm_write_wall_clock(vcpu->kvm, data); + break; case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - struct kvm_arch *ka = &vcpu->kvm->arch; - - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { - bool tmp = (msr == MSR_KVM_SYSTEM_TIME); - - if (ka->boot_vcpu_runs_old_kvmclock != tmp) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - ka->boot_vcpu_runs_old_kvmclock = tmp; - } - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... 
*/ - vcpu->arch.pv_time_enabled = false; - if (!(data & 1)) - break; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; - if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = true; + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); + break; + case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; - } case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; kvm_check_async_pf_completion(vcpu); } break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; if (unlikely(!sched_info_on())) return 1; @@ -3029,11 +3196,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; @@ -3229,7 +3402,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * state this but appears to behave the same. * * On userspace reads and writes, however, we unconditionally - * operate L1's TSC value to ensure backwards-compatible + * return L1's TSC value to ensure backwards-compatible * behavior for migration. */ u64 tsc_offset = msr_info->host_initiated ? 
vcpu->arch.l1_tsc_offset : @@ -3527,6 +3700,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: + case KVM_CAP_X86_USER_SPACE_MSR: + case KVM_CAP_X86_MSR_FILTER: + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4397,6 +4573,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_x86_ops.enable_direct_tlbflush(vcpu); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: + vcpu->arch.pv_cpuid.enforce = cap->args[0]; + + return 0; + default: return -EINVAL; } @@ -5047,6 +5228,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_USER_SPACE_MSR: + kvm->arch.user_space_msr_mask = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -5054,6 +5239,110 @@ split_irqchip_unlock: return r; } +static void kvm_clear_msr_filter(struct kvm *kvm) +{ + u32 i; + u32 count = kvm->arch.msr_filter.count; + struct msr_bitmap_range ranges[16]; + + mutex_lock(&kvm->lock); + kvm->arch.msr_filter.count = 0; + memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); + mutex_unlock(&kvm->lock); + synchronize_srcu(&kvm->srcu); + + for (i = 0; i < count; i++) + kfree(ranges[i].bitmap); +} + +static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +{ + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + struct msr_bitmap_range range; + unsigned long *bitmap = NULL; + size_t bitmap_size; + int r; + + if (!user_range->nmsrs) + return 0; + + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + range = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { + r = -EINVAL; + goto err; + } + + if (!range.flags) { + r = -EINVAL; + goto err; + } + + /* Everything ok, add this range identifier to our global pool */ + ranges[kvm->arch.msr_filter.count] = range; + /* Make sure we filled the array before we tell anyone to walk it */ + smp_wmb(); + kvm->arch.msr_filter.count++; + + return 0; +err: + kfree(bitmap); + return r; +} + +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + bool default_allow; + int r = 0; + bool empty = true; + u32 i; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) + empty &= !filter.ranges[i].nmsrs; + + default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + if (empty && !default_allow) + return -EINVAL; + + kvm_clear_msr_filter(kvm); + + kvm->arch.msr_filter.default_allow = default_allow; + + /* + * Protect from concurrent calls to this function that could trigger + * a TOCTOU violation on kvm->arch.msr_filter.count. 
+ */ + mutex_lock(&kvm->lock); + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + r = kvm_add_msr_filter(kvm, &filter.ranges[i]); + if (r) + break; + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + mutex_unlock(&kvm->lock); + + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5360,6 +5649,9 @@ set_pit2_out: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; + case KVM_X86_SET_MSR_FILTER: + r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -5721,6 +6013,9 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0))) + return 1; + if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && @@ -6376,13 +6671,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_get_msr(vcpu, msr_index, pdata); + + if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_set_msr(vcpu, msr_index, data); + + if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) @@ -6926,7 +7241,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + bool write_fault_to_spt; + + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len))) + return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -6934,6 +7252,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. 
*/ + write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); @@ -7528,9 +7847,9 @@ int kvm_arch_init(void *opaque) goto out_free_x86_fpu_cache; } - shared_msrs = alloc_percpu(struct kvm_shared_msrs); - if (!shared_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); + user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); + if (!user_return_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); goto out_free_x86_emulator_cache; } @@ -7563,7 +7882,7 @@ int kvm_arch_init(void *opaque) return 0; out_free_percpu: - free_percpu(shared_msrs); + free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); out_free_x86_fpu_cache: @@ -7590,7 +7909,7 @@ void kvm_arch_exit(void) #endif kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); - free_percpu(shared_msrs); + free_percpu(user_return_msrs); kmem_cache_destroy(x86_fpu_cache); } @@ -7731,11 +8050,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) goto out; } + ret = -KVM_ENOSYS; + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; case KVM_HC_KICK_CPU: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); kvm_sched_yield(vcpu->kvm, a1); ret = 0; @@ -7746,9 +8070,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; #endif case KVM_HC_SEND_IPI: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; case KVM_HC_SCHED_YIELD: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + kvm_sched_yield(vcpu->kvm, a0); ret = 0; break; @@ -8379,8 +8709,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) { + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; goto out; } @@ -8487,6 +8817,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); + if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) + kvm_x86_ops.msr_filter_changed(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -8562,7 +8894,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops.request_immediate_exit(vcpu); } - trace_kvm_entry(vcpu->vcpu_id); + trace_kvm_entry(vcpu); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -9576,7 +9908,6 @@ fail_mmu_destroy: void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct msr_data msr; struct kvm *kvm = vcpu->kvm; kvm_hv_vcpu_postcreate(vcpu); @@ -9584,10 +9915,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - msr.data = 0x0; - msr.index = MSR_IA32_TSC; - msr.host_initiated = true; - kvm_write_tsc(vcpu, &msr); + kvm_synchronize_tsc(vcpu, 0); vcpu_put(vcpu); /* poll control enabled by default */ @@ -9624,6 +9952,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + kvfree(vcpu->arch.cpuid_entries); if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); 
} @@ -9721,7 +10050,7 @@ int kvm_arch_hardware_enable(void) u64 max_tsc = 0; bool stable, backwards_tsc = false; - kvm_shared_msr_cpu_online(); + kvm_user_return_msr_cpu_online(); ret = kvm_x86_ops.hardware_enable(); if (ret != 0) return ret; @@ -10039,6 +10368,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u32 i; + if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10055,6 +10386,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops.vm_destroy) kvm_x86_ops.vm_destroy(kvm); + for (i = 0; i < kvm->arch.msr_filter.count; i++) + kfree(kvm->arch.msr_filter.ranges[i].bitmap); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); @@ -10785,6 +11118,111 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c } EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); +/* + * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns + * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value + * indicates whether exit to userspace is needed. + */ +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e) +{ + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, e); + return 1; + } + + /* + * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED + * while handling a VMX instruction KVM could've handled the request + * correctly by exiting to userspace and performing I/O but there + * doesn't seem to be a real use-case behind such requests, just return + * KVM_EXIT_INTERNAL_ERROR for now. + */ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); + +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) +{ + bool pcid_enabled; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + int r; + + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. 
+ */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + fallthrough; + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} +EXPORT_SYMBOL_GPL(kvm_handle_invpcid); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 995ab696dcf0..3900ab0c6004 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); } -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -372,6 +371,10 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e); +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); #define KVM_MSR_RET_INVALID 2 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 42606a04ae85..82bf37a5c9ec 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1446,11 +1446,14 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) prefetchw(¤t->mm->mmap_lock); /* - * KVM has two types of events that are, logically, interrupts, but - * are unfortunately delivered using the #PF vector. These events are - * "you just accessed valid memory, but the host doesn't have it right - * now, so I'll put you to sleep if you continue" and "that memory - * you tried to access earlier is available now." + * KVM uses #PF vector to deliver 'page not present' events to guests + * (asynchronous page fault mechanism). The event happens when a + * userspace task is trying to access some valid (from guest's point of + * view) memory which is not currently mapped by the host (e.g. the + * memory is swapped out). Note, the corresponding "page ready" event + * which is injected when the memory becomes available, is delived via + * an interrupt mechanism and not a #PF exception + * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the |
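The MSR filtering and user-space MSR plumbing added above (kvm_vm_ioctl_set_msr_filter(), kvm_msr_allowed(), KVM_CAP_X86_USER_SPACE_MSR) is driven from the VMM. Below is a minimal, illustrative VMM-side sketch, not taken from the patch: it assumes the uapi names this series adds to <linux/kvm.h> (KVM_X86_SET_MSR_FILTER, KVM_MSR_FILTER_*, KVM_MSR_EXIT_REASON_*), and the vm_fd, the deny-by-default policy and the MSR range starting at 0x10 are arbitrary choices for the example.

/*
 * Illustrative sketch: deny guest MSR accesses by default, allow reads and
 * writes to one small (hypothetical) range, and ask KVM to bounce unknown or
 * filtered MSR accesses to user space instead of injecting #GP.
 * vm_fd is an already-created KVM VM file descriptor.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int configure_msr_handling(int vm_fd)
{
	/*
	 * One bit per MSR in [base, base + nmsrs); 1 = allowed, 0 = denied.
	 * KVM copies the bitmap in long-sized chunks, so size it in multiples
	 * of 8 bytes.
	 */
	static __u8 allow_bitmap[8] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	};

	struct kvm_msr_filter filter = {
		.flags = KVM_MSR_FILTER_DEFAULT_DENY,
		.ranges[0] = {
			.flags  = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
			.base   = 0x10,		/* hypothetical MSR range */
			.nmsrs  = 64,
			.bitmap = allow_bitmap,
		},
	};

	/* Exit to user space for MSRs that are unknown or denied by the filter. */
	struct kvm_enable_cap cap = {
		.cap     = KVM_CAP_X86_USER_SPACE_MSR,
		.args[0] = KVM_MSR_EXIT_REASON_UNKNOWN | KVM_MSR_EXIT_REASON_FILTER,
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		return -1;
	if (ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter) < 0)
		return -1;
	return 0;
}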
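When kvm_msr_user_space() above decides to bounce an access, the VMM sees a KVM_EXIT_X86_RDMSR or KVM_EXIT_X86_WRMSR exit and completes it through the kvm_run msr fields shown in the patch (reason, index, data, error). A rough sketch of the matching run-loop handling follows; vmm_rdmsr()/vmm_wrmsr() are hypothetical user-space emulation helpers, not part of any KVM API.

/*
 * Illustrative sketch of handling the new RDMSR/WRMSR exits in a VMM run
 * loop.  'run' is the mmap'ed struct kvm_run of the vCPU.
 */
#include <linux/kvm.h>

/* Hypothetical user-space MSR emulation helpers; return 0 on success. */
static int vmm_rdmsr(__u32 index, __u64 *data) { (void)index; *data = 0; return 0; }
static int vmm_wrmsr(__u32 index, __u64 data)  { (void)index; (void)data; return 0; }

static void handle_msr_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_X86_RDMSR:
		/* run->msr.reason says why KVM exited (UNKNOWN/INVAL/FILTER). */
		run->msr.error = vmm_rdmsr(run->msr.index, &run->msr.data) ? 1 : 0;
		break;
	case KVM_EXIT_X86_WRMSR:
		run->msr.error = vmm_wrmsr(run->msr.index, run->msr.data) ? 1 : 0;
		break;
	}
	/*
	 * On the next KVM_RUN, complete_emulated_rdmsr()/wrmsr() pick this up:
	 * a non-zero msr.error injects #GP into the guest; otherwise the read
	 * result is returned in RDX:RAX and the instruction is skipped.
	 */
}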
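The guest_pv_has() checks sprinkled through kvm_set_msr_common() and kvm_emulate_hypercall() only bite when the new per-vCPU capability is enabled. A one-ioctl sketch of turning it on from the VMM, again illustrative rather than quoted from the patch (vcpu_fd is assumed to be an existing vCPU file descriptor):

/*
 * Illustrative sketch: with KVM_CAP_ENFORCE_PV_FEATURE_CPUID enabled, KVM
 * rejects PV MSR writes and hypercalls whose KVM_FEATURE_* bit is absent
 * from the CPUID the VMM configured for this vCPU.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enforce_pv_cpuid(int vcpu_fd)
{
	struct kvm_enable_cap cap = {
		.cap     = KVM_CAP_ENFORCE_PV_FEATURE_CPUID,
		.args[0] = 1,	/* enable enforcement */
	};

	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}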