author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-24 11:58:57 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-24 11:58:57 -0700 |
commit | 1ebdbeb03efe89f01f15df038a589077df3d21f5 (patch) | |
tree | 06b6b7bb565668d136c060c5104481e48cbf71e2 /arch/x86/kvm | |
parent | efee6c79298fd823c569d501d041de85caa102a6 (diff) | |
parent | c9b8fecddb5bb4b67e351bbaeaa648a6f7456912 (diff) | |
download | linux-1ebdbeb03efe89f01f15df038a589077df3d21f5.tar.bz2 |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"ARM:
- Proper emulation of the OSLock feature of the debug architecture
- Scalability improvements for the MMU lock when dirty logging is on
- New VMID allocator, which will eventually help with SVA in VMs
- Better support for PMUs in heterogeneous systems
- PSCI 1.1 support, enabling support for SYSTEM_RESET2
- Implement CONFIG_DEBUG_LIST at EL2
- Make CONFIG_ARM64_ERRATUM_2077057 default y
- Reduce the overhead of VM exit when no interrupt is pending
- Remove traces of 32bit ARM host support from the documentation
- Updated vgic selftests
- Various cleanups, doc updates and spelling fixes
RISC-V:
- Prevent KVM_COMPAT from being selected
- Optimize __kvm_riscv_switch_to() implementation
- RISC-V SBI v0.3 support
s390:
- memop selftest
- fix SCK locking
- adapter interruptions virtualization for secure guests
- add Claudio Imbrenda as maintainer
- first step to do proper storage key checking
x86:
- Continue switching kvm_x86_ops to static_call(); introduce
static_call_cond() and __static_call_ret0 when applicable (see the
sketch after the commit list below).
- Cleanup unused arguments in several functions
- Synthesize AMD 0x80000021 leaf
- Fixes and optimizations for Hyper-V sparse-bank hypercalls (see the
decoding sketch after the diffstat below)
- Implement Hyper-V's enlightened MSR bitmap for nested SVM
- Remove MMU auditing
- Eager splitting of page tables (new aka "TDP" MMU only) when dirty
page tracking is enabled
- Cleanup the implementation of the guest PGD cache
- Preparation for the implementation of Intel IPI virtualization
- Fix some segment descriptor checks in the emulator
- Allow AMD AVIC support on systems with physical APIC ID above 255
- Better API to disable virtualization quirks
- Fixes and optimizations for the zapping of page tables:
- Zap roots in two passes, avoiding RCU read-side critical
sections that last too long for very large guests backed by 4
KiB SPTEs.
- Zap invalid and defunct roots asynchronously via
concurrency-managed work queue.
- Allow yielding when zapping TDP MMU roots in response to the
root's last reference being put.
- Batch more TLB flushes with an RCU trick. Whoever frees the
paging structure now holds RCU as a proxy for all vCPUs running
in the guest, i.e. it prolongs the grace period on their behalf.
It then kicks the vCPUs out of guest mode before doing
rcu_read_unlock().
Generic:
- Introduce __vcalloc and use it for very large allocations that need
memcg accounting"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (246 commits)
KVM: use kvcalloc for array allocations
KVM: x86: Introduce KVM_CAP_DISABLE_QUIRKS2
kvm: x86: Require const tsc for RT
KVM: x86: synthesize CPUID leaf 0x80000021h if useful
KVM: x86: add support for CPUID leaf 0x80000021
KVM: x86: do not use KVM_X86_OP_OPTIONAL_RET0 for get_mt_mask
Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
kvm: x86/mmu: Flush TLB before zap_gfn_range releases RCU
KVM: arm64: fix typos in comments
KVM: arm64: Generalise VM features into a set of flags
KVM: s390: selftests: Add error memop tests
KVM: s390: selftests: Add more copy memop tests
KVM: s390: selftests: Add named stages for memop test
KVM: s390: selftests: Add macro as abstraction for MEM_OP
KVM: s390: selftests: Split memop tests
KVM: s390x: fix SCK locking
RISC-V: KVM: Implement SBI HSM suspend call
RISC-V: KVM: Add common kvm_riscv_vcpu_wfi() function
RISC-V: Add SBI HSM suspend related defines
RISC-V: KVM: Implement SBI v0.3 SRST extension
...
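
Several lapic.c hunks in the diff below convert static_call(kvm_x86_hwapic_isr_update)(...) and similar sites to static_call_cond(), which turns a call through a NULL kvm_x86_ops hook into a no-op rather than requiring a NULL check at every caller. Here is a rough userspace model of the semantics only: the names are invented for this sketch, and the real primitive patches the call instruction itself instead of testing a pointer at runtime.

```c
#include <stdio.h>

struct x86_ops {
	void (*hwapic_isr_update)(int vec);	/* optional vendor hook */
};

static struct x86_ops x86_ops;	/* hook left NULL, e.g. not supported */

/* Hypothetical call_cond(): skip the call when the hook is NULL. */
#define call_cond(hook, ...)				\
	do {						\
		if (x86_ops.hook)			\
			x86_ops.hook(__VA_ARGS__);	\
	} while (0)

int main(void)
{
	call_cond(hwapic_isr_update, -1);	/* NULL hook: silently skipped */
	printf("optional hook handled without a per-site NULL check\n");
	return 0;
}
```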
Diffstat (limited to 'arch/x86/kvm')
42 files changed, 2234 insertions, 1676 deletions
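
One recurring piece of the hyperv.c rework below is the new hc.var_cnt field, pulled out of the hypercall input value via HV_HYPERCALL_VARHEAD_MASK/HV_HYPERCALL_VARHEAD_OFFSET so that variable-sized headers (the sparse VP-set banks) can be validated per call code. A standalone sketch of the decoding, assuming the TLFS bit layout (call code in bits 0:15, fast bit 16, variable header size in bits 17:26, rep count in bits 32:43, rep start in bits 48:59); the example code value 0x15 (HVCALL_SEND_IPI_EX) is illustrative.

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Fast call, two variable-header qwords, call code 0x15. */
	uint64_t param = (1ULL << 16) | (2ULL << 17) | 0x15;

	uint16_t code    = param & 0xffff;		/* bits 0:15  */
	int      fast    = !!(param & (1ULL << 16));	/* bit  16    */
	uint16_t var_cnt = (param >> 17) & 0x3ff;	/* bits 17:26 */
	uint16_t rep_cnt = (param >> 32) & 0xfff;	/* bits 32:43 */
	uint16_t rep_idx = (param >> 48) & 0xfff;	/* bits 48:59 */

	printf("code=%#" PRIx16 " fast=%d var_cnt=%" PRIu16
	       " rep_cnt=%" PRIu16 " rep_idx=%" PRIu16 "\n",
	       code, fast, var_cnt, rep_cnt, rep_idx);
	return 0;
}
```

Note the diff also adds an HV_HYPERCALL_RSVD_MASK check, so guests that set reserved input bits now get HV_STATUS_INVALID_HYPERCALL_INPUT.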
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2b1548da00eb..e3cbd7706136 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -126,13 +126,6 @@ config KVM_XEN If in doubt, say "N". -config KVM_MMU_AUDIT - bool "Audit KVM MMU" - depends on KVM && TRACEPOINTS - help - This option adds a R/W kVM module parameter 'mmu_audit', which allows - auditing of KVM MMU events at runtime. - config KVM_EXTERNAL_WRITE_TRACKING bool diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8bf541b24b4b..a00cd97b2623 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -715,9 +715,30 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, entry = &array->entries[array->nent++]; + memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; - entry->flags = 0; + switch (function & 0xC0000000) { + case 0x40000000: + /* Hypervisor leaves are always synthesized by __do_cpuid_func. */ + return entry; + + case 0x80000000: + /* + * 0x80000021 is sometimes synthesized by __do_cpuid_func, which + * would result in out-of-bounds calls to do_host_cpuid. + */ + { + static int max_cpuid_80000000; + if (!READ_ONCE(max_cpuid_80000000)) + WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000)); + if (function > READ_ONCE(max_cpuid_80000000)) + return entry; + } + + default: + break; + } cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); @@ -1061,7 +1082,15 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x8000001f); + entry->eax = min(entry->eax, 0x80000021); + /* + * Serializing LFENCE is reported in a multitude of ways, + * and NullSegClearsBase is not reported in CPUID on Zen2; + * help userspace by providing the CPUID leaf ourselves. 
+ */ + if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC) + || !static_cpu_has_bug(X86_BUG_NULL_SEG)) + entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: cpuid_entry_override(entry, CPUID_8000_0001_EDX); @@ -1132,6 +1161,27 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx &= ~GENMASK(11, 6); } break; + case 0x80000020: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + case 0x80000021: + entry->ebx = entry->ecx = entry->edx = 0; + /* + * Pass down these bits: + * EAX 0 NNDBP, Processor ignores nested data breakpoints + * EAX 2 LAS, LFENCE always serializing + * EAX 6 NSCB, Null selector clear base + * + * Other defined bits are for MSRs that KVM does not expose: + * EAX 3 SPCL, SMM page configuration lock + * EAX 13 PCMSR, Prefetch control MSR + */ + entry->eax &= BIT(0) | BIT(2) | BIT(6); + if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)) + entry->eax |= BIT(2); + if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) + entry->eax |= BIT(6); + break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ @@ -1241,8 +1291,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; - array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2), - cpuid->nent)); + array.entries = kvcalloc(sizeof(struct kvm_cpuid_entry2), cpuid->nent, GFP_KERNEL); if (!array.entries) return -ENOMEM; @@ -1260,7 +1309,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, r = -EFAULT; out_free: - vfree(array.entries); + kvfree(array.entries); return r; } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 50ca1db7247c..7ba4ec77feeb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1623,11 +1623,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; } - if (!seg_desc.p) { - err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; - goto exception; - } - dpl = seg_desc.dpl; switch (seg) { @@ -1643,14 +1638,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (!(seg_desc.type & 8)) goto exception; - if (seg_desc.type & 4) { - /* conforming */ - if (dpl > cpl) - goto exception; - } else { - /* nonconforming */ - if (rpl > cpl || dpl != cpl) + if (transfer == X86_TRANSFER_RET) { + /* RET can never return to an inner privilege level. 
*/ + if (rpl < cpl) goto exception; + /* Outer-privilege level return is not implemented */ + if (rpl > cpl) + return X86EMUL_UNHANDLEABLE; + } + if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) { + if (seg_desc.type & 4) { + /* conforming */ + if (dpl > rpl) + goto exception; + } else { + /* nonconforming */ + if (dpl != rpl) + goto exception; + } + } else { /* X86_TRANSFER_CALL_JMP */ + if (seg_desc.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } } /* in long-mode d/b must be clear if l is set */ if (seg_desc.d && seg_desc.l) { @@ -1667,6 +1682,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; + if (!seg_desc.p) { + err_vec = NP_VECTOR; + goto exception; + } old_desc = seg_desc; seg_desc.type |= 2; /* busy */ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, @@ -1691,6 +1710,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, break; } + if (!seg_desc.p) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + if (seg_desc.s) { /* mark segment as accessed */ if (!(seg_desc.type & 1)) { @@ -2216,9 +2240,6 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - /* Outer-privilege level return is not implemented */ - if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) - return X86EMUL_UNHANDLEABLE; rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, X86_TRANSFER_RET, &new_desc); @@ -2615,8 +2636,7 @@ emulate_shutdown: } static void -setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct desc_struct *cs, struct desc_struct *ss) +setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss) { cs->l = 0; /* will be adjusted later */ set_desc_base(cs, 0); /* flat segment */ @@ -2705,7 +2725,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) if (!(efer & EFER_SCE)) return emulate_ud(ctxt); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); @@ -2773,7 +2793,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; @@ -2810,7 +2830,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ctxt->mode == X86EMUL_MODE_VM86) return emulate_gp(ctxt, 0); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); if ((ctxt->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -3028,8 +3048,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static int task_switch_16(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, u16 old_tss_sel, +static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { struct tss_segment_16 tss_seg; @@ -3167,8 +3186,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return ret; } -static int task_switch_32(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, u16 old_tss_sel, +static int task_switch_32(struct 
x86_emulate_ctxt *ctxt, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { struct tss_segment_32 tss_seg; @@ -3276,10 +3294,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, old_tss_sel = 0xffff; if (next_tss_desc.type & 8) - ret = task_switch_32(ctxt, tss_selector, old_tss_sel, - old_tss_base, &next_tss_desc); + ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc); else - ret = task_switch_16(ctxt, tss_selector, old_tss_sel, + ret = task_switch_16(ctxt, old_tss_sel, old_tss_base, &next_tss_desc); if (ret != X86EMUL_CONTINUE) return ret; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6e38a7d22e97..a32f54ab84a2 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -112,6 +112,9 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, if (!!auto_eoi_old == !!auto_eoi_new) return; + if (!enable_apicv) + return; + down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) @@ -1710,32 +1713,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) return kvm_hv_get_msr(vcpu, msr, pdata, host); } -static __always_inline unsigned long *sparse_set_to_vcpu_mask( - struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, - u64 *vp_bitmap, unsigned long *vcpu_bitmap) +static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, + u64 valid_bank_mask, unsigned long *vcpu_mask) { struct kvm_hv *hv = to_kvm_hv(kvm); + bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes); + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *vcpu; int bank, sbank = 0; unsigned long i; + u64 *bitmap; + + BUILD_BUG_ON(sizeof(vp_bitmap) > + sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS)); + + /* + * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else + * fill a temporary buffer and manually test each vCPU's VP index. + */ + if (likely(!has_mismatch)) + bitmap = (u64 *)vcpu_mask; + else + bitmap = vp_bitmap; - memset(vp_bitmap, 0, - KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); + /* + * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask + * having a '1' for each bank that exists in sparse_banks. Sets must + * be in ascending order, i.e. bank0..bankN. 
+ */ + memset(bitmap, 0, sizeof(vp_bitmap)); for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, KVM_HV_MAX_SPARSE_VCPU_SET_BITS) - vp_bitmap[bank] = sparse_banks[sbank++]; + bitmap[bank] = sparse_banks[sbank++]; - if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) { - /* for all vcpus vp_index == vcpu_idx */ - return (unsigned long *)vp_bitmap; - } + if (likely(!has_mismatch)) + return; - bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) - __set_bit(i, vcpu_bitmap); + __set_bit(i, vcpu_mask); } - return vcpu_bitmap; } struct kvm_hv_hcall { @@ -1743,6 +1761,7 @@ struct kvm_hv_hcall { u64 ingpa; u64 outgpa; u16 code; + u16 var_cnt; u16 rep_cnt; u16 rep_idx; bool fast; @@ -1750,22 +1769,60 @@ struct kvm_hv_hcall { sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; }; -static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) +static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, + int consumed_xmm_halves, + u64 *sparse_banks, gpa_t offset) { + u16 var_cnt; int i; - gpa_t gpa; + + if (hc->var_cnt > 64) + return -EINVAL; + + /* Ignore banks that cannot possibly contain a legal VP index. */ + var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS); + + if (hc->fast) { + /* + * Each XMM holds two sparse banks, but do not count halves that + * have already been consumed for hypercall parameters. + */ + if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + for (i = 0; i < var_cnt; i++) { + int j = i + consumed_xmm_halves; + if (j % 2) + sparse_banks[i] = sse128_hi(hc->xmm[j / 2]); + else + sparse_banks[i] = sse128_lo(hc->xmm[j / 2]); + } + return 0; + } + + return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks, + var_cnt * sizeof(*sparse_banks)); +} + +static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) +{ struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); - unsigned long *vcpu_mask; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); u64 valid_bank_mask; - u64 sparse_banks[64]; - int sparse_banks_len; + u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; bool all_cpus; - if (!ex) { + /* + * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the + * valid mask is a u64. Fail the build if KVM's max allowed number of + * vCPUs (>4096) would exceed this limit, KVM will need additional changes + * for Hyper-V support to avoid setting the guest up to fail.
+ */ + BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64); + + if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || + hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) { if (hc->fast) { flush.address_space = hc->ingpa; flush.flags = hc->outgpa; @@ -1812,30 +1869,22 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64); + if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; - if (!sparse_banks_len && !all_cpus) + if (all_cpus) + goto do_flush; + + if (!hc->var_cnt) goto ret_success; - if (!all_cpus) { - if (hc->fast) { - if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1) - return HV_STATUS_INVALID_HYPERCALL_INPUT; - for (i = 0; i < sparse_banks_len; i += 2) { - sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]); - sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]); - } - } else { - gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents); - if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks, - sparse_banks_len * - sizeof(sparse_banks[0])))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; - } - } + if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks, + offsetof(struct hv_tlb_flush_ex, + hv_vp_set.bank_contents))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; } +do_flush: /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. @@ -1843,11 +1892,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool if (all_cpus) { kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); } else { - vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, - vcpu_mask); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask); } ret_success: @@ -1875,21 +1922,18 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, } } -static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) +static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; - u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); - unsigned long *vcpu_mask; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); unsigned long valid_bank_mask; - u64 sparse_banks[64]; - int sparse_banks_len; + u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; - if (!ex) { + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, sizeof(send_ipi)))) @@ -1908,9 +1952,15 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool trace_kvm_hv_send_ipi(vector, sparse_banks[0]); } else { - if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, - sizeof(send_ipi_ex)))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (!hc->fast) { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, + sizeof(send_ipi_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } else { + send_ipi_ex.vector = (u32)hc->ingpa; + send_ipi_ex.vp_set.format = hc->outgpa; + send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]); + } 
trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, send_ipi_ex.vp_set.format, @@ -1918,22 +1968,20 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; - sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * - sizeof(sparse_banks[0]); - all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (all_cpus) goto check_and_send_ipi; - if (!sparse_banks_len) + if (!hc->var_cnt) goto ret_success; - if (kvm_read_guest(kvm, - hc->ingpa + offsetof(struct hv_send_ipi_ex, - vp_set.bank_contents), - sparse_banks, - sparse_banks_len)) + if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks, + offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents))) return HV_STATUS_INVALID_HYPERCALL_INPUT; } @@ -1941,11 +1989,13 @@ check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); + if (all_cpus) { + kvm_send_ipi_to_many(kvm, vector, NULL); + } else { + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + } ret_success: return HV_STATUS_SUCCESS; @@ -2017,11 +2067,6 @@ int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) return ret; } -bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; -} - static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; @@ -2096,6 +2141,7 @@ static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + case HVCALL_SEND_IPI_EX: return true; } @@ -2191,19 +2237,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } hc.code = hc.param & 0xffff; + hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET; hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); - trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, - hc.ingpa, hc.outgpa); + trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt, + hc.rep_idx, hc.ingpa, hc.outgpa); if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } + if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + goto hypercall_complete; + } + if (hc.fast && is_xmm_fast_hypercall(&hc)) { if (unlikely(hv_vcpu->enforce_cpuid && !(hv_vcpu->cpuid_cache.features_edx & @@ -2217,14 +2269,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2234,7 +2286,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) fallthrough; /* maybe userspace knows this conn_id */ 
case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) { + if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2247,46 +2299,43 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_complete_userspace; return 0; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: - if (unlikely(!hc.rep_cnt || hc.rep_idx)) { + if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, &hc, false); - break; - case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: - if (unlikely(hc.rep)) { + fallthrough; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, &hc, false); + ret = kvm_hv_flush_tlb(vcpu, &hc); break; - case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: - if (unlikely(!hc.rep_cnt || hc.rep_idx)) { + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, &hc, true); - break; + fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, &hc, true); + ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_SEND_IPI: - if (unlikely(hc.rep)) { + if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_send_ipi(vcpu, &hc, false); - break; + fallthrough; case HVCALL_SEND_IPI_EX: - if (unlikely(hc.fast || hc.rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_send_ipi(vcpu, &hc, true); + ret = kvm_hv_send_ipi(vcpu, &hc); break; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: @@ -2417,10 +2466,6 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, if (kvm_x86_ops.nested_ops->get_evmcs_version) evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); - /* Skip NESTED_FEATURES if eVMCS is not supported */ - if (!evmcs_ver) - --nent; - if (cpuid->nent < nent) return -E2BIG; @@ -2520,8 +2565,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; - if (evmcs_ver) - ent->eax |= HV_X64_NESTED_MSR_BITMAP; + ent->eax |= HV_X64_NESTED_MSR_BITMAP; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index ed1c4e546d04..e19c00ee9ab3 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -89,7 +89,11 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); -bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu); +static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 814064d06016..be99dc86293d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -437,13 +437,13 @@ static u32 pic_ioport_read(void *opaque, u32 addr) return ret; } -static void elcr_ioport_write(void *opaque, u32 addr, u32 val) +static void elcr_ioport_write(void *opaque, u32 val) { struct kvm_kpic_state *s = opaque; s->elcr = val & 
s->elcr_mask; } -static u32 elcr_ioport_read(void *opaque, u32 addr1) +static u32 elcr_ioport_read(void *opaque) { struct kvm_kpic_state *s = opaque; return s->elcr; @@ -474,7 +474,7 @@ static int picdev_write(struct kvm_pic *s, case 0x4d0: case 0x4d1: pic_lock(s); - elcr_ioport_write(&s->pics[addr & 1], addr, data); + elcr_ioport_write(&s->pics[addr & 1], data); pic_unlock(s); break; default: @@ -505,7 +505,7 @@ static int picdev_read(struct kvm_pic *s, case 0x4d0: case 0x4d1: pic_lock(s); - *data = elcr_ioport_read(&s->pics[addr & 1], addr); + *data = elcr_ioport_read(&s->pics[addr & 1]); pic_unlock(s); break; default: diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index decfa36b7891..765943d7cfa5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -54,9 +54,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, int trigger_mode, int pin); -static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, - unsigned long addr, - unsigned long length) +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic) { unsigned long result = 0; @@ -593,7 +591,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, break; case IOAPIC_REG_WINDOW: - result = ioapic_read_indirect(ioapic, addr, len); + result = ioapic_read_indirect(ioapic); break; default: diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index b469f45e3fe4..ee4f696a0782 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -92,3 +92,17 @@ int hv_remote_flush_tlb(struct kvm *kvm) return hv_remote_flush_tlb_with_range(kvm, NULL); } EXPORT_SYMBOL_GPL(hv_remote_flush_tlb); + +void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) +{ + struct kvm_arch *kvm_arch = &vcpu->kvm->arch; + + if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + spin_lock(&kvm_arch->hv_root_tdp_lock); + vcpu->arch.hv_root_tdp = root_tdp; + if (root_tdp != kvm_arch->hv_root_tdp) + kvm_arch->hv_root_tdp = INVALID_PAGE; + spin_unlock(&kvm_arch->hv_root_tdp_lock); + } +} +EXPORT_SYMBOL_GPL(hv_track_root_tdp); diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index 1c67abf2eba9..287e98ef9df3 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -10,19 +10,7 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, struct kvm_tlb_range *range); int hv_remote_flush_tlb(struct kvm *kvm); - -static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) -{ - struct kvm_arch *kvm_arch = &vcpu->kvm->arch; - - if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { - spin_lock(&kvm_arch->hv_root_tdp_lock); - vcpu->arch.hv_root_tdp = root_tdp; - if (root_tdp != kvm_arch->hv_root_tdp) - kvm_arch->hv_root_tdp = INVALID_PAGE; - spin_unlock(&kvm_arch->hv_root_tdp_lock); - } -} +void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); #else /* !CONFIG_HYPERV */ static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9322e6340a74..80a2020c4db4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -68,6 +68,39 @@ static bool lapic_timer_advance_dynamic __read_mostly; /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) +{ + *((u32 *) (regs + reg_off)) = val; +} + +static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +{ + 
__kvm_lapic_set_reg(apic->regs, reg_off, val); +} + +static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg) +{ + BUILD_BUG_ON(reg != APIC_ICR); + return *((u64 *) (regs + reg)); +} + +static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg) +{ + return __kvm_lapic_get_reg64(apic->regs, reg); +} + +static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val) +{ + BUILD_BUG_ON(reg != APIC_ICR); + *((u64 *) (regs + reg)) = val; +} + +static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic, + int reg, u64 val) +{ + __kvm_lapic_set_reg64(apic->regs, reg, val); +} + static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -113,7 +146,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) { - return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); + return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) && + (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); } bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) @@ -491,8 +525,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call(kvm_x86_hwapic_irr_update)(vcpu, - apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -522,7 +555,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - static_call(kvm_x86_hwapic_isr_update)(vcpu, vec); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -570,8 +603,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - static_call(kvm_x86_hwapic_isr_update)(vcpu, - apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -1276,6 +1308,9 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { struct kvm_lapic_irq irq; + /* KVM has no delay and should always clear the BUSY/PENDING flag. 
*/ + WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); + irq.vector = icr_low & APIC_VECTOR_MASK; irq.delivery_mode = icr_low & APIC_MODE_MASK; irq.dest_mode = icr_low & APIC_DEST_MASK; @@ -1292,6 +1327,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } +EXPORT_SYMBOL_GPL(kvm_apic_send_ipi); static u32 apic_get_tmcct(struct kvm_lapic *apic) { @@ -1375,8 +1411,8 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) -int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data) +static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) { unsigned char alignment = offset & 0xf; u32 result; @@ -1394,7 +1430,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) | APIC_REG_MASK(APIC_ESR) | APIC_REG_MASK(APIC_ICR) | - APIC_REG_MASK(APIC_ICR2) | APIC_REG_MASK(APIC_LVTT) | APIC_REG_MASK(APIC_LVTTHMR) | APIC_REG_MASK(APIC_LVTPC) | @@ -1405,9 +1440,16 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); - /* ARBPRI is not valid on x2APIC */ + /* + * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR + * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be + * manually handled by the caller. + */ if (!apic_x2apic_mode(apic)) - valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); + valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) | + APIC_REG_MASK(APIC_ICR2); + else + WARN_ON_ONCE(offset == APIC_ICR); if (alignment + len > 4) return 1; @@ -1432,7 +1474,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, } return 0; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { @@ -1993,7 +2034,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) } } -int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) +static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -2052,16 +2093,18 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } case APIC_ICR: + WARN_ON_ONCE(apic_x2apic_mode(apic)); + /* No delay here, so we always clear the pending bit */ - val &= ~(1 << 12); + val &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); kvm_lapic_set_reg(apic, APIC_ICR, val); break; - case APIC_ICR2: - if (!apic_x2apic_mode(apic)) - val &= 0xff000000; - kvm_lapic_set_reg(apic, APIC_ICR2, val); + if (apic_x2apic_mode(apic)) + ret = 1; + else + kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000); break; case APIC_LVT0: @@ -2121,10 +2164,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_SELF_IPI: - if (apic_x2apic_mode(apic)) { - kvm_lapic_reg_write(apic, APIC_ICR, - APIC_DEST_SELF | (val & APIC_VECTOR_MASK)); - } else + if (apic_x2apic_mode(apic)) + kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0); + else ret = 1; break; default: @@ -2132,11 +2174,15 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } + /* + * Recalculate APIC maps if necessary, e.g. if the software enable bit + * was toggled, the APIC ID changed, etc... The maps are marked dirty + * on relevant changes, i.e. this is a nop for most writes. 
+ */ kvm_recalculate_apic_map(apic->vcpu->kvm); return ret; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) @@ -2180,12 +2226,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = 0; - - /* hw has done the conditional check and inst decode */ - offset &= 0xff0; - - kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val); + u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset); /* TODO: optimize to just emulate side effect w/o one more write */ kvm_lapic_reg_write(vcpu->arch.apic, offset, val); @@ -2242,10 +2283,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { - struct kvm_lapic *apic = vcpu->arch.apic; - - apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4)); + apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) @@ -2287,7 +2325,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) - static_call(kvm_x86_set_virtual_apic_mode)(vcpu); + static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2356,8 +2394,12 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, 0); kvm_lapic_set_reg(apic, APIC_ESR, 0); - kvm_lapic_set_reg(apic, APIC_ICR, 0); - kvm_lapic_set_reg(apic, APIC_ICR2, 0); + if (!apic_x2apic_mode(apic)) { + kvm_lapic_set_reg(apic, APIC_ICR, 0); + kvm_lapic_set_reg(apic, APIC_ICR2, 0); + } else { + kvm_lapic_set_reg64(apic, APIC_ICR, 0); + } kvm_lapic_set_reg(apic, APIC_TDCR, 0); kvm_lapic_set_reg(apic, APIC_TMICT, 0); for (i = 0; i < 8; i++) { @@ -2373,9 +2415,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { - static_call(kvm_x86_apicv_post_state_restore)(vcpu); - static_call(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call(kvm_x86_hwapic_isr_update)(vcpu, -1); + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; @@ -2574,6 +2616,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, if (apic_x2apic_mode(vcpu->arch.apic)) { u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); + u64 icr; if (vcpu->kvm->arch.x2apic_format) { if (*id != vcpu->vcpu_id) @@ -2585,9 +2628,21 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, *id <<= 24; } - /* In x2APIC mode, the LDR is fixed and based on the id */ - if (set) + /* + * In x2APIC mode, the LDR is fixed and based on the id. And + * ICR is internally a single 64-bit register, but needs to be + * split to ICR+ICR2 in userspace for backwards compatibility. 
+ */ + if (set) { *ldr = kvm_apic_calc_x2apic_ldr(*id); + + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + } else { + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + } } return 0; @@ -2638,11 +2693,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - static_call(kvm_x86_apicv_post_state_restore)(vcpu); - static_call(kvm_x86_hwapic_irr_update)(vcpu, - apic_find_highest_irr(apic)); - static_call(kvm_x86_hwapic_isr_update)(vcpu, - apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) @@ -2779,73 +2832,85 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) return 0; } -int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) { - struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = (msr - APIC_BASE_MSR) << 4; + data &= ~APIC_ICR_BUSY; - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) - return 1; + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + kvm_lapic_set_reg64(apic, APIC_ICR, data); + trace_kvm_apic_write(APIC_ICR, data); + return 0; +} + +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) +{ + u32 low; + + if (reg == APIC_ICR) { + *data = kvm_lapic_get_reg64(apic, APIC_ICR); + return 0; + } - if (reg == APIC_ICR2) + if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; - /* if this is ICR write vector before command */ + *data = low; + + return 0; +} + +static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data) +{ + /* + * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and + * can be written as such, all other registers remain accessible only + * through 32-bit reads/writes.
+ */ if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_x2apic_icr_write(apic, data); + return kvm_lapic_reg_write(apic, reg, (u32)data); } -int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; + u32 reg = (msr - APIC_BASE_MSR) << 4; if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_DFR || reg == APIC_ICR2) - return 1; + return kvm_lapic_msr_write(apic, reg, data); +} - if (kvm_lapic_reg_read(apic, reg, 4, &low)) +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4; + + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_ICR) - kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); - *data = (((u64)high) << 32) | low; + if (reg == APIC_DFR) + return 1; - return 0; + return kvm_lapic_msr_read(apic, reg, data); } int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { - struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu)) return 1; - /* if this is ICR write vector before command */ - if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(apic, reg, (u32)data); + return kvm_lapic_msr_write(vcpu->arch.apic, reg, data); } int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) { - struct kvm_lapic *apic = vcpu->arch.apic; - u32 low, high = 0; - if (!lapic_in_kernel(vcpu)) return 1; - if (kvm_lapic_reg_read(apic, reg, 4, &low)) - return 1; - if (reg == APIC_ICR) - kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); - - *data = (((u64)high) << 32) | low; - - return 0; + return kvm_lapic_msr_read(vcpu->arch.apic, reg, data); } int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) @@ -2933,7 +2998,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector); + static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2b44e533fc8d..4e4f8a22754f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -85,9 +85,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); -int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); @@ -121,6 +118,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); @@ -153,19 +151,14 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic 
*apic) apic->irr_pending = true; } -static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) +static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off) { - return *((u32 *) (apic->regs + reg_off)); + return *((u32 *) (regs + reg_off)); } -static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) -{ - *((u32 *) (regs + reg_off)) = val; -} - -static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) { - __kvm_lapic_set_reg(apic->regs, reg_off, val); + return __kvm_lapic_get_reg(apic->regs, reg_off); } DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e9fbb2c8bbe2..bf8dbc4bb12a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -48,6 +48,7 @@ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE) #define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP) +#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX) static __always_inline u64 rsvd_bits(int s, int e) { @@ -79,12 +80,13 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); +void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { - if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE)) + if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE)) return 0; return kvm_mmu_load(vcpu); @@ -106,7 +108,7 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) { - u64 root_hpa = vcpu->arch.mmu->root_hpa; + u64 root_hpa = vcpu->arch.mmu->root.hpa; if (!VALID_PAGE(root_hpa)) return; @@ -203,44 +205,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } /* - * Currently, we have two sorts of write-protection, a) the first one - * write-protects guest page to sync the guest modification, b) another one is - * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences - * between these two sorts are: - * 1) the first case clears MMU-writable bit. - * 2) the first case requires flushing tlb immediately avoiding corrupting - * shadow page table between all vcpus so it should be in the protection of - * mmu-lock. And the another case does not need to flush tlb until returning - * the dirty bitmap to userspace since it only write-protects the page - * logged in the bitmap, that means the page in the dirty bitmap is not - * missed, so it can flush tlb out of mmu-lock. - * - * So, there is the problem: the first case can meet the corrupted tlb caused - * by another case which write-protects pages but without flush tlb - * immediately. In order to making the first case be aware this problem we let - * it flush tlb if we try to write-protect a spte whose MMU-writable bit - * is set, it works since another case never touches MMU-writable bit. - * - * Anyway, whenever a spte is updated (only permission and status bits are - * changed) we need to check whether the spte with MMU-writable becomes - * readonly, if that happens, we need to flush tlb. Fortunately, - * mmu_spte_update() has already handled it perfectly. 
- * - * The rules to use MMU-writable and PT_WRITABLE_MASK: - * - if we want to see if it has writable tlb entry or if the spte can be - * writable on the mmu mapping, check MMU-writable, this is the most - * case, otherwise - * - if we fix page fault on the spte or do write-protection by dirty logging, - * check PT_WRITABLE_MASK. - * - * TODO: introduce APIs to split these two cases. - */ -static inline bool is_writable_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - -/* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE * access rights (in ACC_* format). diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5628d0ba637e..51671cb34fb6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -104,15 +104,6 @@ static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; -enum { - AUDIT_PRE_PAGE_FAULT, - AUDIT_POST_PAGE_FAULT, - AUDIT_PRE_PTE_WRITE, - AUDIT_POST_PTE_WRITE, - AUDIT_PRE_SYNC, - AUDIT_POST_SYNC -}; - #ifdef MMU_DEBUG bool dbg = 0; module_param(dbg, bool, 0644); @@ -190,8 +181,6 @@ struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); -static union kvm_mmu_page_role -kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); struct kvm_mmu_role_regs { const unsigned long cr0; @@ -529,6 +518,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) u64 old_spte = *sptep; WARN_ON(!is_shadow_present_pte(new_spte)); + check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); @@ -548,11 +538,9 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. + * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote + * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only + * spte, even though the writable spte might be cached on a CPU's TLB. * * Returns true if the TLB needs to be flushed */ @@ -646,24 +634,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -/* Restore an acc-track PTE back to a regular PTE */ -static u64 restore_acc_track_spte(u64 spte) -{ - u64 new_spte = spte; - u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) - & SHADOW_ACC_TRACK_SAVED_BITS_MASK; - - WARN_ON_ONCE(spte_ad_enabled(spte)); - WARN_ON_ONCE(!is_access_track_spte(spte)); - - new_spte &= ~shadow_acc_track_mask; - new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << - SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); - new_spte |= saved_bits; - - return new_spte; -} - /* Returns the Accessed status of the PTE and resets it at the same time. 
*/ static bool mmu_spte_age(u64 *sptep) { @@ -1229,9 +1199,8 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) return mmu_spte_update(sptep, spte); } -static bool __rmap_write_protect(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - bool pt_protect) +static bool rmap_write_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect) { u64 *sptep; struct rmap_iterator iter; @@ -1311,7 +1280,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); - __rmap_write_protect(kvm, rmap_head, false); + rmap_write_protect(rmap_head, false); /* clear the first set bit */ mask &= mask - 1; @@ -1378,6 +1347,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); + if (READ_ONCE(eager_page_split)) + kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K); + kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); /* Cross two large pages? */ @@ -1410,7 +1382,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); + write_protected |= rmap_write_protect(rmap_head, true); } } @@ -1421,7 +1393,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, return write_protected; } -static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; @@ -1921,13 +1893,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, return true; } -#ifdef CONFIG_KVM_MMU_AUDIT -#include "mmu_audit.c" -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } -static void mmu_audit_disable(void) { } -#endif - static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->role.invalid) @@ -2024,7 +1989,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, bool protected = false; for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu, sp->gfn); + protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn); if (protected) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); @@ -2149,7 +2114,7 @@ trace_get_page: hlist_add_head(&sp->hash_link, sp_list); if (!direct) { account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) + if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); } trace_kvm_mmu_get_page(sp, true); @@ -2179,7 +2144,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato * prev_root is currently only used for 64-bit hosts. So only * the active root_hpa is valid here. 
*/ - BUG_ON(root != vcpu->arch.mmu->root_hpa); + BUG_ON(root != vcpu->arch.mmu->root.hpa); iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; @@ -2193,7 +2158,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, u64 addr) { - shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa, + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa, addr); } @@ -2307,7 +2272,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, return zapped; } -static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; @@ -2345,13 +2310,13 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, struct list_head *invalid_list, int *nr_zapped) { - bool list_unstable; + bool list_unstable, zapped_root = false; trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); - kvm_mmu_unlink_parents(kvm, sp); + kvm_mmu_unlink_parents(sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; @@ -2387,14 +2352,20 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also * treats invalid shadow pages as being obsolete. */ - if (!is_obsolete_sp(kvm, sp)) - kvm_reload_remote_mmus(kvm); + zapped_root = !is_obsolete_sp(kvm, sp); } if (sp->lpage_disallowed) unaccount_huge_nx_page(kvm, sp); sp->role.invalid = 1; + + /* + * Make the request to free obsolete roots after marking the root + * invalid, otherwise other vCPUs may not see it as invalid. + */ + if (zapped_root) + kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); return list_unstable; } @@ -3239,6 +3210,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); + if (WARN_ON(!sp)) + return; if (is_tdp_mmu_page(sp)) kvm_tdp_mmu_put_root(kvm, sp, false); @@ -3249,18 +3222,20 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, } /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, +void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free) { - struct kvm *kvm = vcpu->kvm; int i; LIST_HEAD(invalid_list); - bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; + bool free_active_root; BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. 
*/ - if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { + free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT) + && VALID_PAGE(mmu->root.hpa); + + if (!free_active_root) { for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && VALID_PAGE(mmu->prev_roots[i].hpa)) @@ -3278,9 +3253,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, &invalid_list); if (free_active_root) { - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); + if (to_shadow_page(mmu->root.hpa)) { + mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); } else if (mmu->pae_root) { for (i = 0; i < 4; ++i) { if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) @@ -3291,8 +3265,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, mmu->pae_root[i] = INVALID_PAE_ROOT; } } - mmu->root_hpa = INVALID_PAGE; - mmu->root_pgd = 0; + mmu->root.hpa = INVALID_PAGE; + mmu->root.pgd = 0; } kvm_mmu_commit_zap_page(kvm, &invalid_list); @@ -3300,7 +3274,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); -void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; hpa_t root_hpa; @@ -3322,7 +3296,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } - kvm_mmu_free_roots(vcpu, mmu, roots_to_free); + kvm_mmu_free_roots(kvm, mmu, roots_to_free); } EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); @@ -3365,10 +3339,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (is_tdp_mmu_enabled(vcpu->kvm)) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); - mmu->root_hpa = root; + mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); - mmu->root_hpa = root; + mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { r = -EIO; @@ -3383,15 +3357,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_mask; } - mmu->root_hpa = __pa(mmu->pae_root); + mmu->root.hpa = __pa(mmu->pae_root); } else { WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); r = -EIO; goto out_unlock; } - /* root_pgd is ignored for direct MMUs. */ - mmu->root_pgd = 0; + /* root.pgd is ignored for direct MMUs. 
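kvm_mmu_free_roots() is driven entirely by the roots_to_free bitmask: one flag for the current root plus one per cached previous root, which is exactly the mask kvm_mmu_free_guest_mode_roots() assembles. A toy model of that flag convention, with invented mask values standing in for KVM_MMU_ROOT_CURRENT and KVM_MMU_ROOT_PREVIOUS(i) and a made-up cache size:

#include <stdio.h>

#define NUM_PREV_ROOTS	3			/* stand-in for KVM_MMU_NUM_PREV_ROOTS */
#define ROOT_CURRENT	(1ul << 0)		/* stand-in for KVM_MMU_ROOT_CURRENT */
#define ROOT_PREVIOUS(i)	(1ul << (1 + (i)))	/* stand-in for KVM_MMU_ROOT_PREVIOUS(i) */

static void free_roots(unsigned long roots_to_free)
{
	int i;

	if (roots_to_free & ROOT_CURRENT)
		printf("freeing current root\n");

	for (i = 0; i < NUM_PREV_ROOTS; i++)
		if (roots_to_free & ROOT_PREVIOUS(i))
			printf("freeing prev_roots[%d]\n", i);
}

int main(void)
{
	/* Free the current root and the second cached previous root. */
	free_roots(ROOT_CURRENT | ROOT_PREVIOUS(1));
	return 0;
}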
*/ + mmu->root.pgd = 0; out_unlock: write_unlock(&vcpu->kvm->mmu_lock); return r; @@ -3504,7 +3478,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu->root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, mmu->shadow_root_level, false); - mmu->root_hpa = root; + mmu->root.hpa = root; goto set_root_pgd; } @@ -3554,14 +3528,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) - mmu->root_hpa = __pa(mmu->pml5_root); + mmu->root.hpa = __pa(mmu->pml5_root); else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) - mmu->root_hpa = __pa(mmu->pml4_root); + mmu->root.hpa = __pa(mmu->pml4_root); else - mmu->root_hpa = __pa(mmu->pae_root); + mmu->root.hpa = __pa(mmu->pae_root); set_root_pgd: - mmu->root_pgd = root_pgd; + mmu->root.pgd = root_pgd; out_unlock: write_unlock(&vcpu->kvm->mmu_lock); @@ -3660,6 +3634,14 @@ static bool is_unsync_root(hpa_t root) */ smp_rmb(); sp = to_shadow_page(root); + + /* + * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the + * PDPTEs for a given PAE root need to be synchronized individually. + */ + if (WARN_ON_ONCE(!sp)) + return false; + if (sp->unsync || sp->unsync_children) return true; @@ -3674,30 +3656,25 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->direct_map) return; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root.hpa)) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu->root_hpa; + hpa_t root = vcpu->arch.mmu->root.hpa; sp = to_shadow_page(root); if (!is_unsync_root(root)) return; write_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - mmu_sync_children(vcpu, sp, true); - - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); return; } write_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; @@ -3709,7 +3686,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } } - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); } @@ -3723,7 +3699,7 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); /* sync prev_roots by simply freeing them */ - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free); } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, @@ -3982,7 +3958,7 @@ out_retry: static bool is_page_fault_stale(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int mmu_seq) { - struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa); + struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); /* Special roots, e.g. pae_root, are not backed by shadow pages. */ if (sp && is_obsolete_sp(vcpu->kvm, sp)) @@ -3996,7 +3972,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs * to reload even if no vCPU is actively using the root. 
*/ - if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu)) + if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; return fault->slot && @@ -4132,74 +4108,105 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && + VALID_PAGE(root->hpa) && role.word == to_shadow_page(root->hpa)->role.word; } /* - * Find out if a previously cached root matching the new pgd/role is available. - * The current root is also inserted into the cache. - * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is - * returned. - * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and - * false is returned. This root should now be freed by the caller. + * Find out if a previously cached root matching the new pgd/role is available, + * and insert the current root as the MRU in the cache. + * If a matching root is found, it is assigned to kvm_mmu->root and + * true is returned. + * If no match is found, kvm_mmu->root is left invalid, the LRU root is + * evicted to make room for the current root, and false is returned. */ -static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd, - union kvm_mmu_page_role new_role) +static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu, + gpa_t new_pgd, + union kvm_mmu_page_role new_role) { uint i; - struct kvm_mmu_root_info root; - struct kvm_mmu *mmu = vcpu->arch.mmu; - - root.pgd = mmu->root_pgd; - root.hpa = mmu->root_hpa; - if (is_root_usable(&root, new_pgd, new_role)) + if (is_root_usable(&mmu->root, new_pgd, new_role)) return true; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { - swap(root, mmu->prev_roots[i]); - - if (is_root_usable(&root, new_pgd, new_role)) - break; + /* + * The swaps end up rotating the cache like this: + * C 0 1 2 3 (on entry to the function) + * 0 C 1 2 3 + * 1 C 0 2 3 + * 2 C 0 1 3 + * 3 C 0 1 2 (on exit from the loop) + */ + swap(mmu->root, mmu->prev_roots[i]); + if (is_root_usable(&mmu->root, new_pgd, new_role)) + return true; } - mmu->root_hpa = root.hpa; - mmu->root_pgd = root.pgd; - - return i < KVM_MMU_NUM_PREV_ROOTS; + kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); + return false; } -static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, - union kvm_mmu_page_role new_role) +/* + * Find out if a previously cached root matching the new pgd/role is available. + * On entry, mmu->root is invalid. + * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry + * of the cache becomes invalid, and true is returned. + * If no match is found, kvm_mmu->root is left invalid and false is returned. + */ +static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu, + gpa_t new_pgd, + union kvm_mmu_page_role new_role) { - struct kvm_mmu *mmu = vcpu->arch.mmu; + uint i; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role)) + goto hit; + + return false; +hit: + swap(mmu->root, mmu->prev_roots[i]); + /* Bubble up the remaining roots. 
*/ + for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++) + mmu->prev_roots[i] = mmu->prev_roots[i + 1]; + mmu->prev_roots[i].hpa = INVALID_PAGE; + return true; +} + +static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, + gpa_t new_pgd, union kvm_mmu_page_role new_role) +{ /* - * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid + * For now, limit the caching to 64-bit hosts+VMs in order to avoid * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs * later if necessary. */ - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - mmu->root_level >= PT64_ROOT_4LEVEL) - return cached_root_available(vcpu, new_pgd, new_role); + if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa)) + kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); - return false; + if (VALID_PAGE(mmu->root.hpa)) + return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role); + else + return cached_root_find_without_current(kvm, mmu, new_pgd, new_role); } -static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, - union kvm_mmu_page_role new_role) +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) { - if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); + struct kvm_mmu *mmu = vcpu->arch.mmu; + union kvm_mmu_page_role new_role = mmu->mmu_role.base; + + if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) { + /* kvm_mmu_ensure_valid_pgd will set up a new root. */ return; } /* * It's possible that the cached previous root page is obsolete because * of a change in the MMU generation number. However, changing the - * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will - * free the root set here and allocate a new one. + * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, + * which will free the root set here and allocate a new one. */ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); @@ -4222,12 +4229,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ if (!new_role.direct) __clear_sp_write_flooding_count( - to_shadow_page(vcpu->arch.mmu->root_hpa)); -} - -void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) -{ - __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu)); + to_shadow_page(vcpu->arch.mmu->root.hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); @@ -4485,8 +4487,7 @@ static inline bool boot_cpu_is_amd(void) * possible, however, kvm currently does not do execution-protection. */ static void -reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; @@ -4517,8 +4518,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, * is the shadow page table for intel nested guest. 
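The rotation diagram in the hunk above is easiest to verify outside the kernel. Below is a userspace model of both lookups, with plain ints standing in for pgd/role pairs: find_and_keep_current() rotates the current root into the MRU slot as it searches, and find_without_current() bubbles the remaining entries up after a hit. This is a sketch of the caching discipline only, not the kernel code:

#include <stdio.h>
#include <stdbool.h>

#define NPREV 4

static void swap_int(int *a, int *b) { int t = *a; *a = *b; *b = t; }

/* Current root becomes MRU; each miss pushes it one slot deeper. */
static bool find_and_keep_current(int *root, int prev[NPREV], int want)
{
	if (*root == want)
		return true;
	for (int i = 0; i < NPREV; i++) {
		swap_int(root, &prev[i]);	/* C 0 1 2 3 -> 0 C 1 2 3 -> ... */
		if (*root == want)
			return true;
	}
	return false;	/* caller now frees the LRU value left in *root */
}

/* No current root: on a hit, bubble the remaining entries up one slot. */
static bool find_without_current(int *root, int prev[NPREV], int want)
{
	int i;

	for (i = 0; i < NPREV; i++)
		if (prev[i] == want)
			goto hit;
	return false;
hit:
	swap_int(root, &prev[i]);
	for (; i < NPREV - 1; i++)
		prev[i] = prev[i + 1];
	prev[i] = -1;	/* INVALID_PAGE stand-in */
	return true;
}

int main(void)
{
	int root = 100, prev[NPREV] = { 1, 2, 3, 4 };

	printf("hit=%d root=%d\n", find_and_keep_current(&root, prev, 3), root);
	root = -1;	/* KVM only calls the second helper with an invalid current root */
	printf("hit=%d root=%d\n", find_without_current(&root, prev, 1), root);
	return 0;
}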
*/ static void -reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) +reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, reserved_hpa_bits(), execonly, @@ -4805,7 +4805,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, context); - reset_tdp_shadow_zero_bits_mask(vcpu, context); + reset_tdp_shadow_zero_bits_mask(context); } static union kvm_mmu_role @@ -4899,9 +4899,8 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, new_role = kvm_calc_shadow_npt_root_page_role(vcpu, ®s); - __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); - shadow_mmu_init_context(vcpu, context, ®s, new_role); + kvm_mmu_new_pgd(vcpu, nested_cr3); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); @@ -4939,27 +4938,25 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base); - - if (new_role.as_u64 == context->mmu_role.as_u64) - return; - - context->mmu_role.as_u64 = new_role.as_u64; - - context->shadow_root_level = level; - - context->ept_ad = accessed_dirty; - context->page_fault = ept_page_fault; - context->gva_to_gpa = ept_gva_to_gpa; - context->sync_page = ept_sync_page; - context->invlpg = ept_invlpg; - context->root_level = level; - context->direct_map = false; + if (new_role.as_u64 != context->mmu_role.as_u64) { + context->mmu_role.as_u64 = new_role.as_u64; + + context->shadow_root_level = level; + + context->ept_ad = accessed_dirty; + context->page_fault = ept_page_fault; + context->gva_to_gpa = ept_gva_to_gpa; + context->sync_page = ept_sync_page; + context->invlpg = ept_invlpg; + context->root_level = level; + context->direct_map = false; + update_permission_bitmask(context, true); + context->pkru_mask = 0; + reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); + reset_ept_shadow_zero_bits_mask(context, execonly); + } - update_permission_bitmask(context, true); - context->pkru_mask = 0; - reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); - reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); + kvm_mmu_new_pgd(vcpu, new_eptp); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -5044,20 +5041,6 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_init_mmu); -static union kvm_mmu_page_role -kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); - union kvm_mmu_role role; - - if (tdp_enabled) - role = kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, true); - else - role = kvm_calc_shadow_mmu_root_page_role(vcpu, ®s, true); - - return role.base; -} - void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) { /* @@ -5111,17 +5094,73 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); kvm_mmu_load_pgd(vcpu); - static_call(kvm_x86_tlb_flush_current)(vcpu); + + /* + * Flush any TLB entries for the new root, the provenance of the root + * is unknown. Even if KVM ensures there are no stale TLB entries + * for a freed root, in theory another hypervisor could have left + * stale entries. Flushing on alloc also allows KVM to skip the TLB + * flush when freeing a root (see kvm_tdp_mmu_put_root()). 
+	 */
+	static_call(kvm_x86_flush_tlb_current)(vcpu);
 out:
 	return r;
 }
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-	kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
-	WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
-	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
-	WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
+	struct kvm *kvm = vcpu->kvm;
+
+	kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+	WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+	kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+	WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+}
+
+static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
+{
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(root_hpa))
+		return false;
+
+	/*
+	 * When freeing obsolete roots, treat roots as obsolete if they don't
+	 * have an associated shadow page.  This does mean KVM will get false
+	 * positives and free roots that don't strictly need to be freed, but
+	 * such false positives are relatively rare:
+	 *
+	 *  (a) only PAE paging and nested NPT have roots without shadow pages
+	 *  (b) remote reloads due to a memslot update obsolete _all_ roots
+	 *  (c) KVM doesn't track previous roots for PAE paging, and the guest
+	 *      is unlikely to zap an in-use PGD.
+	 */
+	sp = to_shadow_page(root_hpa);
+	return !sp || is_obsolete_sp(kvm, sp);
+}
+
+static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+	unsigned long roots_to_free = 0;
+	int i;
+
+	if (is_obsolete_root(kvm, mmu->root.hpa))
+		roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+		if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
+			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+	}
+
+	if (roots_to_free)
+		kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
+{
+	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
 }
 
 static bool need_remote_flush(u64 old, u64 new)
@@ -5271,7 +5310,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
 
 	++vcpu->kvm->stat.mmu_pte_write;
-	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
 		if (detect_write_misaligned(sp, gpa, bytes) ||
@@ -5296,7 +5334,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		}
 	}
 	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
-	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	write_unlock(&vcpu->kvm->mmu_lock);
 }
 
@@ -5306,7 +5343,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 	int r, emulation_type = EMULTYPE_PF;
 	bool direct = vcpu->arch.mmu->direct_map;
 
-	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
 		return RET_PF_RETRY;
 
 	r = RET_PF_INVALID;
@@ -5371,14 +5408,14 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 		if (is_noncanonical_address(gva, vcpu))
 			return;
 
-		static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+		static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
 	}
 
 	if (!mmu->invlpg)
 		return;
 
 	if (root_hpa == INVALID_PAGE) {
-		mmu->invlpg(vcpu, gva, mmu->root_hpa);
+		mmu->invlpg(vcpu, gva, mmu->root.hpa);
 
 		/*
 		 * INVLPG is required to invalidate any global mappings for the VA,
@@ -5414,7 +5451,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	uint i;
 
 	if (pcid == kvm_get_active_pcid(vcpu)) {
-		mmu->invlpg(vcpu, gva, mmu->root_hpa);
+		mmu->invlpg(vcpu, gva, mmu->root.hpa);
 		tlb_flush = true;
 	}
 
@@ -5427,7 +5464,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	}
 
 	if (tlb_flush)
-		static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+		static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
 
 	++vcpu->stat.invlpg;
 
@@ -5527,8 +5564,8 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 	struct page *page;
 	int i;
 
-	mmu->root_hpa = INVALID_PAGE;
-	mmu->root_pgd = 0;
+	mmu->root.hpa = INVALID_PAGE;
+	mmu->root.pgd = 0;
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 		mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 
@@ -5648,9 +5685,13 @@ restart:
 	}
 
 	/*
-	 * Trigger a remote TLB flush before freeing the page tables to ensure
-	 * KVM is not in the middle of a lockless shadow page table walk, which
-	 * may reference the pages.
+	 * Kick all vCPUs (via remote TLB flush) before freeing the page tables
+	 * to ensure KVM is not in the middle of a lockless shadow page table
+	 * walk, which may reference the pages.  The remote TLB flush itself is
+	 * not required and is simply a convenient way to kick vCPUs as needed.
+	 * KVM performs a local TLB flush when allocating a new root (see
+	 * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
+	 * running with an obsolete MMU.
 	 */
 	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
 }
 
@@ -5680,11 +5721,11 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 	 */
 	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
 
-	/* In order to ensure all threads see this change when
-	 * handling the MMU reload signal, this must happen in the
-	 * same critical section as kvm_reload_remote_mmus, and
-	 * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
-	 * could drop the MMU lock and yield.
+	/*
+	 * In order to ensure all vCPUs drop their soon-to-be invalid roots,
+	 * invalidating TDP MMU roots must be done while holding mmu_lock for
+	 * write and in the same critical section as making the reload request,
+	 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
 	 */
 	if (is_tdp_mmu_enabled(kvm))
 		kvm_tdp_mmu_invalidate_all_roots(kvm);
 
@@ -5697,17 +5738,22 @@
 	 * Note: we need to do this under the protection of mmu_lock,
 	 * otherwise, vcpu would purge shadow page but miss tlb flush.
 	 */
-	kvm_reload_remote_mmus(kvm);
+	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
 
 	kvm_zap_obsolete_pages(kvm);
 
 	write_unlock(&kvm->mmu_lock);
 
-	if (is_tdp_mmu_enabled(kvm)) {
-		read_lock(&kvm->mmu_lock);
+	/*
+	 * Zap the invalidated TDP MMU roots; all SPTEs must be dropped before
+	 * returning to the caller, e.g. if the zap is in response to a memslot
+	 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
+	 * associated with the deleted memslot once the update completes.
+	 * Deferring the zap until the final reference to the root is put would
+	 * lead to use-after-free.
+	 */
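The fast-zap path above hinges on a single mmu_valid_gen bit: flipping it obsoletes every shadow page minted under the old generation, and each vCPU then frees any root whose backing page is obsolete or invalid. A toy model of that generation test; the struct fields are invented for the sketch:

#include <stdio.h>
#include <stdbool.h>

struct vm   { unsigned long mmu_valid_gen; };	/* toggles between 0 and 1 */
struct page { unsigned long mmu_valid_gen; bool invalid; };

static bool is_obsolete(struct vm *vm, struct page *sp)
{
	/* Invalid pages are treated as obsolete too, as in is_obsolete_sp(). */
	return sp->invalid || sp->mmu_valid_gen != vm->mmu_valid_gen;
}

int main(void)
{
	struct vm vm = { .mmu_valid_gen = 0 };
	struct page root = { .mmu_valid_gen = vm.mmu_valid_gen };

	printf("before flip: obsolete=%d\n", is_obsolete(&vm, &root));

	/* kvm_mmu_zap_all_fast(): one store obsoletes every existing page. */
	vm.mmu_valid_gen = vm.mmu_valid_gen ? 0 : 1;

	printf("after flip:  obsolete=%d\n", is_obsolete(&vm, &root));
	return 0;
}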
+	if (is_tdp_mmu_enabled(kvm))
 		kvm_tdp_mmu_zap_invalidated_roots(kvm);
-		read_unlock(&kvm->mmu_lock);
-	}
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5813,7 +5859,7 @@ static bool slot_rmap_write_protect(struct kvm *kvm,
 				    struct kvm_rmap_head *rmap_head,
 				    const struct kvm_memory_slot *slot)
 {
-	return __rmap_write_protect(kvm, rmap_head, false);
+	return rmap_write_protect(rmap_head, false);
 }
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -5857,12 +5903,52 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 	 * will clear a separate software-only bit (MMU-writable) and skip the
 	 * flush if-and-only-if this bit was already clear.
 	 *
-	 * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
+	 * See is_writable_pte() for more details.
 	 */
 	if (flush)
 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+				  const struct kvm_memory_slot *memslot,
+				  u64 start, u64 end,
+				  int target_level)
+{
+	if (is_tdp_mmu_enabled(kvm))
+		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+						 target_level, false);
+
+	/*
+	 * A TLB flush is unnecessary at this point for the same reasons as in
+	 * kvm_mmu_slot_try_split_huge_pages().
+	 */
+}
+
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+				       const struct kvm_memory_slot *memslot,
+				       int target_level)
+{
+	u64 start = memslot->base_gfn;
+	u64 end = start + memslot->npages;
+
+	if (is_tdp_mmu_enabled(kvm)) {
+		read_lock(&kvm->mmu_lock);
+		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+		read_unlock(&kvm->mmu_lock);
+	}
+
+	/*
+	 * No TLB flush is necessary here. KVM will flush TLBs after
+	 * write-protecting and/or clearing dirty on the newly split SPTEs to
+	 * ensure that guest writes are reflected in the dirty log before the
+	 * ioctl to enable dirty logging on this memslot completes. Since the
+	 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+	 * safe for KVM to decide if a TLB flush is necessary based on the split
+	 * SPTEs.
+	 */
+}
+
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 					 struct kvm_rmap_head *rmap_head,
 					 const struct kvm_memory_slot *slot)
@@ -6202,7 +6288,6 @@ void kvm_mmu_module_exit(void)
 	mmu_destroy_caches();
 	percpu_counter_destroy(&kvm_total_used_mmu_pages);
 	unregister_shrinker(&mmu_shrinker);
-	mmu_audit_disable();
 }
 
 /*
@@ -6272,6 +6357,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
 	rcu_idx = srcu_read_lock(&kvm->srcu);
 	write_lock(&kvm->mmu_lock);
 
+	/*
+	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+	 * be done under RCU protection, because the pages are freed via RCU
+	 * callback.
+	 */
+	rcu_read_lock();
+
 	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
 	to_zap = ratio ?
DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { @@ -6296,12 +6388,18 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + rcu_read_unlock(); + cond_resched_rwlock_write(&kvm->mmu_lock); flush = false; + + rcu_read_lock(); } } kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + rcu_read_unlock(); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c deleted file mode 100644 index 9e7dcf999f08..000000000000 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * mmu_audit.c: - * - * Audit code for KVM MMU - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * Marcelo Tosatti <mtosatti@redhat.com> - * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> - */ - -#include <linux/ratelimit.h> - -static char const *audit_point_name[] = { - "pre page fault", - "post page fault", - "pre pte write", - "post pte write", - "pre sync", - "post sync" -}; - -#define audit_printk(kvm, fmt, args...) \ - printk(KERN_ERR "audit: (%s) error: " \ - fmt, audit_point_name[kvm->arch.audit_point], ##args) - -typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); - -static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - inspect_spte_fn fn, int level) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 *ent = sp->spt; - - fn(vcpu, ent + i, level); - - if (is_shadow_present_pte(ent[i]) && - !is_last_spte(ent[i], level)) { - struct kvm_mmu_page *child; - - child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(vcpu, child, fn, level - 1); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - - if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu->root_hpa; - - sp = to_shadow_page(root); - __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); - return; - } - - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu->pae_root[i]; - - if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = to_shadow_page(root); - __mmu_spte_walk(vcpu, sp, fn, 2); - } - } - - return; -} - -typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); - -static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) - fn(kvm, sp); -} - -static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp; - gfn_t gfn; - kvm_pfn_t pfn; - hpa_t hpa; - - sp = sptep_to_sp(sptep); - - if (sp->unsync) { - if (level != PG_LEVEL_4K) { - audit_printk(vcpu->kvm, "unsync sp: %p " - "level = %d\n", sp, level); - return; - } - } - - if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) - return; - - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); - - if (is_error_pfn(pfn)) - return; - - hpa = pfn << PAGE_SHIFT; - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " - "ent %llxn", vcpu->arch.mmu->root_level, pfn, - hpa, *sptep); -} - -static void 
inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - struct kvm_rmap_head *rmap_head; - struct kvm_mmu_page *rev_sp; - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - gfn_t gfn; - - rev_sp = sptep_to_sp(sptep); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); - slot = __gfn_to_memslot(slots, gfn); - if (!slot) { - if (!__ratelimit(&ratelimit_state)) - return; - audit_printk(kvm, "no memslot for gfn %llx\n", gfn); - audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", - (long int)(sptep - rev_sp->spt), rev_sp->gfn); - dump_stack(); - return; - } - - rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); - if (!rmap_head->val) { - if (!__ratelimit(&ratelimit_state)) - return; - audit_printk(kvm, "no rmap for writable spte %llx\n", - *sptep); - dump_stack(); - } -} - -static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) - inspect_spte_has_rmap(vcpu->kvm, sptep); -} - -static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp = sptep_to_sp(sptep); - - if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) - audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " - "root.\n", sp); -} - -static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - int i; - - if (sp->role.level != PG_LEVEL_4K) - return; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_shadow_present_pte(sp->spt[i])) - continue; - - inspect_spte_has_rmap(kvm, sp->spt + i); - } -} - -static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - struct kvm_rmap_head *rmap_head; - u64 *sptep; - struct rmap_iterator iter; - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - - if (sp->role.direct || sp->unsync || sp->role.invalid) - return; - - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (is_writable_pte(*sptep)) - audit_printk(kvm, "shadow page has writable " - "mappings: gfn %llx role %x\n", - sp->gfn, sp->role.word); - } -} - -static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - check_mappings_rmap(kvm, sp); - audit_write_protection(kvm, sp); -} - -static void audit_all_active_sps(struct kvm *kvm) -{ - walk_all_active_sps(kvm, audit_sp); -} - -static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - audit_sptes_have_rmaps(vcpu, sptep, level); - audit_mappings(vcpu, sptep, level); - audit_spte_after_sync(vcpu, sptep, level); -} - -static void audit_vcpu_spte(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, audit_spte); -} - -static bool mmu_audit; -static DEFINE_STATIC_KEY_FALSE(mmu_audit_key); - -static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - - if (!__ratelimit(&ratelimit_state)) - return; - - vcpu->kvm->arch.audit_point = point; - audit_all_active_sps(vcpu->kvm); - audit_vcpu_spte(vcpu); -} - -static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) -{ - if (static_branch_unlikely((&mmu_audit_key))) - __kvm_mmu_audit(vcpu, point); -} - -static void mmu_audit_enable(void) -{ - if (mmu_audit) - return; - - static_branch_inc(&mmu_audit_key); - mmu_audit = true; -} - -static void 
mmu_audit_disable(void) -{ - if (!mmu_audit) - return; - - static_branch_dec(&mmu_audit_key); - mmu_audit = false; -} - -static int mmu_audit_set(const char *val, const struct kernel_param *kp) -{ - int ret; - unsigned long enable; - - ret = kstrtoul(val, 10, &enable); - if (ret < 0) - return -EINVAL; - - switch (enable) { - case 0: - mmu_audit_disable(); - break; - case 1: - mmu_audit_enable(); - break; - default: - return -EINVAL; - } - - return 0; -} - -static const struct kernel_param_ops audit_param_ops = { - .set = mmu_audit_set, - .get = param_get_bool, -}; - -arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index da6166b5c377..1bff453f7cbe 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -30,6 +30,8 @@ extern bool dbg; #define INVALID_PAE_ROOT 0 #define IS_VALID_PAE_ROOT(x) (!!(x)) +typedef u64 __rcu *tdp_ptep_t; + struct kvm_mmu_page { /* * Note, "link" through "spt" fit in a single 64 byte cache line on @@ -59,8 +61,17 @@ struct kvm_mmu_page { refcount_t tdp_mmu_root_count; }; unsigned int unsync_children; - struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - DECLARE_BITMAP(unsync_child_bitmap, 512); + union { + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ + tdp_ptep_t ptep; + }; + union { + DECLARE_BITMAP(unsync_child_bitmap, 512); + struct { + struct work_struct tdp_mmu_async_work; + void *tdp_mmu_async_data; + }; + }; struct list_head lpage_disallowed_link; #ifdef CONFIG_X86_32 diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index de5e8e4e1aa7..12247b96af01 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -416,6 +416,29 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + kvm_mmu_split_huge_page, + TP_PROTO(u64 gfn, u64 spte, int level, int errno), + TP_ARGS(gfn, spte, level, errno), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, spte) + __field(int, level) + __field(int, errno) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->spte = spte; + __entry->level = level; + __entry->errno = errno; + ), + + TP_printk("gfn %llx spte %llx level %d errno %d", + __entry->gfn, __entry->spte, __entry->level, __entry->errno) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 68eb1fb548b6..2e09d1b6249f 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -47,8 +47,8 @@ int kvm_page_track_create_memslot(struct kvm *kvm, continue; slot->arch.gfn_track[i] = - kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), - GFP_KERNEL_ACCOUNT); + __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]), + GFP_KERNEL_ACCOUNT); if (!slot->arch.gfn_track[i]) goto track_free; } @@ -75,7 +75,8 @@ int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot) if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE]) return 0; - gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), GFP_KERNEL_ACCOUNT); + gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track), + GFP_KERNEL_ACCOUNT); if (gfn_track == NULL) return -ENOMEM; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 5b5bdac97c7b..252c77805eb9 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -668,7 +668,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - if 
(WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) goto out_gpte_changed; for (shadow_walk_init(&it, vcpu, fault->addr); @@ -904,12 +904,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; - kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; r = FNAME(fetch)(vcpu, fault, &walker); - kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 73cfe62fdad1..4739b53c9734 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -192,6 +192,65 @@ out: return wrprot; } +static u64 make_spte_executable(u64 spte) +{ + bool is_access_track = is_access_track_spte(spte); + + if (is_access_track) + spte = restore_acc_track_spte(spte); + + spte &= ~shadow_nx_mask; + spte |= shadow_x_mask; + + if (is_access_track) + spte = mark_spte_for_access_track(spte); + + return spte; +} + +/* + * Construct an SPTE that maps a sub-page of the given huge page SPTE where + * `index` identifies which sub-page. + * + * This is used during huge page splitting to build the SPTEs that make up the + * new page table. + */ +u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +{ + u64 child_spte; + int child_level; + + if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) + return 0; + + if (WARN_ON_ONCE(!is_large_pte(huge_spte))) + return 0; + + child_spte = huge_spte; + child_level = huge_level - 1; + + /* + * The child_spte already has the base address of the huge page being + * split. So we just have to OR in the offset to the page at the next + * lower level for the given index. + */ + child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + + if (child_level == PG_LEVEL_4K) { + child_spte &= ~PT_PAGE_SIZE_MASK; + + /* + * When splitting to a 4K page, mark the page executable as the + * NX hugepage mitigation no longer applies. + */ + if (is_nx_huge_page_enabled()) + child_spte = make_spte_executable(child_spte); + } + + return child_spte; +} + + u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { u64 spte = SPTE_MMU_PRESENT_MASK; @@ -250,14 +309,7 @@ u64 mark_spte_for_access_track(u64 spte) if (is_access_track_spte(spte)) return spte; - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. 
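make_huge_page_split_spte() only has to OR the sub-page offset into the base address already present in the huge SPTE. The arithmetic is easy to check in isolation; below is a standalone sketch for a 2MiB-to-4KiB split, with x86's 4KiB page size and 512-entry tables hard-coded. It is a simplified model of the offset math, not the kernel helper:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT		12
#define PTES_PER_TABLE		512
/* Mirrors KVM_PAGES_PER_HPAGE(level): 4KiB pages covered at @level. */
#define PAGES_PER_HPAGE(level)	(1ull << (((level) - 1) * 9))

int main(void)
{
	uint64_t huge_base = 0x40000000ull;	/* 2MiB-aligned physical address */
	int child_level = 1;			/* PG_LEVEL_4K */

	for (int index = 0; index < PTES_PER_TABLE; index++) {
		/* Child base = huge page base ORed with the sub-page offset. */
		uint64_t child = huge_base |
			((uint64_t)index * PAGES_PER_HPAGE(child_level) << PAGE_SHIFT);

		if (index < 3 || index == PTES_PER_TABLE - 1)
			printf("index %3d -> child PA %#llx\n",
			       index, (unsigned long long)child);
	}
	return 0;
}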
-	 */
-	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
-		  !spte_can_locklessly_be_made_writable(spte),
-		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+	check_spte_writable_invariants(spte);
 
 	WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
 			  SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
@@ -368,8 +420,8 @@ void kvm_mmu_reset_all_pte_masks(void)
 	shadow_acc_track_mask	= 0;
 	shadow_me_mask		= sme_me_mask;
 
-	shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
-	shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITEABLE;
+	shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
+	shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITABLE;
 
 	/*
 	 * Set a reserved PA bit in MMIO SPTEs to generate page faults with
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index be6a007a4af3..73f12615416f 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -75,33 +75,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
 static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
 
 /*
- * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
- * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
- * that map guest pages in read-only memslots and read-only VMAs.
- *
- * Invariants:
- *  - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
- *
- *
- * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
- * allows writes to the guest page mapped by the SPTE. This bit is cleared when
- * the guest page mapped by the SPTE contains a page table that is being
- * monitored for shadow paging. In this case the SPTE can only be made writable
- * by unsyncing the shadow page under the mmu_lock.
- *
- * Invariants:
- *  - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
- *  - If MMU-writable is set, Host-writable must be set.
- *
- * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
- * to track writes for dirty logging. For such SPTEs, KVM will locklessly set
- * PT_WRITABLE_MASK upon the next write from the guest and record the write in
- * the dirty log (see fast_page_fault()).
+ * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
+ * SPTE is write-protected. See is_writable_pte() for details.
  */
 
 /* Bits 9 and 10 are ignored by all non-EPT PTEs. */
-#define DEFAULT_SPTE_HOST_WRITEABLE	BIT_ULL(9)
-#define DEFAULT_SPTE_MMU_WRITEABLE	BIT_ULL(10)
+#define DEFAULT_SPTE_HOST_WRITABLE	BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITABLE	BIT_ULL(10)
 
 /*
  * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
@@ -339,15 +319,86 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
 	       __is_rsvd_bits_set(rsvd_check, spte, level);
 }
 
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+/*
+ * A shadow-present leaf SPTE may be non-writable for 3 possible reasons:
+ *
+ *  1. To intercept writes for dirty logging. KVM write-protects huge pages
+ *     so that they can be split down into the dirty logging granularity
+ *     (4KiB) whenever the guest writes to them. KVM also write-protects
+ *     4KiB pages so that writes can be recorded in the dirty log
+ *     (e.g. if not using PML). SPTEs are write-protected for dirty logging
+ *     during the VM-ioctls that enable dirty logging.
+ *
+ *  2. To intercept writes to guest page tables that KVM is shadowing. When a
+ *     guest writes to its page table the corresponding shadow page table will
+ *     be marked "unsync".
That way KVM knows which shadow page tables need to + * be updated on the next TLB flush, INVLPG, etc. and which do not. + * + * 3. To prevent guest writes to read-only memory, such as for memory in a + * read-only memslot or guest memory backed by a read-only VMA. Writes to + * such pages are disallowed entirely. + * + * To keep track of why a given SPTE is write-protected, KVM uses 2 + * software-only bits in the SPTE: + * + * shadow_mmu_writable_mask, aka MMU-writable - + * Cleared on SPTEs that KVM is currently write-protecting for shadow paging + * purposes (case 2 above). + * + * shadow_host_writable_mask, aka Host-writable - + * Cleared on SPTEs that are not host-writable (case 3 above) + * + * Note, not all possible combinations of PT_WRITABLE_MASK, + * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given + * SPTE can be in only one of the following states, which map to the + * aforementioned 3 cases: + * + * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK + * ------------------------- | ------------------------ | ---------------- + * 1 | 1 | 1 (writable) + * 1 | 1 | 0 (case 1) + * 1 | 0 | 0 (case 2) + * 0 | 0 | 0 (case 3) + * + * The valid combinations of these bits are checked by + * check_spte_writable_invariants() whenever an SPTE is modified. + * + * Clearing the MMU-writable bit is always done under the MMU lock and always + * accompanied by a TLB flush before dropping the lock to avoid corrupting the + * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging + * (which does not clear the MMU-writable bit), does not flush TLBs before + * dropping the lock, as it only needs to synchronize guest writes with the + * dirty bitmap. + * + * So, there is the problem: clearing the MMU-writable bit can encounter a + * write-protected SPTE while CPUs still have writable mappings for that SPTE + * cached in their TLB. To address this, KVM always flushes TLBs when + * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE. + * + * The Host-writable bit is not modified on present SPTEs, it is only set or + * cleared when an SPTE is first faulted in from non-present and then remains + * immutable. + */ +static inline bool is_writable_pte(unsigned long pte) { - if (spte & shadow_mmu_writable_mask) { - WARN_ON_ONCE(!(spte & shadow_host_writable_mask)); - return true; - } + return pte & PT_WRITABLE_MASK; +} + +/* Note: spte must be a shadow-present leaf SPTE. 
*/ +static inline void check_spte_writable_invariants(u64 spte) +{ + if (spte & shadow_mmu_writable_mask) + WARN_ONCE(!(spte & shadow_host_writable_mask), + "kvm: MMU-writable SPTE is not Host-writable: %llx", + spte); + else + WARN_ONCE(is_writable_pte(spte), + "kvm: Writable SPTE is not MMU-writable: %llx", spte); +} - WARN_ON_ONCE(spte & PT_WRITABLE_MASK); - return false; +static inline bool spte_can_locklessly_be_made_writable(u64 spte) +{ + return spte & shadow_mmu_writable_mask; } static inline u64 get_mmio_spte_generation(u64 spte) @@ -364,9 +415,25 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); +u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); + +/* Restore an acc-track PTE back to a regular PTE */ +static inline u64 restore_acc_track_spte(u64 spte) +{ + u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) + & SHADOW_ACC_TRACK_SAVED_BITS_MASK; + + spte &= ~shadow_acc_track_mask; + spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); + spte |= saved_bits; + + return spte; +} + u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); void kvm_mmu_reset_all_pte_masks(void); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index caa96c270b95..6d3b3e5a5533 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); - iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); + iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } static gfn_t round_gfn_for_level(gfn_t gfn, int level) @@ -40,17 +40,19 @@ void tdp_iter_restart(struct tdp_iter *iter) * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ -void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, +void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn) { + int root_level = root->role.level; + WARN_ON(root_level < 1); WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; - iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; + iter->as_id = kvm_mmu_page_as_id(root); tdp_iter_restart(iter); } @@ -87,7 +89,7 @@ static bool try_step_down(struct tdp_iter *iter) * Reread the SPTE before stepping down to avoid traversing into page * tables that are no longer linked from this entry. 
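The state table above boils down to two implications: Writable implies MMU-writable, and MMU-writable implies Host-writable. The standalone check below walks all eight bit combinations and accepts exactly the four rows listed in the table; the bit positions are stand-ins chosen for the sketch, mirroring PT_WRITABLE_MASK and the two software masks:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define WRITABLE	(1u << 1)	/* stand-in for PT_WRITABLE_MASK */
#define HOST_WRITABLE	(1u << 9)	/* stand-in for shadow_host_writable_mask */
#define MMU_WRITABLE	(1u << 10)	/* stand-in for shadow_mmu_writable_mask */

static bool spte_writable_invariants_ok(uint64_t spte)
{
	/* MMU-writable requires Host-writable ... */
	if ((spte & MMU_WRITABLE) && !(spte & HOST_WRITABLE))
		return false;
	/* ... and a writable SPTE must be MMU-writable. */
	if ((spte & WRITABLE) && !(spte & MMU_WRITABLE))
		return false;
	return true;
}

int main(void)
{
	for (int h = 0; h <= 1; h++)
		for (int m = 0; m <= 1; m++)
			for (int w = 0; w <= 1; w++) {
				uint64_t spte = (h ? HOST_WRITABLE : 0) |
						(m ? MMU_WRITABLE : 0) |
						(w ? WRITABLE : 0);
				printf("host=%d mmu=%d writable=%d -> %s\n",
				       h, m, w,
				       spte_writable_invariants_ok(spte) ?
				       "valid" : "invalid");
			}
	return 0;
}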
*/ - iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); + iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); child_pt = spte_to_child_pt(iter->old_spte, iter->level); if (!child_pt) @@ -121,7 +123,7 @@ static bool try_step_side(struct tdp_iter *iter) iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); iter->next_last_level_gfn = iter->gfn; iter->sptep++; - iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); + iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); return true; } diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index e19cabbcb65c..b1eaf6ec0e0b 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -7,7 +7,20 @@ #include "mmu.h" -typedef u64 __rcu *tdp_ptep_t; +/* + * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs) + * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be + * batched without having to collect the list of zapped SPs. Flows that can + * remove SPs must service pending TLB flushes prior to dropping RCU protection. + */ +static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) +{ + return READ_ONCE(*rcu_dereference(sptep)); +} +static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val) +{ + WRITE_ONCE(*rcu_dereference(sptep), val); +} /* * A TDP iterator performs a pre-order walk over a TDP paging structure. @@ -57,17 +70,17 @@ struct tdp_iter { * Iterates over every SPTE mapping the GFN range [start, end) in a * preorder traversal. */ -#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \ - for (tdp_iter_start(&iter, root, root_level, min_level, start); \ +#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \ + for (tdp_iter_start(&iter, root, min_level, start); \ iter.valid && iter.gfn < end; \ tdp_iter_next(&iter)) -#define for_each_tdp_pte(iter, root, root_level, start, end) \ - for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) +#define for_each_tdp_pte(iter, root, start, end) \ + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) tdp_ptep_t spte_to_child_pt(u64 pte, int level); -void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, +void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index bc9e3553fba2..e7e7876251b3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -25,17 +25,22 @@ bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); + kvm->arch.tdp_mmu_zap_wq = + alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); return true; } -static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, +/* Arbitrarily returns true so that this may be used in if statements. 
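Returning true from the lockdep helper is what lets it ride along in the iterator macros' filter condition: the "if (...) { } else" shape swallows filtered-out entries while the caller's loop body binds to the dangling else, so the macro still behaves like a single statement. A distilled version of the idiom, with invented names and a no-op assertion:

#include <stdio.h>
#include <stdbool.h>
#include <assert.h>

static bool assert_lock_held(void)
{
	/* In KVM this is lockdep_assert_held_*(); here just a placeholder. */
	assert(1);
	return true;	/* arbitrary 'true' so the call can live in an if () */
}

/*
 * The empty if-body consumes filtered-out items; the caller's loop body
 * attaches to the dangling 'else', so the macro reads like a plain loop.
 */
#define for_each_even(v, n)				\
	for ((v) = 0; (v) < (n); (v)++)			\
		if (assert_lock_held() && ((v) & 1)) {	\
		} else

int main(void)
{
	int v;

	for_each_even(v, 8)
		printf("%d ", v);	/* prints 0 2 4 6 */
	printf("\n");
	return 0;
}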
*/ +static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, bool shared) { if (shared) lockdep_assert_held_read(&kvm->mmu_lock); else lockdep_assert_held_write(&kvm->mmu_lock); + + return true; } void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) @@ -43,20 +48,20 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) if (!kvm->arch.tdp_mmu_enabled) return; + flush_workqueue(kvm->arch.tdp_mmu_zap_wq); + destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* * Ensure that all the outstanding RCU callbacks to free shadow pages - * can run before the VM is torn down. + * can run before the VM is torn down. Work items on tdp_mmu_zap_wq + * can call kvm_tdp_mmu_put_root and create new callbacks. */ rcu_barrier(); } -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush, - bool shared); - static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { free_page((unsigned long)sp->spt); @@ -79,6 +84,56 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) tdp_mmu_free_sp(sp); } +static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared); + +static void tdp_mmu_zap_root_work(struct work_struct *work) +{ + struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, + tdp_mmu_async_work); + struct kvm *kvm = root->tdp_mmu_async_data; + + read_lock(&kvm->mmu_lock); + + /* + * A TLB flush is not necessary as KVM performs a local TLB flush when + * allocating a new root (see kvm_mmu_load()), and when migrating vCPU + * to a different pCPU. Note, the local TLB flush on reuse also + * invalidates any paging-structure-cache entries, i.e. TLB entries for + * intermediate paging structures, that may be zapped, as such entries + * are associated with the ASID on both VMX and SVM. + */ + tdp_mmu_zap_root(kvm, root, true); + + /* + * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for + * avoiding an infinite loop. By design, the root is reachable while + * it's being asynchronously zapped, thus a different task can put its + * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an + * asynchronously zapped root is unavoidable. + */ + kvm_tdp_mmu_put_root(kvm, root, true); + + read_unlock(&kvm->mmu_lock); +} + +static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + root->tdp_mmu_async_data = kvm; + INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); + queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); +} + +static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page) +{ + union kvm_mmu_page_role role = page->role; + role.invalid = true; + + /* No need to use cmpxchg, only the invalid bit can change. */ + role.word = xchg(&page->role.word, role.word); + return role.invalid; +} + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { @@ -89,25 +144,63 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, WARN_ON(!root->tdp_mmu_page); + /* + * The root now has refcount=0. It is valid, but readers already + * cannot acquire a reference to it because kvm_tdp_mmu_get_root() + * rejects it. This remains true for the rest of the execution + * of this function, because readers visit valid roots only + * (except for tdp_mmu_zap_root_work(), which however + * does not acquire any reference itself). 
+ * + * Even though there are flows that need to visit all roots for + * correctness, they all take mmu_lock for write, so they cannot yet + * run concurrently. The same is true after kvm_tdp_root_mark_invalid, + * since the root still has refcount=0. + * + * However, tdp_mmu_zap_root can yield, and writers do not expect to + * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()). + * So the root temporarily gets an extra reference, going to refcount=1 + * while staying invalid. Readers still cannot acquire any reference; + * but writers are now allowed to run if tdp_mmu_zap_root yields and + * they might take an extra reference if they themselves yield. + * Therefore, when the reference is given back by the worker, + * there is no guarantee that the refcount is still 1. If not, whoever + * puts the last reference will free the page, but they will not have to + * zap the root because a root cannot go from invalid to valid. + */ + if (!kvm_tdp_root_mark_invalid(root)) { + refcount_set(&root->tdp_mmu_root_count, 1); + + /* + * Zapping the root in a worker is not just "nice to have"; + * it is required because kvm_tdp_mmu_invalidate_all_roots() + * skips already-invalid roots. If kvm_tdp_mmu_put_root() did + * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast() + * might return with some roots not zapped yet. + */ + tdp_mmu_schedule_zap_root(kvm, root); + return; + } + spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_del_rcu(&root->link); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); - - zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); - call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } /* - * Finds the next valid root after root (or the first valid root if root - * is NULL), takes a reference on it, and returns that next root. If root - * is not NULL, this thread should have already taken a reference on it, and - * that reference will be dropped. If no valid root is found, this - * function will return NULL. + * Returns the next root after @prev_root (or the first root if @prev_root is + * NULL). A reference to the returned root is acquired, and the reference to + * @prev_root is released (the caller obviously must hold a reference to + * @prev_root if it's non-NULL). + * + * If @only_valid is true, invalid roots are skipped. + * + * Returns NULL if the end of tdp_mmu_roots was reached. */ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *prev_root, - bool shared) + bool shared, bool only_valid) { struct kvm_mmu_page *next_root; @@ -121,9 +214,14 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, typeof(*next_root), link); - while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root)) + while (next_root) { + if ((!only_valid || !next_root->role.invalid) && + kvm_tdp_mmu_get_root(next_root)) + break; + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, &next_root->link, typeof(*next_root), link); + } rcu_read_unlock(); @@ -143,71 +241,91 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * mode. In the unlikely event that this thread must free a root, the lock * will be temporarily dropped and reacquired in write mode. 
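The two-trip lifecycle described in that comment may be easier to follow as code: the first put resurrects the root to refcount 1 while marking it invalid and queues the asynchronous zap; only a later put, normally from the zap worker, actually frees it. A single-threaded simplification follows; the kernel uses refcount_t, an xchg on the role word, and a real workqueue:

#include <stdio.h>
#include <stdbool.h>

struct root {
	int refcount;
	bool invalid;
};

static void put_root(struct root *r)
{
	if (--r->refcount > 0)
		return;

	if (!r->invalid) {
		/*
		 * First time the refcount hits 0: resurrect to 1, mark the
		 * root invalid, and hand it to the async zap worker, which
		 * performs the final put once the zap completes.
		 */
		r->refcount = 1;
		r->invalid = true;
		printf("queued async zap (refcount back to 1, invalid)\n");
		return;
	}

	printf("last reference gone: unlink root and free it via RCU\n");
}

int main(void)
{
	struct root r = { .refcount = 1, .invalid = false };

	put_root(&r);	/* vCPU drops its reference: zap is queued */
	put_root(&r);	/* zap worker drops the resurrection reference */
	return 0;
}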
*/ -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \ - _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _shared)) \ - if (kvm_mmu_page_as_id(_root) != _as_id) { \ - } else - -#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ - list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \ - lockdep_is_held_type(&kvm->mmu_lock, 0) || \ - lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \ - if (kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ + if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ + kvm_mmu_page_as_id(_root) != _as_id) { \ } else -static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, - int level) -{ - union kvm_mmu_page_role role; +#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) - role = vcpu->arch.mmu->mmu_role.base; - role.level = level; - role.direct = true; - role.has_4_byte_gpte = false; - role.access = ACC_ALL; - role.ad_disabled = !shadow_accessed_mask; +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) - return role; -} +/* + * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, + * the implication being that any flow that holds mmu_lock for read is + * inherently yield-friendly and should use the yield-safe variant above. + * Holding mmu_lock for write obviates the need for RCU protection as the list + * is guaranteed to be stable. + */ +#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ + kvm_mmu_page_as_id(_root) != _as_id) { \ + } else -static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, - int level) +static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + + return sp; +} + +static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, + gfn_t gfn, union kvm_mmu_page_role role) +{ set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - sp->role.word = page_role_for_level(vcpu, level).word; + sp->role = role; sp->gfn = gfn; + sp->ptep = sptep; sp->tdp_mmu_page = true; trace_kvm_mmu_get_page(sp, true); +} - return sp; +static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, + struct tdp_iter *iter) +{ + struct kvm_mmu_page *parent_sp; + union kvm_mmu_page_role role; + + parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); + + role = parent_sp->role; + role.level--; + + tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); } hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { - union kvm_mmu_page_role role; + union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); - role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); - - /* Check for an existing root before allocating a new one. 
*/ + /* + * Check for an existing root before allocating a new one. Note, the + * role check prevents consuming an invalid root. + */ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { if (root->role.word == role.word && - kvm_tdp_mmu_get_root(kvm, root)) + kvm_tdp_mmu_get_root(root)) goto out; } - root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); + root = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_sp(root, NULL, 0, role); + refcount_set(&root->tdp_mmu_root_count, 1); spin_lock(&kvm->arch.tdp_mmu_pages_lock); @@ -252,25 +370,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, } /** - * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU - * - * @kvm: kvm instance - * @sp: the new page - * @account_nx: This page replaces a NX large page and should be marked for - * eventual reclaim. - */ -static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool account_nx) -{ - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - list_add(&sp->link, &kvm->arch.tdp_mmu_pages); - if (account_nx) - account_huge_nx_page(kvm, sp); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); -} - -/** - * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU + * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages * * @kvm: kvm instance * @sp: the page to be removed @@ -278,8 +378,8 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, * the MMU lock and the operation must synchronize with other * threads that might be adding or removing pages. */ -static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared) +static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared) { if (shared) spin_lock(&kvm->arch.tdp_mmu_pages_lock); @@ -295,7 +395,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, } /** - * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure + * handle_removed_pt() - handle a page table removed from the TDP structure * * @kvm: kvm instance * @pt: the page removed from the paging structure @@ -311,8 +411,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, * this thread will be responsible for ensuring the page is freed. Hence the * early rcu_dereferences in the function. 
*/ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, - bool shared) +static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) { struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; @@ -321,7 +420,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, trace_kvm_mmu_prepare_zap_page(sp); - tdp_mmu_unlink_page(kvm, sp, shared); + tdp_mmu_unlink_sp(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { u64 *sptep = rcu_dereference(pt) + i; @@ -372,9 +471,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, shared); } - kvm_flush_remote_tlbs_with_address(kvm, base_gfn, - KVM_PAGES_PER_HPAGE(level + 1)); - call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -435,6 +531,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + if (is_leaf) + check_spte_writable_invariants(new_spte); + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -469,11 +568,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, /* * Recursively handle child PTs if the change removed a subtree from - * the paging structure. + * the paging structure. Note the WARN on the PFN changing without the + * SPTE being converted to a hugepage (leaf) or being zapped. Shadow + * pages are kernel allocations and should never be migrated. */ - if (was_present && !was_leaf && (pfn_changed || !is_present)) - handle_removed_tdp_mmu_page(kvm, - spte_to_child_pt(old_spte, level), shared); + if (was_present && !was_leaf && + (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) + handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, @@ -492,53 +593,72 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * and handle the associated bookkeeping. Do not mark the page dirty * in KVM's dirty bitmaps. * + * If setting the SPTE fails because it has changed, iter->old_spte will be + * refreshed to the current value of the spte. + * * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @new_spte: The value the SPTE should be set to - * Returns: true if the SPTE was set, false if it was not. If false is returned, - * this function will have no side-effects. + * Return: + * * 0 - If the SPTE was set. + * * -EBUSY - If the SPTE cannot be set. In this case this function will have + * no side-effects other than setting iter->old_spte to the last + * known value of the spte. */ -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { - WARN_ON_ONCE(iter->yielded); - - lockdep_assert_held_read(&kvm->mmu_lock); + u64 *sptep = rcu_dereference(iter->sptep); + u64 old_spte; /* - * Do not change removed SPTEs. Only the thread that froze the SPTE - * may modify it. + * The caller is responsible for ensuring the old SPTE is not a REMOVED + * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, + * and pre-checking before inserting a new SPTE is advantageous as it + * avoids unnecessary work. 
*/ - if (is_removed_spte(iter->old_spte)) - return false; + WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); + + lockdep_assert_held_read(&kvm->mmu_lock); /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock. */ - if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, - new_spte) != iter->old_spte) - return false; + old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); + if (old_spte != iter->old_spte) { + /* + * The page table entry was modified by a different logical + * CPU. Refresh iter->old_spte with the current value so the + * caller operates on fresh data, e.g. if it retries + * tdp_mmu_set_spte_atomic(). + */ + iter->old_spte = old_spte; + return -EBUSY; + } __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); - return true; + return 0; } -static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) +static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) { + int ret; + /* * Freeze the SPTE by setting it to a special, * non-present value. This will stop other threads from * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) - return false; + ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); + if (ret) + return ret; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, KVM_PAGES_PER_HPAGE(iter->level)); @@ -551,17 +671,21 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * here since the SPTE is going from non-present * to non-present. */ - WRITE_ONCE(*rcu_dereference(iter->sptep), 0); + kvm_tdp_mmu_write_spte(iter->sptep, 0); - return true; + return 0; } /* * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping - * @kvm: kvm instance - * @iter: a tdp_iter instance currently on the SPTE that should be set - * @new_spte: The value the SPTE should be set to + * @kvm: KVM instance + * @as_id: Address space ID, i.e. regular vs. SMM + * @sptep: Pointer to the SPTE + * @old_spte: The current value of the SPTE + * @new_spte: The new value that will be set for the SPTE + * @gfn: The base GFN that was (or will be) mapped by the SPTE + * @level: The level _containing_ the SPTE (its parent PT's level) * @record_acc_track: Notify the MM subsystem of changes to the accessed state * of the page. Should be set unless handling an MMU * notifier for access tracking. Leaving record_acc_track @@ -573,58 +697,65 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * Leaving record_dirty_log unset in that case prevents page * writes from being double counted. */ -static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte, bool record_acc_track, - bool record_dirty_log) +static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level, + bool record_acc_track, bool record_dirty_log) { - WARN_ON_ONCE(iter->yielded); - lockdep_assert_held_write(&kvm->mmu_lock); /* - * No thread should be using this function to set SPTEs to the + * No thread should be using this function to set SPTEs to or from the * temporary removed SPTE value. * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic * should be used. If operating under the MMU lock in write mode, the * use of the removed SPTE should not be necessary. 
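The new 0/-EBUSY contract is easiest to see in isolation: on a lost race the helper refreshes the caller's snapshot so a retry loop always operates on fresh data. A stripped-down userspace rendering (hypothetical names, a C11 compare-exchange standing in for cmpxchg64):

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>

static int set_spte_atomic(_Atomic uint64_t *sptep, uint64_t *old_spte,
			   uint64_t new_spte)
{
	uint64_t expected = *old_spte;

	if (!atomic_compare_exchange_strong(sptep, &expected, new_spte)) {
		*old_spte = expected;	/* refresh snapshot for the retry */
		return -EBUSY;
	}
	return 0;
}

A caller retries simply by looping with the refreshed *old_spte, which is exactly what the converted "goto retry" call sites later in this patch do.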
*/ - WARN_ON(is_removed_spte(iter->old_spte)); + WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); - WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); + kvm_tdp_mmu_write_spte(sptep, new_spte); + + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); - __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, false); if (record_acc_track) - handle_changed_spte_acc_track(iter->old_spte, new_spte, - iter->level); + handle_changed_spte_acc_track(old_spte, new_spte, level); if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, - iter->old_spte, new_spte, - iter->level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); +} + +static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte, bool record_acc_track, + bool record_dirty_log) +{ + WARN_ON_ONCE(iter->yielded); + + __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte, + new_spte, iter->gfn, iter->level, + record_acc_track, record_dirty_log); } static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); + _tdp_mmu_set_spte(kvm, iter, new_spte, true, true); } static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); + _tdp_mmu_set_spte(kvm, iter, new_spte, false, true); } static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); + _tdp_mmu_set_spte(kvm, iter, new_spte, true, false); } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ - for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) + for_each_tdp_pte(_iter, _root, _start, _end) #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ tdp_root_for_each_pte(_iter, _root, _start, _end) \ @@ -634,8 +765,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, else #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ - for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ - _mmu->shadow_root_level, _start, _end) + for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end) /* * Yield if the MMU lock is contended or this thread needs to return control @@ -662,11 +792,11 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, return false; if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - rcu_read_unlock(); - if (flush) kvm_flush_remote_tlbs(kvm); + rcu_read_unlock(); + if (shared) cond_resched_rwlock_read(&kvm->mmu_lock); else @@ -682,6 +812,99 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, return iter->yielded; } +static inline gfn_t tdp_mmu_max_gfn_host(void) +{ + /* + * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that + * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF, + * and so KVM will never install a SPTE for such addresses. 
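The bound itself is simple arithmetic: the largest host physical address, expressed in page frames. With 4 KiB pages and, say, a 46-bit host MAXPHYADDR (an assumed value, purely for illustration), the walk stops at 64 TiB of guest-physical space:

#include <stdio.h>

int main(void)
{
	unsigned int shadow_phys_bits = 46;	/* assumed host MAXPHYADDR */
	unsigned int page_shift = 12;		/* 4 KiB pages */
	unsigned long long max_gfn = 1ULL << (shadow_phys_bits - page_shift);

	printf("max gfn = %llu (%llu TiB of GPA)\n",
	       max_gfn, (max_gfn << page_shift) >> 40);
	return 0;
}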
+ */ + return 1ULL << (shadow_phys_bits - PAGE_SHIFT); +} + +static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared, int zap_level) +{ + struct tdp_iter iter; + + gfn_t end = tdp_mmu_max_gfn_host(); + gfn_t start = 0; + + for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) + continue; + + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + if (iter.level > zap_level) + continue; + + if (!shared) + tdp_mmu_set_spte(kvm, &iter, 0); + else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) + goto retry; + } +} + +static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared) +{ + + /* + * The root must have an elevated refcount so that it's reachable via + * mmu_notifier callbacks, which allows this path to yield and drop + * mmu_lock. When handling an unmap/release mmu_notifier command, KVM + * must drop all references to relevant pages prior to completing the + * callback. Dropping mmu_lock with an unreachable root would result + * in zapping SPTEs after a relevant mmu_notifier callback completes + * and lead to use-after-free as zapping a SPTE triggers "writeback" of + * dirty accessed bits to the SPTE's associated struct page. + */ + WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); + + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + + rcu_read_lock(); + + /* + * To avoid RCU stalls due to recursively removing huge swaths of SPs, + * split the zap into two passes. On the first pass, zap at the 1gb + * level, and then zap top-level SPs on the second pass. "1gb" is not + * arbitrary, as KVM must be able to zap a 1gb shadow page without + * inducing a stall to allow in-place replacement with a 1gb hugepage. + * + * Because zapping a SP recurses on its children, stepping down to + * PG_LEVEL_4K in the iterator itself is unnecessary. + */ + __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); + __tdp_mmu_zap_root(kvm, root, shared, root->role.level); + + rcu_read_unlock(); +} + +bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + u64 old_spte; + + /* + * This helper intentionally doesn't allow zapping a root shadow page, + * which doesn't have a parent page table and thus no associated entry. + */ + if (WARN_ON_ONCE(!sp->ptep)) + return false; + + old_spte = kvm_tdp_mmu_read_spte(sp->ptep); + if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) + return false; + + __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, + sp->gfn, sp->role.level + 1, true, true); + + return true; +} + /* * Tears down the mappings for the range of gfns, [start, end), and frees the * non-root pages mapping GFNs strictly within that range. Returns true if @@ -693,18 +916,11 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the * operation can cause a soft lockup. - * - * If shared is true, this thread holds the MMU lock in read mode and must - * account for the possibility that other threads are modifying the paging - * structures concurrently. If shared is false, this thread should hold the - * MMU lock in write mode. 
*/ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush, - bool shared) + gfn_t start, gfn_t end, bool can_yield, bool flush) { - gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - bool zap_all = (start == 0 && end >= max_gfn_host); + bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host()); struct tdp_iter iter; /* @@ -713,22 +929,15 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, */ int min_level = zap_all ? root->role.level : PG_LEVEL_4K; - /* - * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will - * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, - * and so KVM will never install a SPTE for such addresses. - */ - end = min(end, max_gfn_host); + end = min(end, tdp_mmu_max_gfn_host()); - kvm_lockdep_assert_mmu_lock_held(kvm, shared); + lockdep_assert_held_write(&kvm->mmu_lock); rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, start, end) { -retry: + for_each_tdp_pte_min_level(iter, root, min_level, start, end) { if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { + tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { flush = false; continue; } @@ -747,21 +956,21 @@ retry: !is_last_spte(iter.old_spte, iter.level)) continue; - if (!shared) { - tdp_mmu_set_spte(kvm, &iter, 0); - flush = true; - } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); - goto retry; - } + tdp_mmu_set_spte(kvm, &iter, 0); + flush = true; } + /* + * Need to flush before releasing RCU. TODO: do it only if intermediate + * page tables were zapped; there is no need to flush under RCU protection + * if no 'struct kvm_mmu_page' is freed. + */ + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, start, end - start); + rcu_read_unlock(); - return flush; + + return false; } /* @@ -775,107 +984,57 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false) - flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, - false); + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); return flush; } void kvm_tdp_mmu_zap_all(struct kvm *kvm) { - bool flush = false; + struct kvm_mmu_page *root; int i; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush); - - if (flush) - kvm_flush_remote_tlbs(kvm); -} - -static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, - struct kvm_mmu_page *prev_root) -{ - struct kvm_mmu_page *next_root; - - if (prev_root) - next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, - &prev_root->link, - typeof(*prev_root), link); - else - next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, - typeof(*next_root), link); - - while (next_root && !(next_root->role.invalid && - refcount_read(&next_root->tdp_mmu_root_count))) - next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, - &next_root->link, - typeof(*next_root), link); - - return next_root; + /* + * Zap all roots, including invalid roots, as all SPTEs must be dropped + * before returning to the caller. Zap directly even if the root is + * also being zapped by a worker. 
Walking zapped top-level SPTEs isn't + * all that expensive and mmu_lock is already held, which means the + * worker has yielded, i.e. flushing the work instead of zapping here + * isn't guaranteed to be any faster. + * + * A TLB flush is unnecessary, KVM zaps everything if and only if the VM + * is being destroyed or the userspace VMM has exited. In both cases, + * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. + */ + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for_each_tdp_mmu_root_yield_safe(kvm, root, i) + tdp_mmu_zap_root(kvm, root, false); + } } /* - * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each - * invalidated root, they will not be freed until this function drops the - * reference. Before dropping that reference, tear down the paging - * structure so that whichever thread does drop the last reference - * only has to do a trivial amount of work. Since the roots are invalid, - * no new SPTEs should be created under them. + * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast + * zap" completes. */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { - struct kvm_mmu_page *next_root; - struct kvm_mmu_page *root; - bool flush = false; - - lockdep_assert_held_read(&kvm->mmu_lock); - - rcu_read_lock(); - - root = next_invalidated_root(kvm, NULL); - - while (root) { - next_root = next_invalidated_root(kvm, root); - - rcu_read_unlock(); - - flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); - - /* - * Put the reference acquired in - * kvm_tdp_mmu_invalidate_roots - */ - kvm_tdp_mmu_put_root(kvm, root, true); - - root = next_root; - - rcu_read_lock(); - } - - rcu_read_unlock(); - - if (flush) - kvm_flush_remote_tlbs(kvm); + flush_workqueue(kvm->arch.tdp_mmu_zap_wq); } /* - * Mark each TDP MMU root as invalid so that other threads - * will drop their references and allow the root count to - * go to 0. + * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that + * is about to be zapped, e.g. in response to a memslots update. The actual + * zapping is performed asynchronously, so a reference is taken on all roots. + * Using a separate workqueue makes it easy to ensure that the destruction is + * performed before the "fast zap" completes, without keeping a separate list + * of invalidated roots; the list is effectively the list of work items in + * the workqueue. * - * Also take a reference on all roots so that this thread - * can do the bulk of the work required to free the roots - * once they are invalidated. Without this reference, a - * vCPU thread might drop the last reference to a root and - * get stuck with tearing down the entire paging structure. - * - * Roots which have a zero refcount should be skipped as - * they're already being torn down. - * Already invalid roots should be referenced again so that - * they aren't freed before kvm_tdp_mmu_zap_all_fast is - * done with them. + * Get a reference even if the root is already invalid; the asynchronous worker + * assumes it was gifted a reference to the root it processes. Because mmu_lock + * is held for write, it should be impossible to observe a root with zero refcount, + * i.e. the list of roots cannot be stale. * * This has essentially the same effect for the TDP MMU * as updating mmu_valid_gen does for the shadow MMU.
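The "gifted reference" convention deserves emphasis: the thread that schedules the zap takes a reference on behalf of the worker, and only the worker drops it, so a queued root can never be freed out from under the workqueue. A compact userspace model of the handoff (C11 threads standing in for the kernel workqueue; all names hypothetical):

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

struct root { atomic_int refcount; atomic_bool invalid; };

static void put_root(struct root *r, const char *who)
{
	if (atomic_fetch_sub(&r->refcount, 1) == 1)
		printf("%s dropped the last reference, root freed\n", who);
}

static int zap_worker(void *arg)
{
	/* ... tear down the paging structure here ... */
	put_root(arg, "worker");	/* drop the gifted reference */
	return 0;
}

int main(void)
{
	struct root r = { .refcount = 1, .invalid = false };
	thrd_t t;

	atomic_store(&r.invalid, true);		/* no new users can appear */
	atomic_fetch_add(&r.refcount, 1);	/* reference gifted to worker */
	thrd_create(&t, zap_worker, &r);

	put_root(&r, "caller");			/* drop our own reference */
	thrd_join(t, NULL);
	return 0;
}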
@@ -885,9 +1044,13 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); - list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) - if (refcount_inc_not_zero(&root->tdp_mmu_root_count)) + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { + if (!root->role.invalid && + !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { root->role.invalid = true; + tdp_mmu_schedule_zap_root(kvm, root); + } + } } /* @@ -913,8 +1076,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; + else if (is_shadow_present_pte(iter->old_spte) && + !is_last_spte(iter->old_spte, iter->level)) + kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + KVM_PAGES_PER_HPAGE(iter->level + 1)); /* * If the page fault was caused by a write but the page is write @@ -947,6 +1114,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, } /* + * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the + * provided page table. + * + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @sp: The new TDP page table to install. + * @account_nx: True if this page table is being installed to split a + * non-executable huge page. + * @shared: This operation is running under the MMU lock in read mode. + * + * Returns: 0 if the new page table was installed. Non-0 if the page table + * could not be installed (e.g. the atomic compare-exchange failed). + */ +static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool account_nx, + bool shared) +{ + u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask); + int ret = 0; + + if (shared) { + ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); + if (ret) + return ret; + } else { + tdp_mmu_set_spte(kvm, iter, spte); + } + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_add(&sp->link, &kvm->arch.tdp_mmu_pages); + if (account_nx) + account_huge_nx_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + + return 0; +} + +/* * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ @@ -955,8 +1160,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm_mmu *mmu = vcpu->arch.mmu; struct tdp_iter iter; struct kvm_mmu_page *sp; - u64 *child_pt; - u64 new_spte; int ret; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -979,7 +1182,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) */ if (is_shadow_present_pte(iter.old_spte) && is_large_pte(iter.old_spte)) { - if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) + if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) break; /* @@ -987,10 +1190,13 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * because the new value informs the !present * path below. 
*/ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); } if (!is_shadow_present_pte(iter.old_spte)) { + bool account_nx = fault->huge_page_disallowed && + fault->req_level >= iter.level; + /* * If SPTE has been frozen by another thread, just * give up and retry, avoiding unnecessary page table @@ -999,26 +1205,21 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (is_removed_spte(iter.old_spte)) break; - sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); - child_pt = sp->spt; + sp = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_child_sp(sp, &iter); - new_spte = make_nonleaf_spte(child_pt, - !shadow_accessed_mask); - - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) { - tdp_mmu_link_page(vcpu->kvm, sp, - fault->huge_page_disallowed && - fault->req_level >= iter.level); - - trace_kvm_mmu_get_page(sp, true); - } else { + if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { tdp_mmu_free_sp(sp); break; } } } - if (iter.level != fault->goal_level) { + /* + * Force the guest to retry the access if the upper level SPTEs aren't + * in place, or if the target leaf SPTE is frozen by another CPU. + */ + if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) { rcu_read_unlock(); return RET_PF_RETRY; } @@ -1032,13 +1233,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { - struct kvm_mmu_page *root; - - for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) - flush = zap_gfn_range(kvm, root, range->start, range->end, - range->may_block, flush, false); - - return flush; + return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start, + range->end, range->may_block, flush); } typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, @@ -1052,18 +1248,18 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, struct tdp_iter iter; bool ret = false; - rcu_read_lock(); - /* * Don't support rescheduling, none of the MMU notifiers that funnel * into this helper allow blocking; it'd be dead, wasteful code. */ for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) ret |= handler(kvm, &iter, range); - } - rcu_read_unlock(); + rcu_read_unlock(); + } return ret; } @@ -1155,13 +1351,12 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, */ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); - - /* FIXME: return 'flush' instead of flushing here. */ - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, range->start, 1); - - return false; + /* + * No need to handle the remote TLB flush under RCU protection, the + * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a + * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). 
+ */ + return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); } /* @@ -1180,8 +1375,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, start, end) { + for_each_tdp_pte_min_level(iter, root, min_level, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; @@ -1193,14 +1387,9 @@ retry: new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) goto retry; - } + spte_set = true; } @@ -1221,13 +1410,197 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); return spte_set; } +static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) +{ + struct kvm_mmu_page *sp; + + gfp |= __GFP_ZERO; + + sp = kmem_cache_alloc(mmu_page_header_cache, gfp); + if (!sp) + return NULL; + + sp->spt = (void *)__get_free_page(gfp); + if (!sp->spt) { + kmem_cache_free(mmu_page_header_cache, sp); + return NULL; + } + + return sp; +} + +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, + struct tdp_iter *iter, + bool shared) +{ + struct kvm_mmu_page *sp; + + /* + * Since we are allocating while under the MMU lock we have to be + * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct + * reclaim and to avoid making any filesystem callbacks (which can end + * up invoking KVM MMU notifiers, resulting in a deadlock). + * + * If this allocation fails we drop the lock and retry with reclaim + * allowed. + */ + sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); + if (sp) + return sp; + + rcu_read_unlock(); + + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); + + iter->yielded = true; + sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); + + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); + + rcu_read_lock(); + + return sp; +} + +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool shared) +{ + const u64 huge_spte = iter->old_spte; + const int level = iter->level; + int ret, i; + + tdp_mmu_init_child_sp(sp, iter); + + /* + * No need for atomics when writing to sp->spt since the page table has + * not been linked in yet and thus is not reachable from any other CPU. + */ + for (i = 0; i < PT64_ENT_PER_PAGE; i++) + sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + + /* + * Replace the huge spte with a pointer to the populated lower level + * page table. Since we are making this change without a TLB flush vCPUs + * will see a mix of the split mappings and the original huge mapping, + * depending on what's currently in their TLB. This is fine from a + * correctness standpoint since the translation will be the same either + * way. + */ + ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); + if (ret) + goto out; + + /* + * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we + * are overwriting from the page stats. 
But we have to manually update + * the page stats with the new present child pages. + */ + kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + +out: + trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); + return ret; +} + +static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, + struct kvm_mmu_page *root, + gfn_t start, gfn_t end, + int target_level, bool shared) +{ + struct kvm_mmu_page *sp = NULL; + struct tdp_iter iter; + int ret = 0; + + rcu_read_lock(); + + /* + * Traverse the page table splitting all huge pages above the target + * level into one lower level. For example, if we encounter a 1GB page + * we split it into 512 2MB pages. + * + * Since the TDP iterator uses a pre-order traversal, we are guaranteed + * to visit an SPTE before ever visiting its children, which means we + * will correctly recursively split huge pages that are more than one + * level above the target level (e.g. splitting a 1GB to 512 2MB pages, + * and then splitting each of those to 512 4KB pages). + */ + for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) + continue; + + if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) + continue; + + if (!sp) { + sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); + if (!sp) { + ret = -ENOMEM; + trace_kvm_mmu_split_huge_page(iter.gfn, + iter.old_spte, + iter.level, ret); + break; + } + + if (iter.yielded) + continue; + } + + if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) + goto retry; + + sp = NULL; + } + + rcu_read_unlock(); + + /* + * It's possible to exit the loop having never used the last sp if, for + * example, a vCPU doing HugePage NX splitting wins the race and + * installs its own sp in place of the last sp we tried to split. + */ + if (sp) + tdp_mmu_free_sp(sp); + + return ret; +} + + +/* + * Try to split all huge pages mapped by the TDP MMU down to the target level. + */ +void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level, bool shared) +{ + struct kvm_mmu_page *root; + int r = 0; + + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { + r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); + if (r) { + kvm_tdp_mmu_put_root(kvm, root, shared); + break; + } + } +} + /* * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. @@ -1249,6 +1622,9 @@ retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; + if (!is_shadow_present_pte(iter.old_spte)) + continue; + if (spte_ad_need_write_protect(iter.old_spte)) { if (is_writable_pte(iter.old_spte)) new_spte = iter.old_spte & ~PT_WRITABLE_MASK; @@ -1261,14 +1637,9 @@ retry: continue; } - if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. 
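The allocation scheme in tdp_mmu_alloc_sp_for_split() above is a general pattern worth calling out: opportunistically allocate with a no-reclaim flag while the lock is held, and only on failure drop the lock, allocate with blocking allowed, and flag the iterator as stale. A userspace sketch of the control flow (pthread locks standing in for mmu_lock; the GFP_NOWAIT attempt is stubbed out, names are hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

/* Called with mmu_lock held; always returns with it re-taken. */
static void *alloc_for_split(pthread_rwlock_t *mmu_lock, bool shared,
			     bool *restart_walk)
{
	void *page = NULL;	/* a GFP_NOWAIT attempt would go here */

	if (page)
		return page;

	/* Failed: drop the lock so the allocation may block and reclaim. */
	pthread_rwlock_unlock(mmu_lock);

	page = malloc(4096);	/* blocking, reclaim-friendly allocation */

	if (shared)
		pthread_rwlock_rdlock(mmu_lock);
	else
		pthread_rwlock_wrlock(mmu_lock);

	*restart_walk = true;	/* iterator state went stale while unlocked */
	return page;
}

Setting iter->yielded in the kernel code plays the role of *restart_walk here: it forces the TDP iterator to restart from a safe point after the lock was dropped.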
- */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) goto retry; - } + spte_set = true; } @@ -1291,7 +1662,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); @@ -1392,14 +1763,8 @@ retry: continue; /* Note, a successful atomic zap also does a remote TLB flush. */ - if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_zap_spte_atomic(kvm, &iter)) goto retry; - } } rcu_read_unlock(); @@ -1416,7 +1781,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) zap_collapsible_spte_range(kvm, root, slot); } @@ -1436,8 +1801,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3899004a5d91..5e5ef2576c81 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,12 +7,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); -__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, - struct kvm_mmu_page *root) +__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { - if (root->role.invalid) - return false; - return refcount_inc_not_zero(&root->tdp_mmu_root_count); } @@ -26,24 +22,8 @@ static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, { return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); } -static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1); - - /* - * Don't allow yielding, as the caller may have a flush pending. Note, - * if mmu_lock is held for write, zapping will never yield in this case, - * but explicitly disallow it for safety. The TDP MMU does not yield - * until it has made forward progress (steps sideways), and when zapping - * a single shadow page that it's guaranteed to see (thus the mmu_lock - * requirement), its "step sideways" will always step beyond the bounds - * of the shadow page's gfn range and stop iterating before yielding. 
- */ - lockdep_assert_held_write(&kvm->mmu_lock); - return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), - sp->gfn, end, false, false); -} +bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); @@ -71,6 +51,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level, bool shared); + static inline void kvm_tdp_mmu_walk_lockless_begin(void) { rcu_read_lock(); @@ -94,7 +79,7 @@ static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { struct kvm_mmu_page *sp; - hpa_t hpa = mmu->root_hpa; + hpa_t hpa = mmu->root.hpa; if (WARN_ON(!VALID_PAGE(hpa))) return false; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index fb3e20791338..b37b353ec086 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -324,18 +324,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) switch (id) { case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* - * AVIC hardware handles the generation of - * IPIs when the specified Message Type is Fixed - * (also known as fixed delivery mode) and - * the Trigger Mode is edge-triggered. The hardware - * also supports self and broadcast delivery modes - * specified via the Destination Shorthand(DSH) - * field of the ICRL. Logical and physical APIC ID - * formats are supported. All other IPI types cause - * a #VMEXIT, which needs to emulated. + * Emulate IPIs that are not handled by AVIC hardware, which + * only virtualizes Fixed, Edge-Triggered INTRs. The exit is + * a trap, e.g. ICR holds the correct value and RIP has been + * advanced, KVM is responsible only for emulating the IPI. + * Sadly, hardware may sometimes leave the BUSY flag set, in + * which case KVM needs to emulate the ICR write as well in + * order to clear the BUSY flag. 
*/ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); + if (icrl & APIC_ICR_BUSY) + kvm_apic_write_nodecode(vcpu, APIC_ICR); + else + kvm_apic_send_ipi(apic, icrl, icrh); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: /* @@ -477,30 +477,28 @@ static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) svm->dfr_reg = dfr; } -static int avic_unaccel_trap_write(struct vcpu_svm *svm) +static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = svm->vcpu.arch.apic; - u32 offset = svm->vmcb->control.exit_info_1 & + u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK; switch (offset) { case APIC_ID: - if (avic_handle_apic_id_update(&svm->vcpu)) + if (avic_handle_apic_id_update(vcpu)) return 0; break; case APIC_LDR: - if (avic_handle_ldr_update(&svm->vcpu)) + if (avic_handle_ldr_update(vcpu)) return 0; break; case APIC_DFR: - avic_handle_dfr_update(&svm->vcpu); + avic_handle_dfr_update(vcpu); break; default: break; } - kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); - + kvm_apic_write_nodecode(vcpu, offset); return 1; } @@ -550,7 +548,7 @@ int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu) if (trap) { /* Handling Trap */ WARN_ONCE(!write, "svm: Handling trap read.\n"); - ret = avic_unaccel_trap_write(svm); + ret = avic_unaccel_trap_write(vcpu); } else { /* Handling Fault */ ret = kvm_emulate_instruction(vcpu, 0); @@ -578,7 +576,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -void avic_post_state_restore(struct kvm_vcpu *vcpu) +void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) return; @@ -586,20 +584,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) -{ - return; -} - -void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ -} - -void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) -{ -} - -static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; unsigned long flags; @@ -631,48 +616,6 @@ out: return ret; } -void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb01.ptr; - bool activated = kvm_vcpu_apicv_active(vcpu); - - if (!enable_apicv) - return; - - if (activated) { - /** - * During AVIC temporary deactivation, guest could update - * APIC ID, DFR and LDR registers, which would not be trapped - * by avic_unaccelerated_access_interception(). In this case, - * we need to check and update the AVIC logical APIC ID table - * accordingly before re-activating. 
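Since trap-style exits arrive with the register write already architecturally complete, avic_unaccel_trap_write() only has to sync AVIC side tables for the few registers that need it and then replay the write through the common no-decode path. A mock of the dispatch (standard xAPIC register offsets; the handler bodies are placeholders):

#include <stdio.h>

#define APIC_ID		0x20
#define APIC_LDR	0xD0
#define APIC_DFR	0xE0

static void handle_trap_write(unsigned int offset)
{
	switch (offset) {
	case APIC_ID:	puts("sync physical APIC ID table");	break;
	case APIC_LDR:	puts("sync logical APIC ID table");	break;
	case APIC_DFR:	puts("sync destination format");	break;
	default:	break;	/* nothing AVIC-specific to update */
	}
	puts("replay the write");	/* kvm_apic_write_nodecode() */
}

int main(void)
{
	handle_trap_write(APIC_LDR);
	return 0;
}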
- */ - avic_post_state_restore(vcpu); - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - } else { - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; - } - vmcb_mark_dirty(vmcb, VMCB_AVIC); - - if (activated) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); - - svm_set_pi_irte_mode(vcpu, activated); -} - -void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -{ - return; -} - -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return false; -} - static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; @@ -770,7 +713,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, } /* - * svm_update_pi_irte - set IRTE for Posted-Interrupts + * avic_pi_update_irte - set IRTE for Posted-Interrupts * * @kvm: kvm * @host_irq: host irq of the interrupt @@ -778,8 +721,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, * @set: set or unset PI * returns 0 on success, < 0 on failure */ -int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; @@ -879,7 +822,7 @@ out: return ret; } -bool svm_check_apicv_inhibit_reasons(ulong bit) +bool avic_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | @@ -924,20 +867,15 @@ out: return ret; } -void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; - /* ID = 0xff (broadcast), ID > 0xff (reserved) */ int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); lockdep_assert_preemption_disabled(); - /* - * Since the host physical APIC id is 8 bits, - * we can support host APIC ID upto 255. - */ - if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) + if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; /* @@ -961,7 +899,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); } -void avic_vcpu_put(struct kvm_vcpu *vcpu) +void __avic_vcpu_put(struct kvm_vcpu *vcpu) { u64 entry; struct vcpu_svm *svm = to_svm(vcpu); @@ -980,13 +918,63 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } +static void avic_vcpu_load(struct kvm_vcpu *vcpu) +{ + int cpu = get_cpu(); + + WARN_ON(cpu != vcpu->cpu); + + __avic_vcpu_load(vcpu, cpu); + + put_cpu(); +} + +static void avic_vcpu_put(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + + __avic_vcpu_put(vcpu); + + preempt_enable(); +} + +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb01.ptr; + bool activated = kvm_vcpu_apicv_active(vcpu); + + if (!enable_apicv) + return; + + if (activated) { + /** + * During AVIC temporary deactivation, guest could update + * APIC ID, DFR and LDR registers, which would not be trapped + * by avic_unaccelerated_access_interception(). In this case, + * we need to check and update the AVIC logical APIC ID table + * accordingly before re-activating. 
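The reworked WARN in __avic_vcpu_load() replaces a range check with a mask check; for a contiguous low mask such as an 8-bit host physical APIC ID field the two are equivalent, which a quick exhaustive check confirms (0xFF is assumed here for the mask):

#include <assert.h>

int main(void)
{
	const unsigned long mask = 0xFF;	/* assumed 8-bit ID field */

	for (unsigned long id = 0; id < 4096; id++)
		assert((id > mask) == ((id & ~mask) != 0));
	return 0;
}

The mask form keys on the field width rather than a magic upper bound, which is what allows the series to lift the 255-ID limit cleanly.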
+ */ + avic_apicv_post_state_restore(vcpu); + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + } else { + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + } + vmcb_mark_dirty(vmcb, VMCB_AVIC); + + if (activated) + avic_vcpu_load(vcpu); + else + avic_vcpu_put(vcpu); + + avic_set_pi_irte_mode(vcpu, activated); +} + void avic_vcpu_blocking(struct kvm_vcpu *vcpu) { if (!kvm_vcpu_apicv_active(vcpu)) return; - preempt_disable(); - /* * Unload the AVIC when the vCPU is about to block, _before_ * the vCPU actually blocks. @@ -1001,21 +989,12 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu) * the cause of errata #1235). */ avic_vcpu_put(vcpu); - - preempt_enable(); } void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) { - int cpu; - if (!kvm_vcpu_apicv_active(vcpu)) return; - cpu = get_cpu(); - WARN_ON(cpu != vcpu->cpu); - - avic_vcpu_load(vcpu, cpu); - - put_cpu(); + avic_vcpu_load(vcpu); } diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h new file mode 100644 index 000000000000..7d6d97968fb9 --- /dev/null +++ b/arch/x86/kvm/svm/hyperv.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM). + */ + +#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__ +#define __ARCH_X86_KVM_SVM_HYPERV_H__ + +#include <asm/mshyperv.h> + +#include "../hyperv.h" + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB + * control area to expose SVM enlightenments to guests. + */ +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW + +#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 39d280e7e80e..96bab464967f 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -28,6 +28,7 @@ #include "cpuid.h" #include "lapic.h" #include "svm.h" +#include "hyperv.h" #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK @@ -165,14 +166,30 @@ void recalc_intercepts(struct vcpu_svm *svm) vmcb_set_intercept(c, INTERCEPT_VMSAVE); } +/* + * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function + * is optimized in that it only merges the parts where KVM MSR permission bitmap + * may contain zero bits. + */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + struct hv_enlightenments *hve = + (struct hv_enlightenments *)svm->nested.ctl.reserved_sw; + int i; + /* - * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is optimized in that it only merges the parts where - * the kvm msr permission bitmap may contain zero bits + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) is using Hyper-V emulation interface and + * tells KVM (L0) there were no changes in MSR bitmap for L2. 
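The enlightenments blob must fit exactly in the 32 software-reserved bytes of the VMCB control area, which is easy to sanity-check by re-declaring an equivalent packed layout in userspace (illustration only; the kernel struct is the one above):

#include <stdint.h>

struct hv_enlightenments_demo {
	struct __attribute__((packed)) {
		uint32_t nested_flush_hypercall:1;
		uint32_t msr_bitmap:1;
		uint32_t enlightened_npt_tlb:1;
		uint32_t reserved:29;
	} control;
	uint32_t hv_vp_id;
	uint64_t hv_vm_id;
	uint64_t partition_assist_page;
	uint64_t reserved;
} __attribute__((packed));

_Static_assert(sizeof(struct hv_enlightenments_demo) == 32,
	       "must fit the VMCB software-reserved area");

int main(void) { return 0; }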
*/ - int i; + if (!svm->nested.force_msr_bitmap_recalc && + kvm_hv_hypercall_enabled(&svm->vcpu) && + hve->hv_enlightenments_control.msr_bitmap && + (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS))) + goto set_msrpm_base_pa; if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; @@ -193,6 +210,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.msrpm[p] = svm->msrpm[p] | value; } + svm->nested.force_msr_bitmap_recalc = false; + +set_msrpm_base_pa: svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; @@ -298,7 +318,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) } static -void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, +void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *to, struct vmcb_control_area *from) { unsigned int i; @@ -331,12 +352,19 @@ void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, to->asid = from->asid; to->msrpm_base_pa &= ~0x0fffULL; to->iopm_base_pa &= ~0x0fffULL; + + /* Hyper-V extensions (Enlightened VMCB) */ + if (kvm_hv_hypercall_enabled(vcpu)) { + to->clean = from->clean; + memcpy(to->reserved_sw, from->reserved_sw, + sizeof(struct hv_enlightenments)); + } } void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control) { - __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control); + __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control); } static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, @@ -464,14 +492,14 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, CC(!load_pdptrs(vcpu, cr3))) return -EINVAL; - if (!nested_npt) - kvm_mmu_new_pgd(vcpu, cr3); - vcpu->arch.cr3 = cr3; /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. 
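The reordering at the bottom of this hunk, with kvm_init_mmu() now running before kvm_mmu_new_pgd(), matters because the guest-PGD cache is keyed on the MMU role: looking up a cached root against a stale role can hit or miss incorrectly. A miniature of that lookup discipline (hypothetical, heavily simplified types):

#include <stdint.h>
#include <stdio.h>

struct mmu {
	uint32_t role;			/* recomputed by "init_mmu" */
	uint64_t cached_pgd[4];
	uint32_t cached_role[4];
};

static int find_cached_root(const struct mmu *m, uint64_t pgd)
{
	for (int i = 0; i < 4; i++)
		if (m->cached_role[i] == m->role && m->cached_pgd[i] == pgd)
			return i;	/* hit: reuse without a full reload */
	return -1;
}

int main(void)
{
	struct mmu m = { .role = 7, .cached_pgd = { 0x1000 },
			 .cached_role = { 7 } };

	m.role = 9;	/* init_mmu: role changed, e.g. by a CR4 update */
	printf("%d\n", find_cached_root(&m, 0x1000));	/* -1: stale entry */
	return 0;
}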
*/ kvm_init_mmu(vcpu); + if (!nested_npt) + kvm_mmu_new_pgd(vcpu, cr3); + return 0; } @@ -494,6 +522,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { new_vmcb12 = true; svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa; + svm->nested.force_msr_bitmap_recalc = true; } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { @@ -1302,6 +1331,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; dst->pause_filter_thresh = from->pause_filter_thresh; + /* 'clean' and 'reserved_sw' are not changed by KVM */ } static int svm_get_nested_state(struct kvm_vcpu *vcpu, @@ -1434,7 +1464,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl); + __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; @@ -1495,6 +1525,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(ret)) goto out_free; + svm->nested.force_msr_bitmap_recalc = true; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 5aa45f13b16d..d4de52409335 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - if (!enable_pmu) + if (!vcpu->kvm->arch.enable_pmu) return NULL; switch (msr) { diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 17b53457d866..75fa6dd268f0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free; INIT_LIST_HEAD(&sev->regions_list); + INIT_LIST_HEAD(&sev->mirror_vms); return 0; @@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) } } -static void sev_migrate_from(struct kvm_sev_info *dst, - struct kvm_sev_info *src) +static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { + struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *mirror; + dst->active = true; dst->asid = src->asid; dst->handle = src->handle; @@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst, src->enc_context_owner = NULL; list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); + + /* + * If this VM has mirrors, "transfer" each mirror's refcount of the + * source to the destination (this KVM). The caller holds a reference + * to the source, so there's no danger of use-after-free. + */ + list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms); + list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) { + kvm_get_kvm(dst_kvm); + kvm_put_kvm(src_kvm); + mirror->enc_context_owner = dst_kvm; + } + + /* + * If this VM is a mirror, remove the old mirror from the owners list + * and add the new mirror to the list. 
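The mirror hand-off above is a classic ownership transfer: every mirror pins its owner with one reference, so moving N mirrors to a new owner means taking N references on the destination and dropping N on the source before the source's count may fall. Reduced to its essence (hypothetical simplified types):

#include <stdatomic.h>

struct vm { atomic_int users; };

static void transfer_mirror_refs(struct vm *dst, struct vm *src,
				 int nr_mirrors)
{
	for (int i = 0; i < nr_mirrors; i++) {
		atomic_fetch_add(&dst->users, 1);	/* kvm_get_kvm(dst) */
		atomic_fetch_sub(&src->users, 1);	/* kvm_put_kvm(src) */
	}
}

int main(void)
{
	struct vm dst = { .users = 1 }, src = { .users = 3 };	/* 2 mirrors */

	transfer_mirror_refs(&dst, &src, 2);
	return atomic_load(&src.users) == 1 ? 0 : 1;
}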
+ */ + if (is_mirroring_enc_context(dst_kvm)) { + struct kvm_sev_info *owner_sev_info = + &to_kvm_svm(dst->enc_context_owner)->sev_info; + + list_del(&src->mirror_entry); + list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); + } } static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) @@ -1681,7 +1709,7 @@ static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) return 0; } -int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) +int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; @@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) src_sev = &to_kvm_svm(source_kvm)->sev_info; - /* - * VMs mirroring src's encryption context rely on it to keep the - * ASID allocated, but below we are clearing src_sev->asid. - */ - if (src_sev->num_mirrored_vms) { - ret = -EBUSY; - goto out_unlock; - } - dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) { @@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) if (ret) goto out_source_vcpu; } - sev_migrate_from(dst_sev, src_sev); + + sev_migrate_from(kvm, source_kvm); kvm_vm_dead(source_kvm); cg_cleanup_sev = src_sev; ret = 0; @@ -1761,7 +1781,7 @@ out_fput: return ret; } -int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; int r; @@ -1858,8 +1878,8 @@ out: return r; } -int svm_register_enc_region(struct kvm *kvm, - struct kvm_enc_region *range) +int sev_mem_enc_register_region(struct kvm *kvm, + struct kvm_enc_region *range) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct enc_region *region; @@ -1932,8 +1952,8 @@ static void __unregister_enc_region_locked(struct kvm *kvm, kfree(region); } -int svm_unregister_enc_region(struct kvm *kvm, - struct kvm_enc_region *range) +int sev_mem_enc_unregister_region(struct kvm *kvm, + struct kvm_enc_region *range) { struct enc_region *region; int ret; @@ -1972,7 +1992,7 @@ failed: return ret; } -int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) +int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; @@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) */ source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - source_sev->num_mirrored_vms++; + mirror_sev = &to_kvm_svm(kvm)->sev_info; + list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ - mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; mirror_sev->asid = source_sev->asid; @@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->es_active = source_sev->es_active; mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); + INIT_LIST_HEAD(&mirror_sev->mirror_vms); ret = 0; /* @@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; - WARN_ON(sev->num_mirrored_vms); - if (!sev_guest(kvm)) return; + WARN_ON(!list_empty(&sev->mirror_vms)); + /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if 
@@ -1761,7 +1781,7 @@ out_fput:
 	return ret;
 }
 
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
 	int r;
@@ -1858,8 +1878,8 @@ out:
 	return r;
 }
 
-int svm_register_enc_region(struct kvm *kvm,
-			    struct kvm_enc_region *range)
+int sev_mem_enc_register_region(struct kvm *kvm,
+				struct kvm_enc_region *range)
 {
 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
 	struct enc_region *region;
@@ -1932,8 +1952,8 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
 	kfree(region);
 }
 
-int svm_unregister_enc_region(struct kvm *kvm,
-			      struct kvm_enc_region *range)
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+				  struct kvm_enc_region *range)
 {
 	struct enc_region *region;
 	int ret;
@@ -1972,7 +1992,7 @@ failed:
 	return ret;
 }
 
-int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 {
 	struct file *source_kvm_file;
 	struct kvm *source_kvm;
@@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
 	 */
 	source_sev = &to_kvm_svm(source_kvm)->sev_info;
 	kvm_get_kvm(source_kvm);
-	source_sev->num_mirrored_vms++;
+	mirror_sev = &to_kvm_svm(kvm)->sev_info;
+	list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
 
 	/* Set enc_context_owner and copy its encryption context over */
-	mirror_sev = &to_kvm_svm(kvm)->sev_info;
 	mirror_sev->enc_context_owner = source_kvm;
 	mirror_sev->active = true;
 	mirror_sev->asid = source_sev->asid;
@@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
 	mirror_sev->es_active = source_sev->es_active;
 	mirror_sev->handle = source_sev->handle;
 	INIT_LIST_HEAD(&mirror_sev->regions_list);
+	INIT_LIST_HEAD(&mirror_sev->mirror_vms);
 	ret = 0;
 
 	/*
@@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm)
 	struct list_head *head = &sev->regions_list;
 	struct list_head *pos, *q;
 
-	WARN_ON(sev->num_mirrored_vms);
-
 	if (!sev_guest(kvm))
 		return;
 
+	WARN_ON(!list_empty(&sev->mirror_vms));
+
 	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
 	if (is_mirroring_enc_context(kvm)) {
 		struct kvm *owner_kvm = sev->enc_context_owner;
-		struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
 
 		mutex_lock(&owner_kvm->lock);
-		if (!WARN_ON(!owner_sev->num_mirrored_vms))
-			owner_sev->num_mirrored_vms--;
+		list_del(&sev->mirror_entry);
 		mutex_unlock(&owner_kvm->lock);
 		kvm_put_kvm(owner_kvm);
 		return;
@@ -2173,7 +2192,7 @@ out:
 #endif
 }
 
-void sev_hardware_teardown(void)
+void sev_hardware_unsetup(void)
 {
 	if (!sev_enabled)
 		return;
@@ -2358,7 +2377,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
 	memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
 }
 
-static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 {
 	struct kvm_vcpu *vcpu;
 	struct ghcb *ghcb;
@@ -2463,7 +2482,7 @@ static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
 			goto vmgexit_err;
 	}
 
-	return true;
+	return 0;
 
 vmgexit_err:
 	vcpu = &svm->vcpu;
@@ -2486,7 +2505,8 @@ vmgexit_err:
 	ghcb_set_sw_exit_info_1(ghcb, 2);
 	ghcb_set_sw_exit_info_2(ghcb, reason);
 
-	return false;
+	/* Resume the guest to "return" the error code. */
+	return 1;
 }
 
 void sev_es_unmap_ghcb(struct vcpu_svm *svm)
@@ -2545,7 +2565,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
 }
 
 #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
-static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct ghcb *ghcb = svm->sev_es.ghcb;
@@ -2598,14 +2618,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
 		}
 		scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
 		if (!scratch_va)
-			goto e_scratch;
+			return -ENOMEM;
 
 		if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
 			/* Unable to copy scratch area from guest */
 			pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
 
 			kvfree(scratch_va);
-			goto e_scratch;
+			return -EFAULT;
 		}
 
 		/*
@@ -2621,13 +2641,13 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
 	svm->sev_es.ghcb_sa = scratch_va;
 	svm->sev_es.ghcb_sa_len = len;
 
-	return true;
+	return 0;
 
 e_scratch:
 	ghcb_set_sw_exit_info_1(ghcb, 2);
 	ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
 
-	return false;
+	return 1;
 }
 
 static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
@@ -2765,17 +2785,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 
 	exit_code = ghcb_get_sw_exit_code(ghcb);
 
-	if (!sev_es_validate_vmgexit(svm))
-		return 1;
+	ret = sev_es_validate_vmgexit(svm);
+	if (ret)
+		return ret;
 
 	sev_es_sync_from_ghcb(svm);
 	ghcb_set_sw_exit_info_1(ghcb, 0);
 	ghcb_set_sw_exit_info_2(ghcb, 0);
 
-	ret = 1;
 	switch (exit_code) {
 	case SVM_VMGEXIT_MMIO_READ:
-		if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
+		ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+		if (ret)
 			break;
 
 		ret = kvm_sev_es_mmio_read(vcpu,
@@ -2784,7 +2805,8 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 					   svm->sev_es.ghcb_sa);
 		break;
 	case SVM_VMGEXIT_MMIO_WRITE:
-		if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
+		ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
+		if (ret)
 			break;
 
 		ret = kvm_sev_es_mmio_write(vcpu,
@@ -2817,6 +2839,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 			ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
 		}
 
+		ret = 1;
 		break;
 	}
 	case SVM_VMGEXIT_UNSUPPORTED_EVENT:
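The bool-to-int conversions above move these helpers onto KVM's usual exit-handler return convention: a negative errno propagates a host-side failure out to userspace, a positive value resumes the guest (here with the error already stashed in the GHCB), and zero means success for the helper itself. A hedged sketch of how a caller consumes that tristate, reusing setup_vmgexit_scratch() from this diff (the handler name is made up for illustration):

    /* Illustration only, not code from this series. */
    static int handle_some_vmgexit(struct vcpu_svm *svm, u64 len)
    {
            int ret;

            ret = setup_vmgexit_scratch(svm, true, len);
            if (ret)
                    return ret;     /* -ENOMEM/-EFAULT to userspace, 1 resumes guest */

            /* ... operate on svm->sev_es.ghcb_sa ... */
            return 1;               /* done, re-enter the guest */
    }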
@@ -2836,6 +2859,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
 {
 	int count;
 	int bytes;
+	int r;
 
 	if (svm->vmcb->control.exit_info_2 > INT_MAX)
 		return -EINVAL;
@@ -2844,8 +2868,9 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
 	if (unlikely(check_mul_overflow(count, size, &bytes)))
 		return -EINVAL;
 
-	if (!setup_vmgexit_scratch(svm, in, bytes))
-		return 1;
+	r = setup_vmgexit_scratch(svm, in, bytes);
+	if (r)
+		return r;
 
 	return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
 				    count, in);
@@ -2907,20 +2932,16 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
 					    sev_enc_bit));
 }
 
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
+void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa)
 {
-	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
-	struct vmcb_save_area *hostsa;
-
 	/*
 	 * As an SEV-ES guest, hardware will restore the host state on VMEXIT,
-	 * of which one step is to perform a VMLOAD. Since hardware does not
-	 * perform a VMSAVE on VMRUN, the host savearea must be updated.
+	 * of which one step is to perform a VMLOAD. KVM performs the
+	 * corresponding VMSAVE in svm_prepare_guest_switch for both
+	 * traditional and SEV-ES guests.
	 */
-	vmsave(__sme_page_pa(sd->save_area));
 
 	/* XCR0 is restored on VMEXIT, save the current host value */
-	hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
 	hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
 	/* PKRU is restored on VMEXIT, save the current host value */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index fd3a00c892c7..0884c3414a1b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -263,7 +263,7 @@ u32 svm_msrpm_offset(u32 msr)
 	return MSR_INVALID;
 }
 
-#define MAX_INST_SIZE 15
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
 
 static int get_npt_level(void)
 {
@@ -353,7 +353,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 
 }
 
-static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -401,7 +401,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
 		 * raises a fault that is not intercepted. Still better than
 		 * failing in all cases.
 		 */
-		(void)skip_emulated_instruction(vcpu);
+		(void)svm_skip_emulated_instruction(vcpu);
 		rip = kvm_rip_read(vcpu);
 		svm->int3_rip = rip + svm->vmcb->save.cs.base;
 		svm->int3_injected = rip - old_rip;
@@ -668,6 +668,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 					u32 msr, int read, int write)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
 	u8 bit_read, bit_write;
 	unsigned long tmp;
 	u32 offset;
@@ -698,7 +699,7 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 	msrpm[offset] = tmp;
 
 	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
-
+	svm->nested.force_msr_bitmap_recalc = true;
 }
 
 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
@@ -873,11 +874,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void svm_hardware_teardown(void)
+static void svm_hardware_unsetup(void)
 {
 	int cpu;
 
-	sev_hardware_teardown();
+	sev_hardware_unsetup();
 
 	for_each_possible_cpu(cpu)
 		svm_cpu_uninit(cpu);
@@ -1175,7 +1176,7 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
 	svm->vmcb = target_vmcb->ptr;
 }
 
-static int svm_create_vcpu(struct kvm_vcpu *vcpu)
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm;
 	struct page *vmcb01_page;
@@ -1246,7 +1247,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
 		cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
 }
 
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -1265,7 +1266,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
 }
 
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
@@ -1280,10 +1281,12 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 	 * Save additional host state that will be restored on VMEXIT (sev-es)
 	 * or subsequent vmload of host save area.
	 */
+	vmsave(__sme_page_pa(sd->save_area));
 	if (sev_es_guest(vcpu->kvm)) {
-		sev_es_prepare_guest_switch(svm, vcpu->cpu);
-	} else {
-		vmsave(__sme_page_pa(sd->save_area));
+		struct vmcb_save_area *hostsa;
+		hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+
+		sev_es_prepare_switch_to_guest(hostsa);
 	}
 
 	if (tsc_scaling) {
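The open-coded `+ 0x400` works because the save area sits at that fixed offset inside the VMCB page, which the svm_enter_smm() comment further down also relies on ("matches the offset of 'struct vmcb_save_area' within 'struct vmcb'"). A compile-time assertion would make the assumption explicit; a sketch only, the series itself does not add this check:

    #include <linux/build_bug.h>
    #include <linux/stddef.h>

    /* The per-CPU host save area is a VMCB-shaped page; the save area
     * starts 0x400 bytes in, hence page_address(sd->save_area) + 0x400. */
    static inline void check_hostsa_offset(void)
    {
            BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
    }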
@@ -1315,13 +1318,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		indirect_branch_prediction_barrier();
 	}
 	if (kvm_vcpu_apicv_active(vcpu))
-		avic_vcpu_load(vcpu, cpu);
+		__avic_vcpu_load(vcpu, cpu);
 }
 
 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	if (kvm_vcpu_apicv_active(vcpu))
-		avic_vcpu_put(vcpu);
+		__avic_vcpu_put(vcpu);
 
 	svm_prepare_host_switch(vcpu);
 
@@ -1529,6 +1532,15 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
 	return save->cpl;
 }
 
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+	struct kvm_segment cs;
+
+	svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	*db = cs.db;
+	*l = cs.l;
+}
+
 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1563,7 +1575,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
 }
 
-static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -1647,7 +1659,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	unsigned long old_cr4 = vcpu->arch.cr4;
 
 	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
-		svm_flush_tlb(vcpu);
+		svm_flush_tlb_current(vcpu);
 
 	vcpu->arch.cr4 = cr4;
 	if (!npt_enabled) {
@@ -2269,7 +2281,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
 	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
 	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-		if (!skip_emulated_instruction(vcpu))
+		if (!svm_skip_emulated_instruction(vcpu))
 			return 0;
 	}
 
@@ -3149,7 +3161,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	       "excp_to:", save->last_excp_to);
 }
 
-static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(u64 exit_code)
 {
 	return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
 		svm_exit_handlers[exit_code]);
@@ -3169,7 +3181,7 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
 
 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-	if (!svm_check_exit_valid(vcpu, exit_code))
+	if (!svm_check_exit_valid(exit_code))
 		return svm_handle_invalid_exit(vcpu, exit_code);
 
 #ifdef CONFIG_RETPOLINE
@@ -3204,7 +3216,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
 		*error_code = 0;
 }
 
-static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct kvm_run *kvm_run = vcpu->run;
@@ -3301,7 +3313,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 	++vcpu->stat.nmi_injections;
 }
 
-static void svm_set_irq(struct kvm_vcpu *vcpu)
+static void svm_inject_irq(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3531,17 +3543,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 }
 
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
-	return 0;
-}
-
-static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
-{
-	return 0;
-}
-
-void svm_flush_tlb(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3915,11 +3917,6 @@ static int __init svm_check_processor_compat(void)
 	return 0;
 }
 
-static bool svm_cpu_has_accelerated_tpr(void)
-{
-	return false;
-}
-
 /*
  * The kvm parameter can be NULL (module initialization, or invocation before
  * VM creation). Be sure to check the kvm parameter before using it.
@@ -4254,7 +4251,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
	 * by 0x400 (matches the offset of 'struct vmcb_save_area'
	 * within 'struct vmcb'). Note: HSAVE area may also be used by
	 * L1 hypervisor to save additional host context (e.g. KVM does
-	 * that, see svm_prepare_guest_switch()) which must be
+	 * that, see svm_prepare_switch_to_guest()) which must be
	 * preserved.
	 */
 	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
@@ -4529,21 +4526,20 @@ static int svm_vm_init(struct kvm *kvm)
 
 static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.name = "kvm_amd",
 
-	.hardware_unsetup = svm_hardware_teardown,
+	.hardware_unsetup = svm_hardware_unsetup,
 	.hardware_enable = svm_hardware_enable,
 	.hardware_disable = svm_hardware_disable,
-	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
 	.has_emulated_msr = svm_has_emulated_msr,
 
-	.vcpu_create = svm_create_vcpu,
-	.vcpu_free = svm_free_vcpu,
+	.vcpu_create = svm_vcpu_create,
+	.vcpu_free = svm_vcpu_free,
 	.vcpu_reset = svm_vcpu_reset,
 
 	.vm_size = sizeof(struct kvm_svm),
 	.vm_init = svm_vm_init,
 	.vm_destroy = svm_vm_destroy,
 
-	.prepare_guest_switch = svm_prepare_guest_switch,
+	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
 	.vcpu_load = svm_vcpu_load,
 	.vcpu_put = svm_vcpu_put,
 	.vcpu_blocking = avic_vcpu_blocking,
@@ -4557,9 +4553,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.get_segment = svm_get_segment,
 	.set_segment = svm_set_segment,
 	.get_cpl = svm_get_cpl,
-	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
 	.set_cr0 = svm_set_cr0,
-	.post_set_cr3 = svm_post_set_cr3,
+	.post_set_cr3 = sev_post_set_cr3,
 	.is_valid_cr4 = svm_is_valid_cr4,
 	.set_cr4 = svm_set_cr4,
 	.set_efer = svm_set_efer,
@@ -4574,21 +4570,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.set_rflags = svm_set_rflags,
 	.get_if_flag = svm_get_if_flag,
 
-	.tlb_flush_all = svm_flush_tlb,
-	.tlb_flush_current = svm_flush_tlb,
-	.tlb_flush_gva = svm_flush_tlb_gva,
-	.tlb_flush_guest = svm_flush_tlb,
+	.flush_tlb_all = svm_flush_tlb_current,
+	.flush_tlb_current = svm_flush_tlb_current,
+	.flush_tlb_gva = svm_flush_tlb_gva,
+	.flush_tlb_guest = svm_flush_tlb_current,
 
 	.vcpu_pre_run = svm_vcpu_pre_run,
-	.run = svm_vcpu_run,
-	.handle_exit = handle_exit,
-	.skip_emulated_instruction = skip_emulated_instruction,
+	.vcpu_run = svm_vcpu_run,
+	.handle_exit = svm_handle_exit,
+	.skip_emulated_instruction = svm_skip_emulated_instruction,
 	.update_emulated_instruction = NULL,
 	.set_interrupt_shadow = svm_set_interrupt_shadow,
 	.get_interrupt_shadow = svm_get_interrupt_shadow,
 	.patch_hypercall = svm_patch_hypercall,
-	.set_irq = svm_set_irq,
-	.set_nmi = svm_inject_nmi,
+	.inject_irq = svm_inject_irq,
+	.inject_nmi = svm_inject_nmi,
 	.queue_exception = svm_queue_exception,
 	.cancel_injection = svm_cancel_injection,
 	.interrupt_allowed = svm_interrupt_allowed,
@@ -4598,18 +4594,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.enable_nmi_window = svm_enable_nmi_window,
 	.enable_irq_window = svm_enable_irq_window,
 	.update_cr8_intercept = svm_update_cr8_intercept,
-	.set_virtual_apic_mode =
 svm_set_virtual_apic_mode,
-	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
-	.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
-	.load_eoi_exitmap = svm_load_eoi_exitmap,
-	.hwapic_irr_update = svm_hwapic_irr_update,
-	.hwapic_isr_update = svm_hwapic_isr_update,
-	.apicv_post_state_restore = avic_post_state_restore,
-
-	.set_tss_addr = svm_set_tss_addr,
-	.set_identity_map_addr = svm_set_identity_map_addr,
-	.get_mt_mask = svm_get_mt_mask,
+	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+	.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
+	.apicv_post_state_restore = avic_apicv_post_state_restore,
+
+	.get_mt_mask = svm_get_mt_mask,
 	.get_exit_info = svm_get_exit_info,
 
 	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
@@ -4634,8 +4623,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.nested_ops = &svm_nested_ops,
 
 	.deliver_interrupt = svm_deliver_interrupt,
-	.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
-	.update_pi_irte = svm_update_pi_irte,
+	.pi_update_irte = avic_pi_update_irte,
 	.setup_mce = svm_setup_mce,
 
 	.smi_allowed = svm_smi_allowed,
@@ -4643,12 +4631,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.leave_smm = svm_leave_smm,
 	.enable_smi_window = svm_enable_smi_window,
 
-	.mem_enc_op = svm_mem_enc_op,
-	.mem_enc_reg_region = svm_register_enc_region,
-	.mem_enc_unreg_region = svm_unregister_enc_region,
+	.mem_enc_ioctl = sev_mem_enc_ioctl,
+	.mem_enc_register_region = sev_mem_enc_register_region,
+	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
 
-	.vm_copy_enc_context_from = svm_vm_copy_asid_from,
-	.vm_move_enc_context_from = svm_vm_migrate_from,
+	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
+	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
 
 	.can_emulate_instruction = svm_can_emulate_instruction,
 
@@ -4893,7 +4881,7 @@ static __init int svm_hardware_setup(void)
 	return 0;
 
 err:
-	svm_hardware_teardown();
+	svm_hardware_unsetup();
 	return r;
 }
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index fa98d6844728..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,7 +79,8 @@ struct kvm_sev_info {
 	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
 	struct kvm *enc_context_owner; /* Owner of copied encryption context */
-	unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
+	struct list_head mirror_vms; /* List of VMs mirroring */
+	struct list_head mirror_entry; /* Use as a list entry of mirrors */
 	struct misc_cg *misc_cg; /* For misc cgroup accounting */
 	atomic_t migration_in_progress;
 };
@@ -137,6 +138,8 @@ struct vmcb_ctrl_area_cached {
 	u32 event_inj_err;
 	u64 nested_cr3;
 	u64 virt_ext;
+	u32 clean;
+	u8 reserved_sw[32];
 };
 
 struct svm_nested_state {
@@ -163,6 +166,15 @@ struct svm_nested_state {
 	struct vmcb_save_area_cached save;
 
 	bool initialized;
+
+	/*
+	 * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+	 * changes in MSR bitmap for L1 or switching to a different L2. Note,
+	 * this flag can only be used reliably in conjunction with a paravirt L1
+	 * which informs L0 whether any changes to MSR bitmap for L2 were done
+	 * on its side.
+	 */
+	bool force_msr_bitmap_recalc;
 };
 
 struct vcpu_sev_es_state {
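The flag is only ever set in this diff; its consumer is outside the quoted hunks. The expected pattern would be a test-and-clear at the point where the merged L0/L1 MSR bitmap for L2 is rebuilt (presumably nested_svm_vmrun_msrpm(), though that function is not shown here). A sketch of that assumed pattern:

    /* Sketch only: rebuild the merged MSR permission bitmap for L2 just
     * when something changed - a different vmcb12, an L1 bitmap update,
     * or restored nested state - all of which set the flag above. */
    if (svm->nested.force_msr_bitmap_recalc) {
            /* ... recompute the merged bitmap from vmcb12 ... */
            svm->nested.force_msr_bitmap_recalc = false;
    }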
All other - * fields are synchronized in handle_exit, because accessing the VMCB is cheap. + * fields are synchronized on VM-Exit, because accessing the VMCB is cheap. * * CR3 might be out of date in the VMCB but it is not marked dirty; instead, * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3 @@ -480,7 +492,6 @@ void svm_vcpu_free_msrpm(u32 *msrpm); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); bool svm_smi_blocked(struct kvm_vcpu *vcpu); bool svm_nmi_blocked(struct kvm_vcpu *vcpu); @@ -558,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 +#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) + +#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) +#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) + +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -565,18 +587,17 @@ void avic_init_vmcb(struct vcpu_svm *svm); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); -void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void avic_vcpu_put(struct kvm_vcpu *vcpu); -void avic_post_state_restore(struct kvm_vcpu *vcpu); -void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); -void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); -int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void __avic_vcpu_put(struct kvm_vcpu *vcpu); +void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); +bool avic_check_apicv_inhibit_reasons(ulong bit); +void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); +void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); +bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); +int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); @@ -590,17 +611,17 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu); extern unsigned int max_sev_asid; void sev_vm_destroy(struct kvm *kvm); -int svm_mem_enc_op(struct kvm *kvm, void __user *argp); -int svm_register_enc_region(struct kvm *kvm, - struct kvm_enc_region *range); -int svm_unregister_enc_region(struct kvm *kvm, - struct kvm_enc_region *range); -int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); -int svm_vm_migrate_from(struct 
 kvm *kvm, unsigned int source_fd);
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
+int sev_mem_enc_register_region(struct kvm *kvm,
+				struct kvm_enc_region *range);
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+				  struct kvm_enc_region *range);
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
 void pre_sev_run(struct vcpu_svm *svm, int cpu);
 void __init sev_set_cpu_caps(void);
 void __init sev_hardware_setup(void);
-void sev_hardware_teardown(void);
+void sev_hardware_unsetup(void);
 int sev_cpu_init(struct svm_cpu_data *sd);
 void sev_free_vcpu(struct kvm_vcpu *vcpu);
 int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
@@ -608,7 +629,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
 void sev_es_init_vmcb(struct vcpu_svm *svm);
 void sev_es_vcpu_reset(struct vcpu_svm *svm);
 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
+void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa);
 void sev_es_unmap_ghcb(struct vcpu_svm *svm);
 
 /* vmenter.S */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 489ca56212c6..e2fc59380465 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -7,35 +7,12 @@
 #define __ARCH_X86_KVM_SVM_ONHYPERV_H__
 
 #if IS_ENABLED(CONFIG_HYPERV)
-#include <asm/mshyperv.h>
 
-#include "hyperv.h"
 #include "kvm_onhyperv.h"
+#include "svm/hyperv.h"
 
 static struct kvm_x86_ops svm_x86_ops;
 
-/*
- * Hyper-V uses the software reserved 32 bytes in VMCB
- * control area to expose SVM enlightenments to guests.
- */
-struct hv_enlightenments {
-	struct __packed hv_enlightenments_control {
-		u32 nested_flush_hypercall:1;
-		u32 msr_bitmap:1;
-		u32 enlightened_npt_tlb: 1;
-		u32 reserved:29;
-	} __packed hv_enlightenments_control;
-	u32 hv_vp_id;
-	u64 hv_vm_id;
-	u64 partition_assist_page;
-	u64 reserved;
-} __packed;
-
-/*
- * Hyper-V uses the software reserved clean bit in VMCB
- */
-#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
-
 int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
 
 static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 92e6f6702f00..193f5ba930d1 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -64,9 +64,9 @@ TRACE_EVENT(kvm_hypercall,
 
 /*
  * Tracepoint for hypercall.
 */
 TRACE_EVENT(kvm_hv_hypercall,
-	TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
-		 __u64 ingpa, __u64 outgpa),
-	TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+	TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt,
+		 __u16 rep_idx, __u64 ingpa, __u64 outgpa),
+	TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa),
 
 	TP_STRUCT__entry(
 		__field(	__u16,		rep_cnt		)
@@ -74,6 +74,7 @@ TRACE_EVENT(kvm_hv_hypercall,
 		__field(	__u64,		ingpa		)
 		__field(	__u64,		outgpa		)
 		__field(	__u16, 		code		)
+		__field(	__u16,		var_cnt		)
 		__field(	bool,		fast		)
 	),
 
@@ -83,13 +84,14 @@ TRACE_EVENT(kvm_hv_hypercall,
 		__entry->ingpa		= ingpa;
 		__entry->outgpa		= outgpa;
 		__entry->code		= code;
+		__entry->var_cnt	= var_cnt;
 		__entry->fast		= fast;
 	),
 
-	TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+	TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
 		  __entry->code, __entry->fast ?
"fast" : "slow", - __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, - __entry->outgpa) + __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx, + __entry->ingpa, __entry->outgpa) ); TRACE_EVENT(kvm_hv_hypercall_done, @@ -251,13 +253,13 @@ TRACE_EVENT(kvm_cpuid, * Tracepoint for apic access. */ TRACE_EVENT(kvm_apic, - TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), + TP_PROTO(unsigned int rw, unsigned int reg, u64 val), TP_ARGS(rw, reg, val), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, reg ) - __field( unsigned int, val ) + __field( u64, val ) ), TP_fast_assign( @@ -266,7 +268,7 @@ TRACE_EVENT(kvm_apic, __entry->val = val; ), - TP_printk("apic_%s %s = 0x%x", + TP_printk("apic_%s %s = 0x%llx", __entry->rw ? "write" : "read", __print_symbolic(__entry->reg, kvm_trace_symbol_apic), __entry->val) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index dc822a1d403d..f18744f7ff82 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -320,7 +320,7 @@ static void free_nested(struct kvm_vcpu *vcpu) kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; - kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); nested_release_evmcs(vcpu); @@ -1125,15 +1125,15 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; } - if (!nested_ept) - kvm_mmu_new_pgd(vcpu, cr3); - vcpu->arch.cr3 = cr3; kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); + if (!nested_ept) + kvm_mmu_new_pgd(vcpu, cr3); + return 0; } @@ -4802,7 +4802,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl) { struct vcpu_vmx *vmx; @@ -4810,7 +4811,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) return; vmx = to_vmx(vcpu); - if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + if (vcpu_has_perf_global_ctrl) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high |= @@ -5011,7 +5012,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) vmx->nested.current_vmptr >> PAGE_SHIFT, vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); - kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); vmx->nested.current_vmptr = INVALID_GPA; } @@ -5470,7 +5471,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); roots_to_free = 0; - if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, + if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, operand.eptp)) roots_to_free |= KVM_MMU_ROOT_CURRENT; @@ -5490,7 +5491,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) } if (roots_to_free) - kvm_mmu_free_roots(vcpu, mmu, roots_to_free); + kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); return nested_vmx_succeed(vcpu); } @@ -5579,7 +5580,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index dc822a1d403d..f18744f7ff82 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -320,7 +320,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 	vmx->nested.pi_desc = NULL;
 
-	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
 	nested_release_evmcs(vcpu);
 
@@ -1125,15 +1125,15 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
 			return -EINVAL;
 	}
 
-	if (!nested_ept)
-		kvm_mmu_new_pgd(vcpu, cr3);
-
 	vcpu->arch.cr3 = cr3;
 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
 
 	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
 	kvm_init_mmu(vcpu);
 
+	if (!nested_ept)
+		kvm_mmu_new_pgd(vcpu, cr3);
+
 	return 0;
 }
 
@@ -4802,7 +4802,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 	return 0;
 }
 
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+			    bool vcpu_has_perf_global_ctrl)
 {
 	struct vcpu_vmx *vmx;
 
@@ -4810,7 +4811,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
 		return;
 
 	vmx = to_vmx(vcpu);
-	if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+	if (vcpu_has_perf_global_ctrl) {
 		vmx->nested.msrs.entry_ctls_high |=
 				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
 		vmx->nested.msrs.exit_ctls_high |=
@@ -5011,7 +5012,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 				  vmx->nested.current_vmptr >> PAGE_SHIFT,
 				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 
-	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
 	vmx->nested.current_vmptr = INVALID_GPA;
 }
@@ -5470,7 +5471,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 
 		roots_to_free = 0;
-		if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
+		if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
					    operand.eptp))
			roots_to_free |= KVM_MMU_ROOT_CURRENT;
 
@@ -5490,7 +5491,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 	}
 
 	if (roots_to_free)
-		kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+		kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
 
 	return nested_vmx_succeed(vcpu);
 }
@@ -5579,7 +5580,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
	 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
	 */
 	if (!enable_ept)
-		kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu);
+		kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
 
 	return nested_vmx_succeed(vcpu);
 }
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index b69a80f43b37..c92cea0b8ccc 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
			u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+			    bool vcpu_has_perf_global_ctrl);
 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
				 int size);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9b26596099a1..0684e519181b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -487,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	pmu->reserved_bits = 0xffffffff00200000ull;
 
 	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
-	if (!entry || !enable_pmu)
+	if (!entry || !vcpu->kvm->arch.enable_pmu)
 		return;
 	eax.full = entry->eax;
 	edx.full = entry->edx;
@@ -541,7 +541,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC,
		   pmu->nr_arch_fixed_counters);
 
-	nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+	nested_vmx_pmu_refresh(vcpu,
+			       intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL));
 
 	if (intel_pmu_lbr_is_compatible(vcpu))
 		x86_perf_get_lbr(&lbr_desc->records);
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index aa1fe9085d77..3834bb30ce54 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -244,7 +244,7 @@ void vmx_pi_start_assignment(struct kvm *kvm)
 }
 
 /*
- * pi_update_irte - set IRTE for Posted-Interrupts
+ * vmx_pi_update_irte - set IRTE for Posted-Interrupts
  *
  * @kvm: kvm
  * @host_irq: host irq of the interrupt
@@ -252,8 +252,8 @@ void vmx_pi_start_assignment(struct kvm *kvm)
  * @set: set or unset PI
  * returns 0 on success, < 0 on failure
 */
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
-		   bool set)
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
		       uint32_t guest_irq, bool set)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index eb14e76b84ef..9a45d5c9f116 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -97,8 +97,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
 void pi_wakeup_handler(void);
 void __init pi_init_cpu(int cpu);
 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
-		   bool set);
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
		       uint32_t guest_irq, bool set);
 void vmx_pi_start_assignment(struct kvm *kvm);
 
 #endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b730d799c26e..e8963f5af618 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -541,11 +541,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 	return
 flexpriority_enabled && lapic_in_kernel(vcpu);
 }
 
-static inline bool report_flexpriority(void)
-{
-	return flexpriority_enabled;
-}
-
 static int possible_passthrough_msr_slot(u32 msr)
 {
 	u32 i;
@@ -645,10 +640,10 @@ static void __loaded_vmcs_clear(void *arg)
 
 	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
-	 * current percpu list, complete before setting loaded_vmcs->vcpu to
-	 * -1, otherwise a different cpu can see vcpu == -1 first and add
-	 * loaded_vmcs to its percpu list before it's deleted from this cpu's
-	 * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+	 * current percpu list, complete before setting loaded_vmcs->cpu to
+	 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+	 * and add loaded_vmcs to its percpu list before it's deleted from this
+	 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
 	smp_wmb();
 
@@ -2334,7 +2329,7 @@ fault:
 	return -EFAULT;
 }
 
-static int hardware_enable(void)
+static int vmx_hardware_enable(void)
 {
 	int cpu = raw_smp_processor_id();
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2375,7 +2370,7 @@ static void vmclear_local_loaded_vmcss(void)
 		__loaded_vmcs_clear(v);
 }
 
-static void hardware_disable(void)
+static void vmx_hardware_disable(void)
 {
 	vmclear_local_loaded_vmcss();
 
@@ -2950,7 +2945,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
-	u64 root_hpa = mmu->root_hpa;
+	u64 root_hpa = mmu->root.hpa;
 
 	/* No flush required if the current context is invalid. */
 	if (!VALID_PAGE(root_hpa))
@@ -3930,31 +3925,33 @@ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
 #ifdef CONFIG_SMP
 	if (vcpu->mode == IN_GUEST_MODE) {
 		/*
-		 * The vector of interrupt to be delivered to vcpu had
-		 * been set in PIR before this function.
+		 * The vector of the virtual interrupt has already been set in
+		 * the PIR.  Send a notification event to deliver the virtual
+		 * interrupt unless the vCPU is the currently running vCPU,
+		 * i.e. the event is being sent from a fastpath VM-Exit
+		 * handler, in which case the PIR will be synced to the vIRR
+		 * before re-entering the guest.
		 *
-		 * Following cases will be reached in this block, and
-		 * we always send a notification event in all cases as
-		 * explained below.
+		 * When the target is not the running vCPU, the following
+		 * possibilities emerge:
		 *
-		 * Case 1: vcpu keeps in non-root mode. Sending a
-		 * notification event posts the interrupt to vcpu.
+		 * Case 1: vCPU stays in non-root mode. Sending a notification
+		 * event posts the interrupt to the vCPU.
		 *
-		 * Case 2: vcpu exits to root mode and is still
-		 * runnable. PIR will be synced to vIRR before the
-		 * next vcpu entry. Sending a notification event in
-		 * this case has no effect, as vcpu is not in root
-		 * mode.
+		 * Case 2: vCPU exits to root mode and is still runnable. The
+		 * PIR will be synced to the vIRR before re-entering the guest.
+		 * Sending a notification event is ok as the host IRQ handler
+		 * will ignore the spurious event.
		 *
-		 * Case 3: vcpu exits to root mode and is blocked.
-		 * vcpu_block() has already synced PIR to vIRR and
-		 * never blocks vcpu if vIRR is not cleared. Therefore,
-		 * a blocked vcpu here does not wait for any requested
-		 * interrupts in PIR, and sending a notification event
-		 * which has no effect is safe here.
+		 * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+		 * has already synced PIR to vIRR and never blocks the vCPU if
+		 * the vIRR is not empty.
 Therefore, a blocked vCPU here does
+		 * not wait for any requested interrupts in PIR, and sending a
+		 * notification event also results in a benign, spurious event.
		 */
-		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+		if (vcpu != kvm_get_running_vcpu())
+			apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
 		return;
 	}
 #endif
@@ -5177,7 +5174,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 	if (!kvm_require_dr(vcpu, dr))
 		return 1;
 
-	if (kvm_x86_ops.get_cpl(vcpu) > 0)
+	if (vmx_get_cpl(vcpu) > 0)
 		goto out;
 
 	dr7 = vmcs_readl(GUEST_DR7);
@@ -5310,9 +5307,16 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
 static int handle_apic_write(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
-	u32 offset = exit_qualification & 0xfff;
 
-	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
+	/*
	 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
	 * hardware has done any necessary aliasing, offset adjustments, etc...
	 * for the access.  I.e. the correct value has already been written to
	 * the vAPIC page for the correct 16-byte chunk.  KVM needs only to
	 * retrieve the register value and emulate the access.
	 */
+	u32 offset = exit_qualification & 0xff0;
+
 	kvm_apic_write_nodecode(vcpu, offset);
 	return 1;
 }
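Masking with 0xff0 instead of 0xfff keeps only the 16-byte-aligned register offset that the vAPIC page is organized around. A one-line illustration with an invented exit qualification value:

    u32 offset = 0x383 & 0xff0;   /* == 0x380, the aligned ICR chunk, not 0x383 */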
@@ -6975,7 +6979,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	return vmx_exit_handlers_fastpath(vcpu);
 }
 
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -6986,7 +6990,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	free_loaded_vmcs(vmx->loaded_vmcs);
 }
 
-static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
+static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
 {
 	struct vmx_uret_msr *tsx_ctrl;
 	struct vcpu_vmx *vmx;
@@ -7348,11 +7352,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
				     vmx_secondary_exec_control(vmx));
 
 	if (nested_vmx_allowed(vcpu))
-		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
 	else
-		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+		vmx->msr_ia32_feature_control_valid_bits &=
			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
 
@@ -7691,7 +7695,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void hardware_unsetup(void)
+static void vmx_hardware_unsetup(void)
 {
 	kvm_set_posted_intr_wakeup_handler(NULL);
 
@@ -7714,21 +7718,20 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
 static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.name = "kvm_intel",
 
-	.hardware_unsetup = hardware_unsetup,
+	.hardware_unsetup = vmx_hardware_unsetup,
 
-	.hardware_enable = hardware_enable,
-	.hardware_disable = hardware_disable,
-	.cpu_has_accelerated_tpr = report_flexpriority,
+	.hardware_enable = vmx_hardware_enable,
+	.hardware_disable = vmx_hardware_disable,
 	.has_emulated_msr = vmx_has_emulated_msr,
 
 	.vm_size = sizeof(struct kvm_vmx),
 	.vm_init = vmx_vm_init,
 
-	.vcpu_create = vmx_create_vcpu,
-	.vcpu_free = vmx_free_vcpu,
+	.vcpu_create = vmx_vcpu_create,
+	.vcpu_free = vmx_vcpu_free,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_prepare_switch_to_guest,
+	.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
@@ -7756,21 +7759,21 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.set_rflags = vmx_set_rflags,
 	.get_if_flag = vmx_get_if_flag,
 
-	.tlb_flush_all = vmx_flush_tlb_all,
-	.tlb_flush_current = vmx_flush_tlb_current,
-	.tlb_flush_gva = vmx_flush_tlb_gva,
-	.tlb_flush_guest = vmx_flush_tlb_guest,
+	.flush_tlb_all = vmx_flush_tlb_all,
+	.flush_tlb_current = vmx_flush_tlb_current,
+	.flush_tlb_gva = vmx_flush_tlb_gva,
+	.flush_tlb_guest = vmx_flush_tlb_guest,
 
 	.vcpu_pre_run = vmx_vcpu_pre_run,
-	.run = vmx_vcpu_run,
+	.vcpu_run = vmx_vcpu_run,
 	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = vmx_skip_emulated_instruction,
 	.update_emulated_instruction = vmx_update_emulated_instruction,
 	.set_interrupt_shadow = vmx_set_interrupt_shadow,
 	.get_interrupt_shadow = vmx_get_interrupt_shadow,
 	.patch_hypercall = vmx_patch_hypercall,
-	.set_irq = vmx_inject_irq,
-	.set_nmi = vmx_inject_nmi,
+	.inject_irq = vmx_inject_irq,
+	.inject_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
 	.cancel_injection = vmx_cancel_injection,
 	.interrupt_allowed = vmx_interrupt_allowed,
@@ -7823,8 +7826,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.pmu_ops = &intel_pmu_ops,
 	.nested_ops = &vmx_nested_ops,
 
-	.update_pi_irte = pi_update_irte,
-	.start_assignment = vmx_pi_start_assignment,
+	.pi_update_irte = vmx_pi_update_irte,
+	.pi_start_assignment = vmx_pi_start_assignment,
 
 #ifdef CONFIG_X86_64
 	.set_hv_timer = vmx_set_hv_timer,
@@ -8059,7 +8062,7 @@ static __init int hardware_setup(void)
 	vmx_set_cpu_caps();
 
 	r = alloc_kvm_area();
-	if (r)
+	if (r && nested)
 		nested_vmx_hardware_unsetup();
 
 	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
@@ -8139,7 +8142,6 @@ static int __init vmx_init(void)
 	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
 	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
 	    KVM_EVMCS_VERSION) {
-		int cpu;
 
 		/* Check that we have assist pages on all online CPUs */
 		for_each_online_cpu(cpu) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6db3a506b402..02cf0a7e1d14 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -110,6 +110,8 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
 #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
 
+#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
+
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS |	\
				    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
@@ -126,16 +128,15 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 
 struct kvm_x86_ops kvm_x86_ops __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
 #define KVM_X86_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_##func,			     \
				*(((struct kvm_x86_ops *)0)->func));
-#define KVM_X86_OP_NULL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
 #include <asm/kvm-x86-ops.h>
 EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
-EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
 
 static bool __read_mostly ignore_msrs = 0;
 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
@@ -194,6 +195,9 @@ bool __read_mostly enable_pmu = true;
 EXPORT_SYMBOL_GPL(enable_pmu);
 module_param(enable_pmu, bool, 0444);
 
+bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
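Because the new parameter is registered with mode 0644 and is re-read with READ_ONCE() at its use site (see the kvm_mmu_slot_apply_flags() hunk near the end of this diff), it should be toggleable at runtime, presumably through the usual module-parameter path /sys/module/kvm/parameters/eager_page_split, in addition to being set at load time.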
 /*
  * Restoring the host value for MSRs that are only consumed when running in
  * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -760,7 +764,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 	if ((fault->error_code & PFERR_PRESENT_MASK) &&
	    !(fault->error_code & PFERR_RSVD_MASK))
		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
-				       fault_mmu->root_hpa);
+				       fault_mmu->root.hpa);
 
 	fault_mmu->inject_page_fault(vcpu, fault);
 	return fault->nested_page_fault;
@@ -853,7 +857,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
	 * Shadow page roots need to be reconstructed instead.
	 */
 	if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
-		kvm_mmu_free_roots(vcpu, mmu, KVM_MMU_ROOT_CURRENT);
+		kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
 
 	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
@@ -869,6 +873,13 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
 	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
 		kvm_clear_async_pf_completion_queue(vcpu);
 		kvm_async_pf_hash_reset(vcpu);
+
+		/*
		 * Clearing CR0.PG is defined to flush the TLB from the guest's
		 * perspective.
		 */
+		if (!(cr0 & X86_CR0_PG))
+			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 	}
 
 	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
@@ -1067,28 +1078,43 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
 
 void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
 {
+	if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
+		kvm_mmu_reset_context(vcpu);
+
 	/*
-	 * If any role bit is changed, the MMU needs to be reset.
-	 *
-	 * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed.
	 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
	 * according to the SDM; however, stale prev_roots could be reused
	 * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
-	 * free them all.  KVM_REQ_MMU_RELOAD is fit for the both cases; it
-	 * is slow, but changing CR4.PCIDE is a rare case.
-	 *
-	 * If CR4.PGE is changed, the guest TLB must be flushed.
+	 * free them all.  This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
+	 * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
+	 * so fall through.
+	 */
+	if (!tdp_enabled &&
+	    (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
+		kvm_mmu_unload(vcpu);
+
+	/*
	 * The TLB has to be flushed for all PCIDs if any of the following
	 * (architecturally required) changes happen:
	 * - CR4.PCIDE is changed from 1 to 0
	 * - CR4.PGE is toggled
	 *
-	 * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and
-	 * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence
-	 * the usage of "else if".
+	 * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
	 */
-	if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
-		kvm_mmu_reset_context(vcpu);
-	else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE)
-		kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
-	else if ((cr4 ^ old_cr4) & X86_CR4_PGE)
+	if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
+	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
 		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+
+	/*
	 * The TLB has to be flushed for the current PCID if any of the
	 * following (architecturally required) changes happen:
	 * - CR4.SMEP is changed from 0 to 1
	 * - CR4.PAE is toggled
	 */
+	else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
+		 ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
+		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+
 }
 EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
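Summarizing the rules the rewritten kvm_post_set_cr4() now implements, as this editor reads them:

    CR4 change                         resulting action
    ----------------------------------------------------------------------
    any KVM_MMU_CR4_ROLE_BITS bit      kvm_mmu_reset_context()
    PCIDE 0 -> 1 (shadow paging only)  kvm_mmu_unload() (drop stale prev_roots)
    PGE toggled, or PCIDE 1 -> 0       KVM_REQ_TLB_FLUSH_GUEST (all PCIDs)
    PAE toggled, or SMEP 0 -> 1        KVM_REQ_TLB_FLUSH_CURRENT (current PCID)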
@@ -1166,7 +1192,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
 		if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 
-	kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+	kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
 }
 
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
@@ -1656,8 +1682,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		return r;
 	}
 
-	/* Update reserved bits */
-	if ((efer ^ old_efer) & EFER_NX)
+	if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
 		kvm_mmu_reset_context(vcpu);
 
 	return 0;
@@ -2026,17 +2051,10 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 		return 1;
 
 	if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
-		((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
-		((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
-		((u32)(data >> 32) != X2APIC_BROADCAST)) {
-
-		data &= ~(1 << 12);
-		kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
-		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
-		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
-		trace_kvm_apic_write(APIC_ICR, (u32)data);
-		return 0;
-	}
+	    ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+	    ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+	    ((u32)(data >> 32) != X2APIC_BROADCAST))
+		return kvm_x2apic_icr_write(vcpu->arch.apic, data);
 
 	return 1;
 }
@@ -2413,7 +2431,7 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
 	return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
 }
 
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
+u64 kvm_scale_tsc(u64 tsc, u64 ratio)
 {
 	u64 _tsc = tsc;
 
@@ -2428,7 +2446,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
 	u64 tsc;
 
-	tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
+	tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
 
 	return target_tsc - tsc;
 }
@@ -2436,7 +2454,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
 	return vcpu->arch.l1_tsc_offset +
-		kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
+		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
@@ -2639,7 +2657,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 {
 	if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
		WARN_ON(adjustment < 0);
-	adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
+	adjustment = kvm_scale_tsc((u64) adjustment,
				   vcpu->arch.l1_tsc_scaling_ratio);
 	adjust_tsc_offset_guest(vcpu, adjustment);
 }
@@ -3059,7 +3077,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	/* With all the info we got, fill in the values */
 
 	if (kvm_has_tsc_control)
-		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
+		tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
					    v->arch.l1_tsc_scaling_ratio);
 
 	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
@@ -3282,7 +3300,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	static_call(kvm_x86_tlb_flush_all)(vcpu);
+	static_call(kvm_x86_flush_tlb_all)(vcpu);
 }
 
 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -3300,14 +3318,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 		kvm_mmu_sync_prev_roots(vcpu);
 	}
 
-	static_call(kvm_x86_tlb_flush_guest)(vcpu);
+	static_call(kvm_x86_flush_tlb_guest)(vcpu);
 }
 
 
 static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	static_call(kvm_x86_tlb_flush_current)(vcpu);
+	static_call(kvm_x86_flush_tlb_current)(vcpu);
 }
 
 /*
@@ -3869,7 +3887,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
			ratio = vcpu->arch.tsc_scaling_ratio;
		}
 
-		msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
+		msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
		break;
	}
	case MSR_MTRRcap:
@@ -4245,6 +4263,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
	case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
	case KVM_CAP_VCPU_ATTRIBUTES:
	case KVM_CAP_SYS_ATTRIBUTES:
+	case KVM_CAP_VAPIC:
	case KVM_CAP_ENABLE_CAP:
		r = 1;
		break;
@@ -4286,9 +4305,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
		 */
		r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
		break;
-	case KVM_CAP_VAPIC:
-		r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
-		break;
	case KVM_CAP_NR_VCPUS:
		r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
		break;
@@ -4343,7 +4359,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
		if (r < sizeof(struct kvm_xsave))
			r = sizeof(struct kvm_xsave);
		break;
+	case KVM_CAP_PMU_CAPABILITY:
+		r = enable_pmu ?
 KVM_CAP_PMU_VALID_MASK : 0;
+		break;
	}
+	case KVM_CAP_DISABLE_QUIRKS2:
+		r = KVM_X86_VALID_QUIRKS;
+		break;
	default:
		break;
	}
@@ -5145,7 +5167,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
			   kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
			   kvm->arch.last_tsc_offset == offset);
 
-		tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+		tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
		ns = get_kvmclock_base_ns();
 
		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
@@ -5890,6 +5912,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
		return -EINVAL;
 
	switch (cap->cap) {
+	case KVM_CAP_DISABLE_QUIRKS2:
+		r = -EINVAL;
+		if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+			break;
+		fallthrough;
	case KVM_CAP_DISABLE_QUIRKS:
		kvm->arch.disabled_quirks = cap->args[0];
		r = 0;
@@ -5990,15 +6017,18 @@ split_irqchip_unlock:
 #endif
	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
		r = -EINVAL;
-		if (kvm_x86_ops.vm_copy_enc_context_from)
-			r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
-		return r;
+		if (!kvm_x86_ops.vm_copy_enc_context_from)
+			break;
+
+		r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+		break;
	case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
		r = -EINVAL;
-		if (kvm_x86_ops.vm_move_enc_context_from)
-			r = kvm_x86_ops.vm_move_enc_context_from(
-				kvm, cap->args[0]);
-		return r;
+		if (!kvm_x86_ops.vm_move_enc_context_from)
+			break;
+
+		r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+		break;
	case KVM_CAP_EXIT_HYPERCALL:
		if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
			r = -EINVAL;
@@ -6014,6 +6044,18 @@ split_irqchip_unlock:
		kvm->arch.exit_on_emulation_error = cap->args[0];
		r = 0;
		break;
+	case KVM_CAP_PMU_CAPABILITY:
+		r = -EINVAL;
+		if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
+			break;
+
+		mutex_lock(&kvm->lock);
+		if (!kvm->created_vcpus) {
+			kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
+			r = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		break;
	default:
		r = -EINVAL;
		break;
@@ -6473,8 +6515,10 @@ set_pit2_out:
		break;
	case KVM_MEMORY_ENCRYPT_OP: {
		r = -ENOTTY;
-		if (kvm_x86_ops.mem_enc_op)
-			r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
+		if (!kvm_x86_ops.mem_enc_ioctl)
+			goto out;
+
+		r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
		break;
	}
	case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -6485,8 +6529,10 @@ set_pit2_out:
			goto out;
 
		r = -ENOTTY;
-		if (kvm_x86_ops.mem_enc_reg_region)
-			r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
+		if (!kvm_x86_ops.mem_enc_register_region)
+			goto out;
+
+		r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
		break;
	}
	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -6497,8 +6543,10 @@ set_pit2_out:
			goto out;
 
		r = -ENOTTY;
-		if (kvm_x86_ops.mem_enc_unreg_region)
-			r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
+		if (!kvm_x86_ops.mem_enc_unregister_region)
+			goto out;
+
+		r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
		break;
	}
	case KVM_HYPERV_EVENTFD: {
@@ -8426,8 +8474,7 @@ writeback:
		kvm_rip_write(vcpu, ctxt->eip);
		if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
			r = kvm_vcpu_do_singlestep(vcpu);
-		if (kvm_x86_ops.update_emulated_instruction)
-			static_call(kvm_x86_update_emulated_instruction)(vcpu);
+		static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
		__kvm_set_rflags(vcpu, ctxt->eflags);
	}
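As with the encryption-context capabilities above, the per-VM PMU switch is driven from userspace with KVM_ENABLE_CAP, and the code only honors it before the first vCPU exists. A minimal sketch:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Create the VM with its PMU disabled.  Must run before any
     * KVM_CREATE_VCPU, otherwise KVM rejects the change (r stays -EINVAL). */
    static int vm_disable_pmu(int vm_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_PMU_CAPABILITY;
            cap.args[0] = KVM_PMU_CAP_DISABLE;

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }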
@@ -8826,6 +8873,12 @@ int kvm_arch_init(void *opaque)
		goto out;
	}
 
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+		pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
+		r = -EOPNOTSUPP;
+		goto out;
+	}
+
	r = -ENOMEM;
 
	x86_emulator_cache = kvm_alloc_emulator_cache();
@@ -8985,7 +9038,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
 *
 * @apicid - apicid of vcpu to be kicked.
 */
-static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
 {
	struct kvm_lapic_irq lapic_irq;
 
@@ -9104,7 +9157,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
			break;
 
-		kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+		kvm_pv_kick_cpu_op(vcpu->kvm, a1);
		kvm_sched_yield(vcpu, a1);
		ret = 0;
		break;
@@ -9268,10 +9321,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
	 */
	else if (!vcpu->arch.exception.pending) {
		if (vcpu->arch.nmi_injected) {
-			static_call(kvm_x86_set_nmi)(vcpu);
+			static_call(kvm_x86_inject_nmi)(vcpu);
			can_inject = false;
		} else if (vcpu->arch.interrupt.injected) {
-			static_call(kvm_x86_set_irq)(vcpu);
+			static_call(kvm_x86_inject_irq)(vcpu);
			can_inject = false;
		}
	}
@@ -9351,7 +9404,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
		if (r) {
			--vcpu->arch.nmi_pending;
			vcpu->arch.nmi_injected = true;
-			static_call(kvm_x86_set_nmi)(vcpu);
+			static_call(kvm_x86_inject_nmi)(vcpu);
			can_inject = false;
			WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
		}
@@ -9365,7 +9418,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
			goto out;
		if (r) {
			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
-			static_call(kvm_x86_set_irq)(vcpu);
+			static_call(kvm_x86_inject_irq)(vcpu);
			WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
		}
		if (kvm_cpu_has_injectable_intr(vcpu))
@@ -9693,8 +9746,7 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 
	lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
 
-	if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
-	    !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
+	if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
		return;
 
	old = new = kvm->arch.apicv_inhibit_reasons;
@@ -9727,10 +9779,12 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
	} else
		kvm->arch.apicv_inhibit_reasons = new;
 }
-EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
 
 void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 {
+	if (!enable_apicv)
+		return;
+
	down_write(&kvm->arch.apicv_update_lock);
	__kvm_request_apicv_update(kvm, activate, bit);
	up_write(&kvm->arch.apicv_update_lock);
@@ -9769,11 +9823,11 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
		bitmap_or((ulong *)eoi_exit_bitmap,
			  vcpu->arch.ioapic_handled_vectors,
			  to_hv_synic(vcpu)->vec_bitmap, 256);
-		static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+		static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
		return;
	}
 
-	static_call(kvm_x86_load_eoi_exitmap)(
+	static_call_cond(kvm_x86_load_eoi_exitmap)(
		vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
 }
 
@@ -9796,10 +9850,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
	if (!lapic_in_kernel(vcpu))
		return;
 
-	if (!kvm_x86_ops.set_apic_access_page_addr)
-		return;
-
-	static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
+	static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
 }
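static_call_cond() patches the call site into a no-op when the op is NULL, which is what allows the explicit `if (kvm_x86_ops.<op>)` guards above to be dropped. The transformation in miniature, using a hypothetical optional op `foo` purely for illustration:

    /* before: a NULL check at every call site */
    if (kvm_x86_ops.foo)
            static_call(kvm_x86_foo)(vcpu);

    /* after: the NULL case is handled by the patched-out call itself */
    static_call_cond(kvm_x86_foo)(vcpu);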
@@ -9844,8 +9895,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			goto out;
 		}
 	}
-	if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
-		kvm_mmu_unload(vcpu);
+	if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+		kvm_mmu_free_obsolete_roots(vcpu);
 	if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 		__kvm_migrate_timers(vcpu);
 	if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
@@ -9990,7 +10041,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	preempt_disable();
-	static_call(kvm_x86_prepare_guest_switch)(vcpu);
+	static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -10071,7 +10122,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		 */
 		WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
-		exit_fastpath = static_call(kvm_x86_run)(vcpu);
+		exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
 		if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
 			break;
@@ -10373,10 +10424,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	/*
-	 * Exclude PKRU from restore as restored separately in
-	 * kvm_x86_ops.run().
-	 */
+	/* Exclude PKRU, it's restored separately immediately after VM-Exit. */
 	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
 	trace_kvm_fpu(1);
 }
@@ -10566,16 +10614,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
-	struct kvm_segment cs;
-
-	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
-	*db = cs.db;
-	*l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
 static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
 	struct desc_ptr dt;
@@ -11348,15 +11386,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	static_call(kvm_x86_update_exception_bitmap)(vcpu);
 	/*
-	 * Reset the MMU context if paging was enabled prior to INIT (which is
-	 * implied if CR0.PG=1 as CR0 will be '0' prior to RESET).  Unlike the
-	 * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
-	 * checked because it is unconditionally cleared on INIT and all other
-	 * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
-	 * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
+	 * On the standard CR0/CR4/EFER modification paths, there are several
+	 * complex conditions determining whether the MMU has to be reset and/or
+	 * which PCIDs have to be flushed.  However, CR0.WP and the paging-related
+	 * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
+	 * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
+	 * CR0 will be '0' prior to RESET).  So we only need to check CR0.PG here.
 	 */
-	if (old_cr0 & X86_CR0_PG)
+	if (old_cr0 & X86_CR0_PG) {
+		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 		kvm_mmu_reset_context(vcpu);
+	}
 	/*
 	 * Intel's SDM states that all TLB entries are flushed on INIT.  AMD's
@@ -11614,6 +11654,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 	kvm->arch.guest_can_read_msr_platform_info = true;
+	kvm->arch.enable_pmu = enable_pmu;
 #if IS_ENABLED(CONFIG_HYPERV)
 	spin_lock_init(&kvm->arch.hv_root_tdp_lock);
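kvm_arch_init_vm() above now seeds a per-VM kvm->arch.enable_pmu from the module-wide enable_pmu default, and the KVM_CAP_PMU_CAPABILITY handler earlier in this diff lets userspace clear it as long as no vCPU exists yet. A hypothetical userspace sketch, assuming a 5.18-era <linux/kvm.h>:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Opt a VM out of PMU virtualization.  Must run before the first
	 * KVM_CREATE_VCPU; afterwards the handler above leaves r at
	 * -EINVAL because kvm->created_vcpus is nonzero.
	 */
	static int vm_disable_pmu(int vm_fd)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_PMU_CAPABILITY,
			.args = { KVM_PMU_CAP_DISABLE },
		};

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}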
@@ -11811,7 +11852,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
 		if (slot->arch.rmap[i])
 			continue;
-		slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+		slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
 		if (!slot->arch.rmap[i]) {
 			memslot_rmap_free(slot);
 			return -ENOMEM;
@@ -11848,7 +11889,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
 		lpages = __kvm_mmu_slot_lpages(slot, npages, level);
-		linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
+		linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
 		if (!linfo)
 			goto out_free;
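The two hunks above are the x86 side of the generic "__vcalloc for very large allocations" change noted in the merge description: rmap and lpage_info arrays scale with guest memory, so there is no point letting kvcalloc() first attempt a physically contiguous kmalloc() before falling back to vmalloc. A kernel-context sketch of the same pattern, with a hypothetical helper not taken from this diff:

	#include <linux/gfp.h>
	#include <linux/types.h>
	#include <linux/vmalloc.h>

	/* One u64 per guest page: potentially gigabytes of metadata, so go
	 * straight to vmalloc.  The calloc-style API checks n * size for
	 * overflow, and GFP_KERNEL_ACCOUNT charges the caller's memcg.
	 */
	static u64 *alloc_page_counters(unsigned long npages)
	{
		return __vcalloc(npages, sizeof(u64), GFP_KERNEL_ACCOUNT);
	}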
@@ -11998,6 +12039,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
 			return;
+		if (READ_ONCE(eager_page_split))
+			kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
+
 		if (kvm_x86_ops.cpu_dirty_log_size) {
 			kvm_mmu_slot_leaf_clear_dirty(kvm, new);
 			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
@@ -12042,8 +12086,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	return (is_guest_mode(vcpu) &&
-		kvm_x86_ops.guest_apic_has_interrupt &&
-		static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
+		static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
 }
 
 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -12296,14 +12339,28 @@ static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
 static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
 {
-	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+
+	if (!kvm_pv_async_pf_enabled(vcpu))
 		return false;
-	if (!kvm_pv_async_pf_enabled(vcpu) ||
-	    (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0))
+	if (vcpu->arch.apf.send_user_only &&
+	    static_call(kvm_x86_get_cpl)(vcpu) == 0)
 		return false;
-	return true;
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * L1 needs to opt into the special #PF vmexits that are
+		 * used to deliver async page faults.
+		 */
+		return vcpu->arch.apf.delivery_as_pf_vmexit;
+	} else {
+		/*
+		 * Play it safe in case the guest temporarily disables paging.
+		 * The real mode IDT in particular is unlikely to have a #PF
+		 * exception setup.
+		 */
+		return is_paging(vcpu);
+	}
 }
 
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -12398,7 +12455,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
 void kvm_arch_start_assignment(struct kvm *kvm)
 {
 	if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
-		static_call_cond(kvm_x86_start_assignment)(kvm);
+		static_call_cond(kvm_x86_pi_start_assignment)(kvm);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -12446,7 +12503,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
 	irqfd->producer = prod;
 	kvm_arch_start_assignment(irqfd->kvm);
-	ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
+	ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
 					 prod->irq, irqfd->gsi, 1);
 	if (ret)
@@ -12471,7 +12528,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	 * when the irq is masked/disabled or the consumer side (KVM
 	 * int this case doesn't want to receive the interrupts.
 	 */
-	ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+	ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
 	if (ret)
 		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
 		       " fails: %d\n", irqfd->consumer.token, ret);
@@ -12482,7 +12539,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 				   uint32_t guest_irq, bool set)
 {
-	return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
+	return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
 }
 
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f11d945ac41f..588792f00334 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -302,6 +302,8 @@ extern int pi_inject_timer;
 extern bool report_ignored_msrs;
+extern bool eager_page_split;
+
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
 	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 74be1fda58e3..4aa0f2b31665 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -732,7 +732,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 	instructions[0] = 0xb8;
 
 	/* vmcall / vmmcall */
-	kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+	static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
 
 	/* ret */
 	instructions[8] = 0xc3;
@@ -867,7 +867,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 	vcpu->run->exit_reason = KVM_EXIT_XEN;
 	vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
 	vcpu->run->xen.u.hcall.longmode = longmode;
-	vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+	vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
 	vcpu->run->xen.u.hcall.input = input;
 	vcpu->run->xen.u.hcall.params[0] = params[0];
 	vcpu->run->xen.u.hcall.params[1] = params[1];
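The closing xen.c hunks convert the last direct kvm_x86_ops.foo() indirect calls to static_call(). For mandatory hooks such as get_cpl and patch_hypercall, the call site becomes a patched direct call, sidestepping retpoline overhead on every hypercall. A minimal sketch of the non-conditional variant, again with illustrative names rather than real KVM symbols:

	#include <linux/static_call.h>

	static int example_get_cpl_impl(void)
	{
		return 0;	/* stand-in for the vendor implementation */
	}

	/* Mandatory hook: defined with an initial target and retargeted
	 * via static_call_update() when the vendor module loads.
	 */
	DEFINE_STATIC_CALL(example_get_cpl, example_get_cpl_impl);

	static int current_cpl(void)
	{
		/* Emitted as a direct call, not an indirect branch. */
		return static_call(example_get_cpl)();
	}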