summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/cpuid.c81
-rw-r--r--arch/x86/kvm/emulate.c119
-rw-r--r--arch/x86/kvm/hyperv.c264
-rw-r--r--arch/x86/kvm/hyperv.h6
-rw-r--r--arch/x86/kvm/i8254.c6
-rw-r--r--arch/x86/kvm/i8259.c8
-rw-r--r--arch/x86/kvm/ioapic.c6
-rw-r--r--arch/x86/kvm/kvm_emulate.h9
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c14
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h14
-rw-r--r--arch/x86/kvm/lapic.c248
-rw-r--r--arch/x86/kvm/lapic.h17
-rw-r--r--arch/x86/kvm/mmu.h76
-rw-r--r--arch/x86/kvm/mmu/mmu.c544
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c303
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h15
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h23
-rw-r--r--arch/x86/kvm/mmu/page_track.c7
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h86
-rw-r--r--arch/x86/kvm/mmu/spte.c72
-rw-r--r--arch/x86/kvm/mmu/spte.h129
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c14
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h25
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c1022
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h43
-rw-r--r--arch/x86/kvm/pmu.c23
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/avic.c286
-rw-r--r--arch/x86/kvm/svm/hyperv.h35
-rw-r--r--arch/x86/kvm/svm/nested.c77
-rw-r--r--arch/x86/kvm/svm/pmu.c11
-rw-r--r--arch/x86/kvm/svm/sev.c121
-rw-r--r--arch/x86/kvm/svm/svm.c281
-rw-r--r--arch/x86/kvm/svm/svm.h75
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c1
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h25
-rw-r--r--arch/x86/kvm/trace.h42
-rw-r--r--arch/x86/kvm/vmx/nested.c32
-rw-r--r--arch/x86/kvm/vmx/nested.h3
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c23
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c6
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h4
-rw-r--r--arch/x86/kvm/vmx/vmx.c192
-rw-r--r--arch/x86/kvm/vmx/vmx.h5
-rw-r--r--arch/x86/kvm/x86.c519
-rw-r--r--arch/x86/kvm/x86.h54
-rw-r--r--arch/x86/kvm/xen.c108
48 files changed, 2884 insertions, 2199 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2b1548da00eb..e3cbd7706136 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -126,13 +126,6 @@ config KVM_XEN
If in doubt, say "N".
-config KVM_MMU_AUDIT
- bool "Audit KVM MMU"
- depends on KVM && TRACEPOINTS
- help
- This option adds a R/W kVM module parameter 'mmu_audit', which allows
- auditing of KVM MMU events at runtime.
-
config KVM_EXTERNAL_WRITE_TRACKING
bool
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 28be02adc669..b24ca7f4ed7c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -282,6 +282,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
+ u64 guest_supported_xcr0;
best = kvm_find_cpuid_entry(vcpu, 1, 0);
if (best && apic) {
@@ -293,9 +294,11 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_apic_set_version(vcpu);
}
- vcpu->arch.guest_supported_xcr0 =
+ guest_supported_xcr0 =
cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
+ vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0;
+
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -554,12 +557,13 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_7_0_EBX,
- F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
- F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
- F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
- F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
- F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
- );
+ F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
+ F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
+ F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
+ F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
+ F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
+ F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
+ F(AVX512VL));
kvm_cpu_cap_mask(CPUID_7_ECX,
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
@@ -711,9 +715,31 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
entry = &array->entries[array->nent++];
+ memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
- entry->flags = 0;
+ switch (function & 0xC0000000) {
+ case 0x40000000:
+ /* Hypervisor leaves are always synthesized by __do_cpuid_func. */
+ return entry;
+
+ case 0x80000000:
+ /*
+ * 0x80000021 is sometimes synthesized by __do_cpuid_func, which
+ * would result in out-of-bounds calls to do_host_cpuid.
+ */
+ {
+ static int max_cpuid_80000000;
+ if (!READ_ONCE(max_cpuid_80000000))
+ WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000));
+ if (function > READ_ONCE(max_cpuid_80000000))
+ return entry;
+ }
+ break;
+
+ default:
+ break;
+ }
cpuid_count(entry->function, entry->index,
&entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
@@ -875,7 +901,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
eax.split.bit_width = cap.bit_width_gp;
eax.split.mask_length = cap.events_mask_len;
- edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
+ edx.split.num_counters_fixed =
+ min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED);
edx.split.bit_width_fixed = cap.bit_width_fixed;
if (cap.version)
edx.split.anythread_deprecated = 1;
@@ -1056,7 +1083,15 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = 0;
break;
case 0x80000000:
- entry->eax = min(entry->eax, 0x8000001f);
+ entry->eax = min(entry->eax, 0x80000021);
+ /*
+ * Serializing LFENCE is reported in a multitude of ways,
+ * and NullSegClearsBase is not reported in CPUID on Zen2;
+ * help userspace by providing the CPUID leaf ourselves.
+ */
+ if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+ || !static_cpu_has_bug(X86_BUG_NULL_SEG))
+ entry->eax = max(entry->eax, 0x80000021);
break;
case 0x80000001:
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
@@ -1127,6 +1162,27 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx &= ~GENMASK(11, 6);
}
break;
+ case 0x80000020:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x80000021:
+ entry->ebx = entry->ecx = entry->edx = 0;
+ /*
+ * Pass down these bits:
+ * EAX 0 NNDBP, Processor ignores nested data breakpoints
+ * EAX 2 LAS, LFENCE always serializing
+ * EAX 6 NSCB, Null selector clear base
+ *
+ * Other defined bits are for MSRs that KVM does not expose:
+ * EAX 3 SPCL, SMM page configuration lock
+ * EAX 13 PCMSR, Prefetch control MSR
+ */
+ entry->eax &= BIT(0) | BIT(2) | BIT(6);
+ if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+ entry->eax |= BIT(2);
+ if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+ entry->eax |= BIT(6);
+ break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
/*Just support up to 0xC0000004 now*/
@@ -1236,8 +1292,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
if (sanity_check_entries(entries, cpuid->nent, type))
return -EINVAL;
- array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
- cpuid->nent));
+ array.entries = kvcalloc(sizeof(struct kvm_cpuid_entry2), cpuid->nent, GFP_KERNEL);
if (!array.entries)
return -ENOMEM;
@@ -1255,7 +1310,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
r = -EFAULT;
out_free:
- vfree(array.entries);
+ kvfree(array.entries);
return r;
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5719d8cfdbd9..89b11e7dca8a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -24,6 +24,7 @@
#include <linux/stringify.h>
#include <asm/debugreg.h>
#include <asm/nospec-branch.h>
+#include <asm/ibt.h>
#include "x86.h"
#include "tss.h"
@@ -189,7 +190,7 @@
#define X16(x...) X8(x), X8(x)
#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
-#define FASTOP_SIZE 8
+#define FASTOP_SIZE (8 * (1 + HAS_KERNEL_IBT))
struct opcode {
u64 flags;
@@ -311,7 +312,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
#define __FOP_FUNC(name) \
".align " __stringify(FASTOP_SIZE) " \n\t" \
".type " name ", @function \n\t" \
- name ":\n\t"
+ name ":\n\t" \
+ ASM_ENDBR
#define FOP_FUNC(name) \
__FOP_FUNC(#name)
@@ -429,10 +431,27 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
FOP_END
/* Special case for SETcc - 1 instruction per cc */
+
+/*
+ * Depending on .config the SETcc functions look like:
+ *
+ * ENDBR [4 bytes; CONFIG_X86_KERNEL_IBT]
+ * SETcc %al [3 bytes]
+ * RET [1 byte]
+ * INT3 [1 byte; CONFIG_SLS]
+ *
+ * Which gives possible sizes 4, 5, 8 or 9. When rounded up to the
+ * next power-of-two alignment they become 4, 8 or 16 resp.
+ */
+#define SETCC_LENGTH (ENDBR_INSN_SIZE + 4 + IS_ENABLED(CONFIG_SLS))
+#define SETCC_ALIGN (4 << IS_ENABLED(CONFIG_SLS) << HAS_KERNEL_IBT)
+static_assert(SETCC_LENGTH <= SETCC_ALIGN);
+
#define FOP_SETCC(op) \
- ".align 4 \n\t" \
+ ".align " __stringify(SETCC_ALIGN) " \n\t" \
".type " #op ", @function \n\t" \
#op ": \n\t" \
+ ASM_ENDBR \
#op " %al \n\t" \
__FOP_RET(#op)
@@ -665,7 +684,7 @@ static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
static inline bool emul_is_noncanonical_address(u64 la,
struct x86_emulate_ctxt *ctxt)
{
- return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
+ return !__is_canonical_address(la, ctxt_virt_addr_bits(ctxt));
}
/*
@@ -715,7 +734,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
case X86EMUL_MODE_PROT64:
*linear = la;
va_bits = ctxt_virt_addr_bits(ctxt);
- if (get_canonical(la, va_bits) != la)
+ if (!__is_canonical_address(la, va_bits))
goto bad;
*max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
@@ -1047,7 +1066,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
- void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+ void (*fop)(void) = (void *)em_setcc + SETCC_ALIGN * (condition & 0xf);
flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
asm("push %[flags]; popf; " CALL_NOSPEC
@@ -1608,11 +1627,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
}
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
dpl = seg_desc.dpl;
switch (seg) {
@@ -1628,14 +1642,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (!(seg_desc.type & 8))
goto exception;
- if (seg_desc.type & 4) {
- /* conforming */
- if (dpl > cpl)
- goto exception;
- } else {
- /* nonconforming */
- if (rpl > cpl || dpl != cpl)
+ if (transfer == X86_TRANSFER_RET) {
+ /* RET can never return to an inner privilege level. */
+ if (rpl < cpl)
goto exception;
+ /* Outer-privilege level return is not implemented */
+ if (rpl > cpl)
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) {
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > rpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (dpl != rpl)
+ goto exception;
+ }
+ } else { /* X86_TRANSFER_CALL_JMP */
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
}
/* in long-mode d/b must be clear if l is set */
if (seg_desc.d && seg_desc.l) {
@@ -1652,6 +1686,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
+ if (!seg_desc.p) {
+ err_vec = NP_VECTOR;
+ goto exception;
+ }
old_desc = seg_desc;
seg_desc.type |= 2; /* busy */
ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
@@ -1676,6 +1714,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
break;
}
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
if (seg_desc.s) {
/* mark segment as accessed */
if (!(seg_desc.type & 1)) {
@@ -2201,9 +2244,6 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- /* Outer-privilege level return is not implemented */
- if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
- return X86EMUL_UNHANDLEABLE;
rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
X86_TRANSFER_RET,
&new_desc);
@@ -2600,8 +2640,7 @@ emulate_shutdown:
}
static void
-setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct desc_struct *cs, struct desc_struct *ss)
+setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
{
cs->l = 0; /* will be adjusted later */
set_desc_base(cs, 0); /* flat segment */
@@ -2690,7 +2729,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
if (!(efer & EFER_SCE))
return emulate_ud(ctxt);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
cs_sel = (u16)(msr_data & 0xfffc);
@@ -2758,7 +2797,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
ss_sel = cs_sel + 8;
@@ -2795,7 +2834,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
if ((ctxt->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
@@ -3013,8 +3052,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static int task_switch_16(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_16 tss_seg;
@@ -3152,8 +3190,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
return ret;
}
-static int task_switch_32(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_32 tss_seg;
@@ -3261,10 +3298,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
old_tss_sel = 0xffff;
if (next_tss_desc.type & 8)
- ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
+ ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc);
else
- ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
+ ret = task_switch_16(ctxt, old_tss_sel,
old_tss_base, &next_tss_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -3504,8 +3540,10 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
{
u64 tsc_aux = 0;
- if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+ if (!ctxt->ops->guest_has_rdpid(ctxt))
return emulate_ud(ctxt);
+
+ ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}
@@ -3606,7 +3644,7 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
- r = ctxt->ops->set_msr(ctxt, msr_index, msr_data);
+ r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
if (r == X86EMUL_IO_NEEDED)
return r;
@@ -3623,7 +3661,7 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
int r;
- r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data);
+ r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
if (r == X86EMUL_IO_NEEDED)
return r;
@@ -5380,8 +5418,13 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
void init_decode_cache(struct x86_emulate_ctxt *ctxt)
{
- memset(&ctxt->rip_relative, 0,
- (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+ /* Clear fields that are set conditionally but read without a guard. */
+ ctxt->rip_relative = false;
+ ctxt->rex_prefix = 0;
+ ctxt->lock_prefix = 0;
+ ctxt->rep_prefix = 0;
+ ctxt->regs_valid = 0;
+ ctxt->regs_dirty = 0;
ctxt->io_read.pos = 0;
ctxt->io_read.end = 0;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 6e38a7d22e97..123b677111c5 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -112,6 +112,9 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
if (!!auto_eoi_old == !!auto_eoi_new)
return;
+ if (!enable_apicv)
+ return;
+
down_write(&vcpu->kvm->arch.apicv_update_lock);
if (auto_eoi_new)
@@ -119,9 +122,13 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
else
hv->synic_auto_eoi_used--;
- __kvm_request_apicv_update(vcpu->kvm,
- !hv->synic_auto_eoi_used,
- APICV_INHIBIT_REASON_HYPERV);
+ /*
+ * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on
+ * the hypervisor to manually inject IRQs.
+ */
+ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm,
+ APICV_INHIBIT_REASON_HYPERV,
+ !!hv->synic_auto_eoi_used);
up_write(&vcpu->kvm->arch.apicv_update_lock);
}
@@ -236,7 +243,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
int ret;
- if (!synic->active && !host)
+ if (!synic->active && (!host || data))
return 1;
trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
@@ -282,6 +289,9 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
case HV_X64_MSR_EOM: {
int i;
+ if (!synic->active)
+ break;
+
for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
kvm_hv_notify_acked_sint(vcpu, i);
break;
@@ -446,6 +456,9 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
struct kvm_lapic_irq irq;
int ret, vector;
+ if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+ return -EINVAL;
+
if (sint >= ARRAY_SIZE(synic->sint))
return -EINVAL;
@@ -658,7 +671,7 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
- if (!synic->active && !host)
+ if (!synic->active && (!host || config))
return 1;
if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode &&
@@ -687,7 +700,7 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
- if (!synic->active && !host)
+ if (!synic->active && (!host || count))
return 1;
trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
@@ -1710,32 +1723,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
return kvm_hv_get_msr(vcpu, msr, pdata, host);
}
-static __always_inline unsigned long *sparse_set_to_vcpu_mask(
- struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
- u64 *vp_bitmap, unsigned long *vcpu_bitmap)
+static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
+ u64 valid_bank_mask, unsigned long *vcpu_mask)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
+ bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes);
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
struct kvm_vcpu *vcpu;
int bank, sbank = 0;
unsigned long i;
+ u64 *bitmap;
+
+ BUILD_BUG_ON(sizeof(vp_bitmap) >
+ sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS));
+
+ /*
+ * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else
+ * fill a temporary buffer and manually test each vCPU's VP index.
+ */
+ if (likely(!has_mismatch))
+ bitmap = (u64 *)vcpu_mask;
+ else
+ bitmap = vp_bitmap;
- memset(vp_bitmap, 0,
- KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
+ /*
+ * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask
+ * having a '1' for each bank that exists in sparse_banks. Sets must
+ * be in ascending order, i.e. bank0..bankN.
+ */
+ memset(bitmap, 0, sizeof(vp_bitmap));
for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
- vp_bitmap[bank] = sparse_banks[sbank++];
+ bitmap[bank] = sparse_banks[sbank++];
- if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
- /* for all vcpus vp_index == vcpu_idx */
- return (unsigned long *)vp_bitmap;
- }
+ if (likely(!has_mismatch))
+ return;
- bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
kvm_for_each_vcpu(i, vcpu, kvm) {
if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
- __set_bit(i, vcpu_bitmap);
+ __set_bit(i, vcpu_mask);
}
- return vcpu_bitmap;
}
struct kvm_hv_hcall {
@@ -1743,6 +1771,7 @@ struct kvm_hv_hcall {
u64 ingpa;
u64 outgpa;
u16 code;
+ u16 var_cnt;
u16 rep_cnt;
u16 rep_idx;
bool fast;
@@ -1750,22 +1779,60 @@ struct kvm_hv_hcall {
sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
};
-static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex)
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ int consumed_xmm_halves,
+ u64 *sparse_banks, gpa_t offset)
{
+ u16 var_cnt;
int i;
- gpa_t gpa;
+
+ if (hc->var_cnt > 64)
+ return -EINVAL;
+
+ /* Ignore banks that cannot possibly contain a legal VP index. */
+ var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS);
+
+ if (hc->fast) {
+ /*
+ * Each XMM holds two sparse banks, but do not count halves that
+ * have already been consumed for hypercall parameters.
+ */
+ if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ for (i = 0; i < var_cnt; i++) {
+ int j = i + consumed_xmm_halves;
+ if (j % 2)
+ sparse_banks[i] = sse128_hi(hc->xmm[j / 2]);
+ else
+ sparse_banks[i] = sse128_lo(hc->xmm[j / 2]);
+ }
+ return 0;
+ }
+
+ return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks,
+ var_cnt * sizeof(*sparse_banks));
+}
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
struct kvm *kvm = vcpu->kvm;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
- u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
- unsigned long *vcpu_mask;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
u64 valid_bank_mask;
- u64 sparse_banks[64];
- int sparse_banks_len;
+ u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
bool all_cpus;
- if (!ex) {
+ /*
+ * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
+ * valid mask is a u64. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) would exceed this limit, KVM will additional changes
+ * for Hyper-V support to avoid setting the guest up to fail.
+ */
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64);
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
if (hc->fast) {
flush.address_space = hc->ingpa;
flush.flags = hc->outgpa;
@@ -1812,30 +1879,22 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = flush_ex.hv_vp_set.format !=
HV_GENERIC_SET_SPARSE_4K;
- sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64);
+ if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ goto do_flush;
- if (!sparse_banks_len && !all_cpus)
+ if (!hc->var_cnt)
goto ret_success;
- if (!all_cpus) {
- if (hc->fast) {
- if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1)
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
- for (i = 0; i < sparse_banks_len; i += 2) {
- sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]);
- sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]);
- }
- } else {
- gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex,
- hv_vp_set.bank_contents);
- if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks,
- sparse_banks_len *
- sizeof(sparse_banks[0]))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
- }
- }
+ if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks,
+ offsetof(struct hv_tlb_flush_ex,
+ hv_vp_set.bank_contents)))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+do_flush:
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
@@ -1843,11 +1902,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
if (all_cpus) {
kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
} else {
- vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
- vcpu_mask);
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask);
}
ret_success:
@@ -1875,21 +1932,18 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
}
}
-static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex)
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
- u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
- unsigned long *vcpu_mask;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
unsigned long valid_bank_mask;
- u64 sparse_banks[64];
- int sparse_banks_len;
+ u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
u32 vector;
bool all_cpus;
- if (!ex) {
+ if (hc->code == HVCALL_SEND_IPI) {
if (!hc->fast) {
if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
sizeof(send_ipi))))
@@ -1908,9 +1962,15 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
} else {
- if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
- sizeof(send_ipi_ex))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
+ sizeof(send_ipi_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ } else {
+ send_ipi_ex.vector = (u32)hc->ingpa;
+ send_ipi_ex.vp_set.format = hc->outgpa;
+ send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]);
+ }
trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
send_ipi_ex.vp_set.format,
@@ -1918,22 +1978,20 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
vector = send_ipi_ex.vector;
valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
- sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
- sizeof(sparse_banks[0]);
-
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
if (all_cpus)
goto check_and_send_ipi;
- if (!sparse_banks_len)
+ if (!hc->var_cnt)
goto ret_success;
- if (kvm_read_guest(kvm,
- hc->ingpa + offsetof(struct hv_send_ipi_ex,
- vp_set.bank_contents),
- sparse_banks,
- sparse_banks_len))
+ if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks,
+ offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents)))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
@@ -1941,11 +1999,13 @@ check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- vcpu_mask = all_cpus ? NULL :
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
+ if (all_cpus) {
+ kvm_send_ipi_to_many(kvm, vector, NULL);
+ } else {
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+ kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+ }
ret_success:
return HV_STATUS_SUCCESS;
@@ -2017,11 +2077,6 @@ int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
return ret;
}
-bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
-}
-
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
bool longmode;
@@ -2096,6 +2151,7 @@ static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc)
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ case HVCALL_SEND_IPI_EX:
return true;
}
@@ -2191,19 +2247,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
hc.code = hc.param & 0xffff;
+ hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET;
hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT);
hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
hc.rep = !!(hc.rep_cnt || hc.rep_idx);
- trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx,
- hc.ingpa, hc.outgpa);
+ trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt,
+ hc.rep_idx, hc.ingpa, hc.outgpa);
if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) {
ret = HV_STATUS_ACCESS_DENIED;
goto hypercall_complete;
}
+ if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ goto hypercall_complete;
+ }
+
if (hc.fast && is_xmm_fast_hypercall(&hc)) {
if (unlikely(hv_vcpu->enforce_cpuid &&
!(hv_vcpu->cpuid_cache.features_edx &
@@ -2217,14 +2279,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
switch (hc.code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
kvm_vcpu_on_spin(vcpu, true);
break;
case HVCALL_SIGNAL_EVENT:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -2234,7 +2296,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
fallthrough; /* maybe userspace knows this conn_id */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) {
+ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -2247,46 +2309,43 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
kvm_hv_hypercall_complete_userspace;
return 0;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
- if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, &hc, false);
- break;
- case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
- if (unlikely(hc.rep)) {
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, &hc, false);
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
break;
- case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
- if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, &hc, true);
- break;
+ fallthrough;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, &hc, true);
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
break;
case HVCALL_SEND_IPI:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_send_ipi(vcpu, &hc, false);
- break;
+ fallthrough;
case HVCALL_SEND_IPI_EX:
- if (unlikely(hc.fast || hc.rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_send_ipi(vcpu, &hc, true);
+ ret = kvm_hv_send_ipi(vcpu, &hc);
break;
case HVCALL_POST_DEBUG_DATA:
case HVCALL_RETRIEVE_DEBUG_DATA:
@@ -2417,10 +2476,6 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
if (kvm_x86_ops.nested_ops->get_evmcs_version)
evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
- /* Skip NESTED_FEATURES if eVMCS is not supported */
- if (!evmcs_ver)
- --nent;
-
if (cpuid->nent < nent)
return -E2BIG;
@@ -2520,8 +2575,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
- if (evmcs_ver)
- ent->eax |= HV_X64_NESTED_MSR_BITMAP;
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index ed1c4e546d04..e19c00ee9ab3 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -89,7 +89,11 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
-bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu);
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0b65a764ed3a..1c83076091af 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -305,15 +305,13 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
* So, deactivate APICv when PIT is in reinject mode.
*/
if (reinject) {
- kvm_request_apicv_update(kvm, false,
- APICV_INHIBIT_REASON_PIT_REINJ);
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
/* The initial state is preserved while ps->reinject == 0. */
kvm_pit_reset_reinject(pit);
kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
} else {
- kvm_request_apicv_update(kvm, true,
- APICV_INHIBIT_REASON_PIT_REINJ);
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 814064d06016..be99dc86293d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -437,13 +437,13 @@ static u32 pic_ioport_read(void *opaque, u32 addr)
return ret;
}
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+static void elcr_ioport_write(void *opaque, u32 val)
{
struct kvm_kpic_state *s = opaque;
s->elcr = val & s->elcr_mask;
}
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
+static u32 elcr_ioport_read(void *opaque)
{
struct kvm_kpic_state *s = opaque;
return s->elcr;
@@ -474,7 +474,7 @@ static int picdev_write(struct kvm_pic *s,
case 0x4d0:
case 0x4d1:
pic_lock(s);
- elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ elcr_ioport_write(&s->pics[addr & 1], data);
pic_unlock(s);
break;
default:
@@ -505,7 +505,7 @@ static int picdev_read(struct kvm_pic *s,
case 0x4d0:
case 0x4d1:
pic_lock(s);
- *data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ *data = elcr_ioport_read(&s->pics[addr & 1]);
pic_unlock(s);
break;
default:
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index decfa36b7891..765943d7cfa5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -54,9 +54,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
int trigger_mode,
int pin);
-static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
- unsigned long addr,
- unsigned long length)
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic)
{
unsigned long result = 0;
@@ -593,7 +591,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
break;
case IOAPIC_REG_WINDOW:
- result = ioapic_read_indirect(ioapic, addr, len);
+ result = ioapic_read_indirect(ioapic);
break;
default:
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 39eded2426ff..8dff25d267b7 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -210,6 +210,8 @@ struct x86_emulate_ops {
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+ int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
+ int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
@@ -226,6 +228,7 @@ struct x86_emulate_ops {
bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
@@ -336,11 +339,7 @@ struct x86_emulate_ctxt {
fastop_t fop;
};
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
- /*
- * The following six fields are cleared together,
- * the rest are initialized unconditionally in x86_decode_insn
- * or elsewhere
- */
+
bool rip_relative;
u8 rex_prefix;
u8 lock_prefix;
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index b469f45e3fe4..ee4f696a0782 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -92,3 +92,17 @@ int hv_remote_flush_tlb(struct kvm *kvm)
return hv_remote_flush_tlb_with_range(kvm, NULL);
}
EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
+
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+ struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
+
+ if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+ vcpu->arch.hv_root_tdp = root_tdp;
+ if (root_tdp != kvm_arch->hv_root_tdp)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_track_root_tdp);
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index 1c67abf2eba9..287e98ef9df3 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -10,19 +10,7 @@
int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range);
int hv_remote_flush_tlb(struct kvm *kvm);
-
-static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
-{
- struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
-
- if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
- spin_lock(&kvm_arch->hv_root_tdp_lock);
- vcpu->arch.hv_root_tdp = root_tdp;
- if (root_tdp != kvm_arch->hv_root_tdp)
- kvm_arch->hv_root_tdp = INVALID_PAGE;
- spin_unlock(&kvm_arch->hv_root_tdp_lock);
- }
-}
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4662469240bc..66b0eb0bda94 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -68,6 +68,39 @@ static bool lapic_timer_advance_dynamic __read_mostly;
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
+{
+ *((u32 *) (regs + reg_off)) = val;
+}
+
+static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+ __kvm_lapic_set_reg(apic->regs, reg_off, val);
+}
+
+static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ return *((u64 *) (regs + reg));
+}
+
+static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
+{
+ return __kvm_lapic_get_reg64(apic->regs, reg);
+}
+
+static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ *((u64 *) (regs + reg)) = val;
+}
+
+static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
+ int reg, u64 val)
+{
+ __kvm_lapic_set_reg64(apic->regs, reg, val);
+}
+
static inline int apic_test_vector(int vec, void *bitmap)
{
return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -113,7 +146,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}
bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
@@ -491,8 +525,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(vcpu->arch.apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- static_call(kvm_x86_hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -522,7 +555,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(vcpu->arch.apicv_active))
- static_call(kvm_x86_hwapic_isr_update)(vcpu, vec);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -570,8 +603,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(vcpu->arch.apicv_active))
- static_call(kvm_x86_hwapic_isr_update)(vcpu,
- apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -992,6 +1024,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
*r = -1;
if (irq->shorthand == APIC_DEST_SELF) {
+ if (KVM_BUG_ON(!src, kvm)) {
+ *r = 0;
+ return true;
+ }
*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
return true;
}
@@ -1096,14 +1132,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) {
- kvm_lapic_set_irr(vector, apic);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_vcpu_kick(vcpu);
- } else {
- trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector);
- }
+ static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode,
+ trig_mode, vector);
break;
case APIC_DM_REMRD:
@@ -1282,6 +1312,9 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
{
struct kvm_lapic_irq irq;
+ /* KVM has no delay and should always clear the BUSY/PENDING flag. */
+ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
+
irq.vector = icr_low & APIC_VECTOR_MASK;
irq.delivery_mode = icr_low & APIC_MODE_MASK;
irq.dest_mode = icr_low & APIC_DEST_MASK;
@@ -1298,6 +1331,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
+EXPORT_SYMBOL_GPL(kvm_apic_send_ipi);
static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
@@ -1381,8 +1415,8 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
#define APIC_REGS_MASK(first, count) \
(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
-int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data)
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
{
unsigned char alignment = offset & 0xf;
u32 result;
@@ -1400,7 +1434,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
APIC_REG_MASK(APIC_ESR) |
APIC_REG_MASK(APIC_ICR) |
- APIC_REG_MASK(APIC_ICR2) |
APIC_REG_MASK(APIC_LVTT) |
APIC_REG_MASK(APIC_LVTTHMR) |
APIC_REG_MASK(APIC_LVTPC) |
@@ -1411,9 +1444,16 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
APIC_REG_MASK(APIC_TMCCT) |
APIC_REG_MASK(APIC_TDCR);
- /* ARBPRI is not valid on x2APIC */
+ /*
+ * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR
+ * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be
+ * manually handled by the caller.
+ */
if (!apic_x2apic_mode(apic))
- valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
+ valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_ICR2);
+ else
+ WARN_ON_ONCE(offset == APIC_ICR);
if (alignment + len > 4)
return 1;
@@ -1438,7 +1478,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reg_read);
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
{
@@ -1999,7 +2038,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
-int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
@@ -2058,16 +2097,18 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
}
case APIC_ICR:
+ WARN_ON_ONCE(apic_x2apic_mode(apic));
+
/* No delay here, so we always clear the pending bit */
- val &= ~(1 << 12);
+ val &= ~APIC_ICR_BUSY;
kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
kvm_lapic_set_reg(apic, APIC_ICR, val);
break;
-
case APIC_ICR2:
- if (!apic_x2apic_mode(apic))
- val &= 0xff000000;
- kvm_lapic_set_reg(apic, APIC_ICR2, val);
+ if (apic_x2apic_mode(apic))
+ ret = 1;
+ else
+ kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000);
break;
case APIC_LVT0:
@@ -2127,10 +2168,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic)) {
- kvm_lapic_reg_write(apic, APIC_ICR,
- APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
- } else
+ if (apic_x2apic_mode(apic))
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
+ else
ret = 1;
break;
default:
@@ -2138,11 +2178,15 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
}
+ /*
+ * Recalculate APIC maps if necessary, e.g. if the software enable bit
+ * was toggled, the APIC ID changed, etc... The maps are marked dirty
+ * on relevant changes, i.e. this is a nop for most writes.
+ */
kvm_recalculate_apic_map(apic->vcpu->kvm);
return ret;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
gpa_t address, int len, const void *data)
@@ -2186,12 +2230,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
/* emulate APIC access in a trap manner */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
- u32 val = 0;
-
- /* hw has done the conditional check and inst decode */
- offset &= 0xff0;
-
- kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val);
+ u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset);
/* TODO: optimize to just emulate side effect w/o one more write */
kvm_lapic_reg_write(vcpu->arch.apic, offset, val);
@@ -2248,10 +2287,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4));
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
@@ -2293,7 +2329,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
- static_call(kvm_x86_set_virtual_apic_mode)(vcpu);
+ static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -2312,7 +2348,12 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
apic->irr_pending = true;
apic->isr_count = 1;
} else {
- apic->irr_pending = (apic_search_irr(apic) != -1);
+ /*
+ * Don't clear irr_pending, searching the IRR can race with
+ * updates from the CPU as APICv is still active from hardware's
+ * perspective. The flag will be cleared as appropriate when
+ * KVM injects the interrupt.
+ */
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
}
@@ -2357,8 +2398,12 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!apic_x2apic_mode(apic))
kvm_apic_set_ldr(apic, 0);
kvm_lapic_set_reg(apic, APIC_ESR, 0);
- kvm_lapic_set_reg(apic, APIC_ICR, 0);
- kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ if (!apic_x2apic_mode(apic)) {
+ kvm_lapic_set_reg(apic, APIC_ICR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, 0);
+ }
kvm_lapic_set_reg(apic, APIC_TDCR, 0);
kvm_lapic_set_reg(apic, APIC_TMICT, 0);
for (i = 0; i < 8; i++) {
@@ -2374,9 +2419,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
- static_call(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call(kvm_x86_hwapic_isr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2575,6 +2620,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
if (apic_x2apic_mode(vcpu->arch.apic)) {
u32 *id = (u32 *)(s->regs + APIC_ID);
u32 *ldr = (u32 *)(s->regs + APIC_LDR);
+ u64 icr;
if (vcpu->kvm->arch.x2apic_format) {
if (*id != vcpu->vcpu_id)
@@ -2586,9 +2632,21 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
*id <<= 24;
}
- /* In x2APIC mode, the LDR is fixed and based on the id */
- if (set)
+ /*
+ * In x2APIC mode, the LDR is fixed and based on the id. And
+ * ICR is internally a single 64-bit register, but needs to be
+ * split to ICR+ICR2 in userspace for backwards compatibility.
+ */
+ if (set) {
*ldr = kvm_apic_calc_x2apic_ldr(*id);
+
+ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
}
return 0;
@@ -2639,11 +2697,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- static_call(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call(kvm_x86_hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
- static_call(kvm_x86_hwapic_isr_update)(vcpu,
- apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
@@ -2780,73 +2836,85 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
return 0;
}
-int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4;
+ data &= ~APIC_ICR_BUSY;
- if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
- return 1;
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+}
+
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
+{
+ u32 low;
- if (reg == APIC_ICR2)
+ if (reg == APIC_ICR) {
+ *data = kvm_lapic_get_reg64(apic, APIC_ICR);
+ return 0;
+ }
+
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
return 1;
- /* if this is ICR write vector before command */
+ *data = low;
+
+ return 0;
+}
+
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
+{
+ /*
+ * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and
+ * can be written as such, all other registers remain accessible only
+ * through 32-bit reads/writes.
+ */
if (reg == APIC_ICR)
- kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_x2apic_icr_write(apic, data);
+
return kvm_lapic_reg_write(apic, reg, (u32)data);
}
-int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_DFR || reg == APIC_ICR2)
- return 1;
+ return kvm_lapic_msr_write(apic, reg, data);
+}
- if (kvm_lapic_reg_read(apic, reg, 4, &low))
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_ICR)
- kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
- *data = (((u64)high) << 32) | low;
+ if (reg == APIC_DFR)
+ return 1;
- return 0;
+ return kvm_lapic_msr_read(apic, reg, data);
}
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
if (!lapic_in_kernel(vcpu))
return 1;
- /* if this is ICR write vector before command */
- if (reg == APIC_ICR)
- kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
- return kvm_lapic_reg_write(apic, reg, (u32)data);
+ return kvm_lapic_msr_write(vcpu->arch.apic, reg, data);
}
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 low, high = 0;
-
if (!lapic_in_kernel(vcpu))
return 1;
- if (kvm_lapic_reg_read(apic, reg, 4, &low))
- return 1;
- if (reg == APIC_ICR)
- kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
-
- *data = (((u64)high) << 32) | low;
-
- return 0;
+ return kvm_lapic_msr_read(vcpu->arch.apic, reg, data);
}
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
@@ -2934,7 +3002,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+ static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2b44e533fc8d..4e4f8a22754f 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -85,9 +85,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
-int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
-int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data);
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
@@ -121,6 +118,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data);
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
@@ -153,19 +151,14 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
apic->irr_pending = true;
}
-static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
+static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off)
{
- return *((u32 *) (apic->regs + reg_off));
+ return *((u32 *) (regs + reg_off));
}
-static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
-{
- *((u32 *) (regs + reg_off)) = val;
-}
-
-static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
{
- __kvm_lapic_set_reg(apic->regs, reg_off, val);
+ return __kvm_lapic_get_reg(apic->regs, reg_off);
}
DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e9fbb2c8bbe2..e6cae6f22683 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,7 @@
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
static __always_inline u64 rsvd_bits(int s, int e)
{
@@ -79,12 +80,13 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
+ if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
@@ -106,7 +108,7 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
{
- u64 root_hpa = vcpu->arch.mmu->root_hpa;
+ u64 root_hpa = vcpu->arch.mmu->root.hpa;
if (!VALID_PAGE(root_hpa))
return;
@@ -203,44 +205,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
/*
- * Currently, we have two sorts of write-protection, a) the first one
- * write-protects guest page to sync the guest modification, b) another one is
- * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
- * between these two sorts are:
- * 1) the first case clears MMU-writable bit.
- * 2) the first case requires flushing tlb immediately avoiding corrupting
- * shadow page table between all vcpus so it should be in the protection of
- * mmu-lock. And the another case does not need to flush tlb until returning
- * the dirty bitmap to userspace since it only write-protects the page
- * logged in the bitmap, that means the page in the dirty bitmap is not
- * missed, so it can flush tlb out of mmu-lock.
- *
- * So, there is the problem: the first case can meet the corrupted tlb caused
- * by another case which write-protects pages but without flush tlb
- * immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose MMU-writable bit
- * is set, it works since another case never touches MMU-writable bit.
- *
- * Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with MMU-writable becomes
- * readonly, if that happens, we need to flush tlb. Fortunately,
- * mmu_spte_update() has already handled it perfectly.
- *
- * The rules to use MMU-writable and PT_WRITABLE_MASK:
- * - if we want to see if it has writable tlb entry or if the spte can be
- * writable on the mmu mapping, check MMU-writable, this is the most
- * case, otherwise
- * - if we fix page fault on the spte or do write-protection by dirty logging,
- * check PT_WRITABLE_MASK.
- *
- * TODO: introduce APIs to split these two cases.
- */
-static inline bool is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
* access rights (in ACC_* format).
@@ -250,27 +214,27 @@ static inline bool is_writable_pte(unsigned long pte)
*/
static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned pte_access, unsigned pte_pkey,
- unsigned pfec)
+ u64 access)
{
- int cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ /* strip nested paging fault error codes */
+ unsigned int pfec = access;
unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
/*
- * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
+ * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
+ * For implicit supervisor accesses, SMAP cannot be overridden.
*
- * If CPL = 3, SMAP applies to all supervisor-mode data accesses
- * (these are implicit supervisor accesses) regardless of the value
- * of EFLAGS.AC.
+ * SMAP works on supervisor accesses only, and not_smap can
+ * be set or not set when user access with neither has any bearing
+ * on the result.
*
- * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving
- * the result in X86_EFLAGS_AC. We then insert it in place of
- * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec,
- * but it will be one in index if SMAP checks are being overridden.
- * It is important to keep this branchless.
+ * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit;
+ * this bit will always be zero in pfec, but it will be one in index
+ * if SMAP checks are being disabled.
*/
- unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
- int index = (pfec >> 1) +
- (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
+ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
+ bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
+ int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
bool fault = (mmu->permissions[index] >> pte_access) & 1;
u32 errcode = PFERR_PRESENT_MASK;
@@ -353,12 +317,12 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
atomic64_add(count, &kvm->stat.pages[level - 1]);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
struct x86_exception *exception);
static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
- gpa_t gpa, u32 access,
+ gpa_t gpa, u64 access,
struct x86_exception *exception)
{
if (mmu != &vcpu->arch.nested_mmu)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 593093b52395..8f19ea752704 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -104,15 +104,6 @@ static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
-enum {
- AUDIT_PRE_PAGE_FAULT,
- AUDIT_POST_PAGE_FAULT,
- AUDIT_PRE_PTE_WRITE,
- AUDIT_POST_PTE_WRITE,
- AUDIT_PRE_SYNC,
- AUDIT_POST_SYNC
-};
-
#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
@@ -190,8 +181,6 @@ struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
static void mmu_spte_set(u64 *sptep, u64 spte);
-static union kvm_mmu_page_role
-kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
struct kvm_mmu_role_regs {
const unsigned long cr0;
@@ -529,6 +518,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
u64 old_spte = *sptep;
WARN_ON(!is_shadow_present_pte(new_spte));
+ check_spte_writable_invariants(new_spte);
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
@@ -548,11 +538,9 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changed.
*
- * Whenever we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB, the return value indicates this
- * case.
+ * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
+ * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
+ * spte, even though the writable spte might be cached on a CPU's TLB.
*
* Returns true if the TLB needs to be flushed
*/
@@ -646,24 +634,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-/* Restore an acc-track PTE back to a regular PTE */
-static u64 restore_acc_track_spte(u64 spte)
-{
- u64 new_spte = spte;
- u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
- & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
-
- WARN_ON_ONCE(spte_ad_enabled(spte));
- WARN_ON_ONCE(!is_access_track_spte(spte));
-
- new_spte &= ~shadow_acc_track_mask;
- new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
- SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
- new_spte |= saved_bits;
-
- return new_spte;
-}
-
/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
@@ -1229,9 +1199,8 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_write_protect(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- bool pt_protect)
+static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1311,7 +1280,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
- __rmap_write_protect(kvm, rmap_head, false);
+ rmap_write_protect(rmap_head, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1378,6 +1347,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
/* Cross two large pages? */
@@ -1410,7 +1382,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
if (kvm_memslots_have_rmaps(kvm)) {
for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ write_protected |= rmap_write_protect(rmap_head, true);
}
}
@@ -1421,7 +1393,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
return write_protected;
}
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
{
struct kvm_memory_slot *slot;
@@ -1921,13 +1893,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
return true;
}
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
-static void mmu_audit_disable(void) { }
-#endif
-
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
if (sp->role.invalid)
@@ -2024,7 +1989,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
bool protected = false;
for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu, sp->gfn);
+ protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
if (protected) {
kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
@@ -2149,7 +2114,7 @@ trace_get_page:
hlist_add_head(&sp->hash_link, sp_list);
if (!direct) {
account_shadowed(vcpu->kvm, sp);
- if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
+ if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
}
trace_kvm_mmu_get_page(sp, true);
@@ -2179,7 +2144,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
* prev_root is currently only used for 64-bit hosts. So only
* the active root_hpa is valid here.
*/
- BUG_ON(root != vcpu->arch.mmu->root_hpa);
+ BUG_ON(root != vcpu->arch.mmu->root.hpa);
iterator->shadow_addr
= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
@@ -2193,7 +2158,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
struct kvm_vcpu *vcpu, u64 addr)
{
- shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
addr);
}
@@ -2307,7 +2272,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
return zapped;
}
-static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -2345,13 +2310,13 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
struct list_head *invalid_list,
int *nr_zapped)
{
- bool list_unstable;
+ bool list_unstable, zapped_root = false;
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
- kvm_mmu_unlink_parents(kvm, sp);
+ kvm_mmu_unlink_parents(sp);
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
@@ -2387,14 +2352,20 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
* in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
* treats invalid shadow pages as being obsolete.
*/
- if (!is_obsolete_sp(kvm, sp))
- kvm_reload_remote_mmus(kvm);
+ zapped_root = !is_obsolete_sp(kvm, sp);
}
if (sp->lpage_disallowed)
unaccount_huge_nx_page(kvm, sp);
sp->role.invalid = 1;
+
+ /*
+ * Make the request to free obsolete roots after marking the root
+ * invalid, otherwise other vCPUs may not see it as invalid.
+ */
+ if (zapped_root)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
return list_unstable;
}
@@ -2725,8 +2696,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
if (*sptep == spte) {
ret = RET_PF_SPURIOUS;
} else {
- trace_kvm_mmu_set_spte(level, gfn, sptep);
flush |= mmu_spte_update(sptep, spte);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
}
if (wrprot) {
@@ -3239,6 +3210,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
return;
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+ if (WARN_ON(!sp))
+ return;
if (is_tdp_mmu_page(sp))
kvm_tdp_mmu_put_root(kvm, sp, false);
@@ -3249,18 +3222,20 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
}
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
{
- struct kvm *kvm = vcpu->kvm;
int i;
LIST_HEAD(invalid_list);
- bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
+ bool free_active_root;
BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
/* Before acquiring the MMU lock, see if we need to do any real work. */
- if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
+ && VALID_PAGE(mmu->root.hpa);
+
+ if (!free_active_root) {
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
VALID_PAGE(mmu->prev_roots[i].hpa))
@@ -3278,9 +3253,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
&invalid_list);
if (free_active_root) {
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
- mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
+ if (to_shadow_page(mmu->root.hpa)) {
+ mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
} else if (mmu->pae_root) {
for (i = 0; i < 4; ++i) {
if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
@@ -3291,8 +3265,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
mmu->pae_root[i] = INVALID_PAE_ROOT;
}
}
- mmu->root_hpa = INVALID_PAGE;
- mmu->root_pgd = 0;
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
@@ -3300,7 +3274,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
-void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
{
unsigned long roots_to_free = 0;
hpa_t root_hpa;
@@ -3322,7 +3296,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
}
- kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
@@ -3365,10 +3339,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
- mmu->root_hpa = root;
+ mmu->root.hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
- mmu->root_hpa = root;
+ mmu->root.hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
if (WARN_ON_ONCE(!mmu->pae_root)) {
r = -EIO;
@@ -3383,15 +3357,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
mmu->pae_root[i] = root | PT_PRESENT_MASK |
shadow_me_mask;
}
- mmu->root_hpa = __pa(mmu->pae_root);
+ mmu->root.hpa = __pa(mmu->pae_root);
} else {
WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
r = -EIO;
goto out_unlock;
}
- /* root_pgd is ignored for direct MMUs. */
- mmu->root_pgd = 0;
+ /* root.pgd is ignored for direct MMUs. */
+ mmu->root.pgd = 0;
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -3504,7 +3478,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (mmu->root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
mmu->shadow_root_level, false);
- mmu->root_hpa = root;
+ mmu->root.hpa = root;
goto set_root_pgd;
}
@@ -3554,18 +3528,18 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
- mmu->root_hpa = __pa(mmu->pml5_root);
+ mmu->root.hpa = __pa(mmu->pml5_root);
else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
- mmu->root_hpa = __pa(mmu->pml4_root);
+ mmu->root.hpa = __pa(mmu->pml4_root);
else
- mmu->root_hpa = __pa(mmu->pae_root);
+ mmu->root.hpa = __pa(mmu->pae_root);
set_root_pgd:
- mmu->root_pgd = root_pgd;
+ mmu->root.pgd = root_pgd;
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
- return 0;
+ return r;
}
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
@@ -3660,6 +3634,14 @@ static bool is_unsync_root(hpa_t root)
*/
smp_rmb();
sp = to_shadow_page(root);
+
+ /*
+ * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
+ * PDPTEs for a given PAE root need to be synchronized individually.
+ */
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
if (sp->unsync || sp->unsync_children)
return true;
@@ -3674,30 +3656,25 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu->direct_map)
return;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
return;
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu->root_hpa;
+ hpa_t root = vcpu->arch.mmu->root.hpa;
sp = to_shadow_page(root);
if (!is_unsync_root(root))
return;
write_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
-
mmu_sync_children(vcpu, sp, true);
-
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
return;
}
write_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
@@ -3709,7 +3686,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
}
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
}
@@ -3723,11 +3699,11 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
/* sync prev_roots by simply freeing them */
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gpa_t vaddr, u32 access,
+ gpa_t vaddr, u64 access,
struct x86_exception *exception)
{
if (exception)
@@ -3889,12 +3865,23 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
walk_shadow_page_lockless_end(vcpu);
}
+static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
+{
+ /* make sure the token value is not 0 */
+ u32 id = vcpu->arch.apf.id;
+
+ if (id << 12 == 0)
+ vcpu->arch.apf.id = 1;
+
+ return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+}
+
static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
gfn_t gfn)
{
struct kvm_arch_async_pf arch;
- arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.token = alloc_apf_token(vcpu);
arch.gfn = gfn;
arch.direct_map = vcpu->arch.mmu->direct_map;
arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
@@ -3971,7 +3958,7 @@ out_retry:
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, int mmu_seq)
{
- struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa);
+ struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
/* Special roots, e.g. pae_root, are not backed by shadow pages. */
if (sp && is_obsolete_sp(vcpu->kvm, sp))
@@ -3985,7 +3972,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
* previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
* to reload even if no vCPU is actively using the root.
*/
- if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu))
+ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
return true;
return fault->slot &&
@@ -4121,74 +4108,105 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
return (role.direct || pgd == root->pgd) &&
- VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
+ VALID_PAGE(root->hpa) &&
role.word == to_shadow_page(root->hpa)->role.word;
}
/*
- * Find out if a previously cached root matching the new pgd/role is available.
- * The current root is also inserted into the cache.
- * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
- * returned.
- * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
- * false is returned. This root should now be freed by the caller.
+ * Find out if a previously cached root matching the new pgd/role is available,
+ * and insert the current root as the MRU in the cache.
+ * If a matching root is found, it is assigned to kvm_mmu->root and
+ * true is returned.
+ * If no match is found, kvm_mmu->root is left invalid, the LRU root is
+ * evicted to make room for the current root, and false is returned.
*/
-static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role)
+static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
{
uint i;
- struct kvm_mmu_root_info root;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
- root.pgd = mmu->root_pgd;
- root.hpa = mmu->root_hpa;
-
- if (is_root_usable(&root, new_pgd, new_role))
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
return true;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
- swap(root, mmu->prev_roots[i]);
-
- if (is_root_usable(&root, new_pgd, new_role))
- break;
+ /*
+ * The swaps end up rotating the cache like this:
+ * C 0 1 2 3 (on entry to the function)
+ * 0 C 1 2 3
+ * 1 C 0 2 3
+ * 2 C 0 1 3
+ * 3 C 0 1 2 (on exit from the loop)
+ */
+ swap(mmu->root, mmu->prev_roots[i]);
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
}
- mmu->root_hpa = root.hpa;
- mmu->root_pgd = root.pgd;
-
- return i < KVM_MMU_NUM_PREV_ROOTS;
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+ return false;
}
-static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role)
+/*
+ * Find out if a previously cached root matching the new pgd/role is available.
+ * On entry, mmu->root is invalid.
+ * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
+ * of the cache becomes invalid, and true is returned.
+ * If no match is found, kvm_mmu->root is left invalid and false is returned.
+ */
+static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ uint i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
+ goto hit;
+
+ return false;
+hit:
+ swap(mmu->root, mmu->prev_roots[i]);
+ /* Bubble up the remaining roots. */
+ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
+ mmu->prev_roots[i] = mmu->prev_roots[i + 1];
+ mmu->prev_roots[i].hpa = INVALID_PAGE;
+ return true;
+}
+
+static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd, union kvm_mmu_page_role new_role)
+{
/*
- * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
+ * For now, limit the caching to 64-bit hosts+VMs in order to avoid
* having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
* later if necessary.
*/
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- mmu->root_level >= PT64_ROOT_4LEVEL)
- return cached_root_available(vcpu, new_pgd, new_role);
+ if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
- return false;
+ if (VALID_PAGE(mmu->root.hpa))
+ return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
+ else
+ return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
}
-static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role)
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
- if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role new_role = mmu->mmu_role.base;
+
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
+ /* kvm_mmu_ensure_valid_pgd will set up a new root. */
return;
}
/*
* It's possible that the cached previous root page is obsolete because
* of a change in the MMU generation number. However, changing the
- * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
- * free the root set here and allocate a new one.
+ * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
+ * which will free the root set here and allocate a new one.
*/
kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
@@ -4211,12 +4229,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
if (!new_role.direct)
__clear_sp_write_flooding_count(
- to_shadow_page(vcpu->arch.mmu->root_hpa));
-}
-
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
-{
- __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu));
+ to_shadow_page(vcpu->arch.mmu->root.hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
@@ -4474,8 +4487,7 @@ static inline bool boot_cpu_is_amd(void)
* possible, however, kvm currently does not do execution-protection.
*/
static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -4506,8 +4518,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
* is the shadow page table for intel nested guest.
*/
static void
-reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
reserved_hpa_bits(), execonly,
@@ -4580,11 +4591,11 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
* - X86_CR4_SMAP is set in CR4
* - A user page is accessed
* - The access is not a fetch
- * - Page fault in kernel mode
- * - if CPL = 3 or X86_EFLAGS_AC is clear
+ * - The access is supervisor mode
+ * - If implicit supervisor access or X86_EFLAGS_AC is clear
*
- * Here, we cover the first three conditions.
- * The fourth is computed dynamically in permission_fault();
+ * Here, we cover the first four conditions.
+ * The fifth is computed dynamically in permission_fault();
* PFERR_RSVD_MASK bit will be set in PFEC if the access is
* *not* subject to SMAP restrictions.
*/
@@ -4794,7 +4805,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
reset_guest_paging_metadata(vcpu, context);
- reset_tdp_shadow_zero_bits_mask(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(context);
}
static union kvm_mmu_role
@@ -4888,9 +4899,8 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
- __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);
-
shadow_mmu_init_context(vcpu, context, &regs, new_role);
+ kvm_mmu_new_pgd(vcpu, nested_cr3);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4928,27 +4938,25 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
execonly, level);
- __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base);
-
- if (new_role.as_u64 == context->mmu_role.as_u64)
- return;
-
- context->mmu_role.as_u64 = new_role.as_u64;
-
- context->shadow_root_level = level;
-
- context->ept_ad = accessed_dirty;
- context->page_fault = ept_page_fault;
- context->gva_to_gpa = ept_gva_to_gpa;
- context->sync_page = ept_sync_page;
- context->invlpg = ept_invlpg;
- context->root_level = level;
- context->direct_map = false;
+ if (new_role.as_u64 != context->mmu_role.as_u64) {
+ context->mmu_role.as_u64 = new_role.as_u64;
+
+ context->shadow_root_level = level;
+
+ context->ept_ad = accessed_dirty;
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
+ context->root_level = level;
+ context->direct_map = false;
+ update_permission_bitmask(context, true);
+ context->pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
+ reset_ept_shadow_zero_bits_mask(context, execonly);
+ }
- update_permission_bitmask(context, true);
- context->pkru_mask = 0;
- reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
- reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
+ kvm_mmu_new_pgd(vcpu, new_eptp);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
@@ -5033,20 +5041,6 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_init_mmu);
-static union kvm_mmu_page_role
-kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
- union kvm_mmu_role role;
-
- if (tdp_enabled)
- role = kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, true);
- else
- role = kvm_calc_shadow_mmu_root_page_role(vcpu, &regs, true);
-
- return role.base;
-}
-
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
/*
@@ -5100,17 +5094,73 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_sync_roots(vcpu);
kvm_mmu_load_pgd(vcpu);
- static_call(kvm_x86_tlb_flush_current)(vcpu);
+
+ /*
+ * Flush any TLB entries for the new root, the provenance of the root
+ * is unknown. Even if KVM ensures there are no stale TLB entries
+ * for a freed root, in theory another hypervisor could have left
+ * stale entries. Flushing on alloc also allows KVM to skip the TLB
+ * flush when freeing a root (see kvm_tdp_mmu_put_root()).
+ */
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
out:
return r;
}
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
+ struct kvm *kvm = vcpu->kvm;
+
+ kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+ kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+}
+
+static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root_hpa))
+ return false;
+
+ /*
+ * When freeing obsolete roots, treat roots as obsolete if they don't
+ * have an associated shadow page. This does mean KVM will get false
+ * positives and free roots that don't strictly need to be freed, but
+ * such false positives are relatively rare:
+ *
+ * (a) only PAE paging and nested NPT has roots without shadow pages
+ * (b) remote reloads due to a memslot update obsoletes _all_ roots
+ * (c) KVM doesn't track previous roots for PAE paging, and the guest
+ * is unlikely to zap an in-use PGD.
+ */
+ sp = to_shadow_page(root_hpa);
+ return !sp || is_obsolete_sp(kvm, sp);
+}
+
+static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ if (is_obsolete_root(kvm, mmu->root.hpa))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (is_obsolete_root(kvm, mmu->root.hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots_to_free)
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
+{
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -5260,7 +5310,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
@@ -5285,7 +5334,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
- kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
write_unlock(&vcpu->kvm->mmu_lock);
}
@@ -5295,7 +5343,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->direct_map;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
r = RET_PF_INVALID;
@@ -5360,14 +5408,14 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(gva, vcpu))
return;
- static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
}
if (!mmu->invlpg)
return;
if (root_hpa == INVALID_PAGE) {
- mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ mmu->invlpg(vcpu, gva, mmu->root.hpa);
/*
* INVLPG is required to invalidate any global mappings for the VA,
@@ -5403,7 +5451,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
uint i;
if (pcid == kvm_get_active_pcid(vcpu)) {
- mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ mmu->invlpg(vcpu, gva, mmu->root.hpa);
tlb_flush = true;
}
@@ -5416,7 +5464,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
if (tlb_flush)
- static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
++vcpu->stat.invlpg;
@@ -5516,8 +5564,8 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
struct page *page;
int i;
- mmu->root_hpa = INVALID_PAGE;
- mmu->root_pgd = 0;
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -5637,9 +5685,13 @@ restart:
}
/*
- * Trigger a remote TLB flush before freeing the page tables to ensure
- * KVM is not in the middle of a lockless shadow page table walk, which
- * may reference the pages.
+ * Kick all vCPUs (via remote TLB flush) before freeing the page tables
+ * to ensure KVM is not in the middle of a lockless shadow page table
+ * walk, which may reference the pages. The remote TLB flush itself is
+ * not required and is simply a convenient way to kick vCPUs as needed.
+ * KVM performs a local TLB flush when allocating a new root (see
+ * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
+ * running with an obsolete MMU.
*/
kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
}
@@ -5669,11 +5721,11 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
*/
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
- /* In order to ensure all threads see this change when
- * handling the MMU reload signal, this must happen in the
- * same critical section as kvm_reload_remote_mmus, and
- * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
- * could drop the MMU lock and yield.
+ /*
+ * In order to ensure all vCPUs drop their soon-to-be invalid roots,
+ * invalidating TDP MMU roots must be done while holding mmu_lock for
+ * write and in the same critical section as making the reload request,
+ * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_invalidate_all_roots(kvm);
@@ -5686,17 +5738,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* Note: we need to do this under the protection of mmu_lock,
* otherwise, vcpu would purge shadow page but miss tlb flush.
*/
- kvm_reload_remote_mmus(kvm);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
kvm_zap_obsolete_pages(kvm);
write_unlock(&kvm->mmu_lock);
- if (is_tdp_mmu_enabled(kvm)) {
- read_lock(&kvm->mmu_lock);
+ /*
+ * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
+ * returning to the caller, e.g. if the zap is in response to a memslot
+ * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
+ * associated with the deleted memslot once the update completes, and
+ * Deferring the zap until the final reference to the root is put would
+ * lead to use-after-free.
+ */
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_invalidated_roots(kvm);
- read_unlock(&kvm->mmu_lock);
- }
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5711,17 +5768,24 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
kvm_mmu_zap_all_fast(kvm);
}
-void kvm_mmu_init_vm(struct kvm *kvm)
+int kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ int r;
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- kvm_mmu_init_tdp_mmu(kvm);
+ r = kvm_mmu_init_tdp_mmu(kvm);
+ if (r < 0)
+ return r;
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
+ return 0;
}
void kvm_mmu_uninit_vm(struct kvm *kvm)
@@ -5785,8 +5849,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (is_tdp_mmu_enabled(kvm)) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
- gfn_end, flush);
+ flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
+ gfn_end, true, flush);
}
if (flush)
@@ -5802,7 +5866,7 @@ static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot)
{
- return __rmap_write_protect(kvm, rmap_head, false);
+ return rmap_write_protect(rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -5846,12 +5910,52 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
* will clear a separate software-only bit (MMU-writable) and skip the
* flush if-and-only-if this bit was already clear.
*
- * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
+ * See is_writable_pte() for more details.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level)
+{
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+ target_level, false);
+
+ /*
+ * A TLB flush is unnecessary at this point for the same resons as in
+ * kvm_mmu_slot_try_split_huge_pages().
+ */
+}
+
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level)
+{
+ u64 start = memslot->base_gfn;
+ u64 end = start + memslot->npages;
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+ read_unlock(&kvm->mmu_lock);
+ }
+
+ /*
+ * No TLB flush is necessary here. KVM will flush TLBs after
+ * write-protecting and/or clearing dirty on the newly split SPTEs to
+ * ensure that guest writes are reflected in the dirty log before the
+ * ioctl to enable dirty logging on this memslot completes. Since the
+ * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+ * safe for KVM to decide if a TLB flush is necessary based on the split
+ * SPTEs.
+ */
+}
+
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot)
@@ -6191,7 +6295,6 @@ void kvm_mmu_module_exit(void)
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
unregister_shrinker(&mmu_shrinker);
- mmu_audit_disable();
}
/*
@@ -6261,6 +6364,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
rcu_idx = srcu_read_lock(&kvm->srcu);
write_lock(&kvm->mmu_lock);
+ /*
+ * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+ * be done under RCU protection, because the pages are freed via RCU
+ * callback.
+ */
+ rcu_read_lock();
+
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
@@ -6285,12 +6395,18 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
cond_resched_rwlock_write(&kvm->mmu_lock);
flush = false;
+
+ rcu_read_lock();
}
}
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
deleted file mode 100644
index 9e7dcf999f08..000000000000
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ /dev/null
@@ -1,303 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * mmu_audit.c:
- *
- * Audit code for KVM MMU
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- * Marcelo Tosatti <mtosatti@redhat.com>
- * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
- */
-
-#include <linux/ratelimit.h>
-
-static char const *audit_point_name[] = {
- "pre page fault",
- "post page fault",
- "pre pte write",
- "post pte write",
- "pre sync",
- "post sync"
-};
-
-#define audit_printk(kvm, fmt, args...) \
- printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[kvm->arch.audit_point], ##args)
-
-typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
-
-static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- inspect_spte_fn fn, int level)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 *ent = sp->spt;
-
- fn(vcpu, ent + i, level);
-
- if (is_shadow_present_pte(ent[i]) &&
- !is_last_spte(ent[i], level)) {
- struct kvm_mmu_page *child;
-
- child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(vcpu, child, fn, level - 1);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return;
-
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu->root_hpa;
-
- sp = to_shadow_page(root);
- __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level);
- return;
- }
-
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu->pae_root[i];
-
- if (IS_VALID_PAE_ROOT(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = to_shadow_page(root);
- __mmu_spte_walk(vcpu, sp, fn, 2);
- }
- }
-
- return;
-}
-
-typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
-
-static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
- fn(kvm, sp);
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- kvm_pfn_t pfn;
- hpa_t hpa;
-
- sp = sptep_to_sp(sptep);
-
- if (sp->unsync) {
- if (level != PG_LEVEL_4K) {
- audit_printk(vcpu->kvm, "unsync sp: %p "
- "level = %d\n", sp, level);
- return;
- }
- }
-
- if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
- return;
-
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
-
- if (is_error_pfn(pfn))
- return;
-
- hpa = pfn << PAGE_SHIFT;
- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
- "ent %llxn", vcpu->arch.mmu->root_level, pfn,
- hpa, *sptep);
-}
-
-static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
- struct kvm_rmap_head *rmap_head;
- struct kvm_mmu_page *rev_sp;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
- gfn_t gfn;
-
- rev_sp = sptep_to_sp(sptep);
- gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
- slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
- slot = __gfn_to_memslot(slots, gfn);
- if (!slot) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
- audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
- (long int)(sptep - rev_sp->spt), rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot);
- if (!rmap_head->val) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no rmap for writable spte %llx\n",
- *sptep);
- dump_stack();
- }
-}
-
-static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
- inspect_spte_has_rmap(vcpu->kvm, sptep);
-}
-
-static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp = sptep_to_sp(sptep);
-
- if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
- "root.\n", sp);
-}
-
-static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- int i;
-
- if (sp->role.level != PG_LEVEL_4K)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_shadow_present_pte(sp->spt[i]))
- continue;
-
- inspect_spte_has_rmap(kvm, sp->spt + i);
- }
-}
-
-static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- struct kvm_rmap_head *rmap_head;
- u64 *sptep;
- struct rmap_iterator iter;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
-
- if (sp->role.direct || sp->unsync || sp->role.invalid)
- return;
-
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, sp->gfn);
- rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
-
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- if (is_writable_pte(*sptep))
- audit_printk(kvm, "shadow page has writable "
- "mappings: gfn %llx role %x\n",
- sp->gfn, sp->role.word);
- }
-}
-
-static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- check_mappings_rmap(kvm, sp);
- audit_write_protection(kvm, sp);
-}
-
-static void audit_all_active_sps(struct kvm *kvm)
-{
- walk_all_active_sps(kvm, audit_sp);
-}
-
-static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- audit_sptes_have_rmaps(vcpu, sptep, level);
- audit_mappings(vcpu, sptep, level);
- audit_spte_after_sync(vcpu, sptep, level);
-}
-
-static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, audit_spte);
-}
-
-static bool mmu_audit;
-static DEFINE_STATIC_KEY_FALSE(mmu_audit_key);
-
-static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
-
- if (!__ratelimit(&ratelimit_state))
- return;
-
- vcpu->kvm->arch.audit_point = point;
- audit_all_active_sps(vcpu->kvm);
- audit_vcpu_spte(vcpu);
-}
-
-static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- if (static_branch_unlikely((&mmu_audit_key)))
- __kvm_mmu_audit(vcpu, point);
-}
-
-static void mmu_audit_enable(void)
-{
- if (mmu_audit)
- return;
-
- static_branch_inc(&mmu_audit_key);
- mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
- if (!mmu_audit)
- return;
-
- static_branch_dec(&mmu_audit_key);
- mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
- int ret;
- unsigned long enable;
-
- ret = kstrtoul(val, 10, &enable);
- if (ret < 0)
- return -EINVAL;
-
- switch (enable) {
- case 0:
- mmu_audit_disable();
- break;
- case 1:
- mmu_audit_enable();
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static const struct kernel_param_ops audit_param_ops = {
- .set = mmu_audit_set,
- .get = param_get_bool,
-};
-
-arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index da6166b5c377..1bff453f7cbe 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -30,6 +30,8 @@ extern bool dbg;
#define INVALID_PAE_ROOT 0
#define IS_VALID_PAE_ROOT(x) (!!(x))
+typedef u64 __rcu *tdp_ptep_t;
+
struct kvm_mmu_page {
/*
* Note, "link" through "spt" fit in a single 64 byte cache line on
@@ -59,8 +61,17 @@ struct kvm_mmu_page {
refcount_t tdp_mmu_root_count;
};
unsigned int unsync_children;
- struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
- DECLARE_BITMAP(unsync_child_bitmap, 512);
+ union {
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+ tdp_ptep_t ptep;
+ };
+ union {
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+ struct {
+ struct work_struct tdp_mmu_async_work;
+ void *tdp_mmu_async_data;
+ };
+ };
struct list_head lpage_disallowed_link;
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index de5e8e4e1aa7..12247b96af01 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -416,6 +416,29 @@ TRACE_EVENT(
)
);
+TRACE_EVENT(
+ kvm_mmu_split_huge_page,
+ TP_PROTO(u64 gfn, u64 spte, int level, int errno),
+ TP_ARGS(gfn, spte, level, errno),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(int, level)
+ __field(int, errno)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = spte;
+ __entry->level = level;
+ __entry->errno = errno;
+ ),
+
+ TP_printk("gfn %llx spte %llx level %d errno %d",
+ __entry->gfn, __entry->spte, __entry->level, __entry->errno)
+);
+
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 68eb1fb548b6..2e09d1b6249f 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -47,8 +47,8 @@ int kvm_page_track_create_memslot(struct kvm *kvm,
continue;
slot->arch.gfn_track[i] =
- kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
- GFP_KERNEL_ACCOUNT);
+ __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
+ GFP_KERNEL_ACCOUNT);
if (!slot->arch.gfn_track[i])
goto track_free;
}
@@ -75,7 +75,8 @@ int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE])
return 0;
- gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), GFP_KERNEL_ACCOUNT);
+ gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track),
+ GFP_KERNEL_ACCOUNT);
if (gfn_track == NULL)
return -ENOMEM;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5b5bdac97c7b..01fee5f67ac3 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -34,9 +34,8 @@
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
- #define CMPXCHG cmpxchg
+ #define CMPXCHG "cmpxchgq"
#else
- #define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 2
#endif
#elif PTTYPE == 32
@@ -52,7 +51,7 @@
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
- #define CMPXCHG cmpxchg
+ #define CMPXCHG "cmpxchgl"
#elif PTTYPE == PTTYPE_EPT
#define pt_element_t u64
#define guest_walker guest_walkerEPT
@@ -65,7 +64,9 @@
#define PT_GUEST_DIRTY_SHIFT 9
#define PT_GUEST_ACCESSED_SHIFT 8
#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
- #define CMPXCHG cmpxchg64
+ #ifdef CONFIG_X86_64
+ #define CMPXCHG "cmpxchgq"
+ #endif
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
#error Invalid PTTYPE value
@@ -147,43 +148,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t __user *ptep_user, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
{
- int npages;
- pt_element_t ret;
- pt_element_t *table;
- struct page *page;
-
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
- if (likely(npages == 1)) {
- table = kmap_atomic(page);
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- kunmap_atomic(table);
-
- kvm_release_page_dirty(page);
- } else {
- struct vm_area_struct *vma;
- unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
- unsigned long pfn;
- unsigned long paddr;
-
- mmap_read_lock(current->mm);
- vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
- if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
- mmap_read_unlock(current->mm);
- return -EFAULT;
- }
- pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- paddr = pfn << PAGE_SHIFT;
- table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
- if (!table) {
- mmap_read_unlock(current->mm);
- return -EFAULT;
- }
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- memunmap(table);
- mmap_read_unlock(current->mm);
- }
+ signed char r;
- return (ret != orig_pte);
+ if (!user_access_begin(ptep_user, sizeof(pt_element_t)))
+ return -EFAULT;
+
+#ifdef CMPXCHG
+ asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n"
+ "setnz %b[r]\n"
+ "2:"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
+ : [ptr] "+m" (*ptep_user),
+ [old] "+a" (orig_pte),
+ [r] "=q" (r)
+ : [new] "r" (new_pte)
+ : "memory");
+#else
+ asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n"
+ "setnz %b[r]\n"
+ "2:"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
+ : [ptr] "+m" (*ptep_user),
+ [old] "+A" (orig_pte),
+ [r] "=q" (r)
+ : [new_lo] "b" ((u32)new_pte),
+ [new_hi] "c" ((u32)(new_pte >> 32))
+ : "memory");
+#endif
+
+ user_access_end();
+ return r;
}
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
@@ -339,7 +333,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
*/
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gpa_t addr, u32 access)
+ gpa_t addr, u64 access)
{
int ret;
pt_element_t pte;
@@ -347,7 +341,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
gfn_t table_gfn;
u64 pt_access, pte_access;
unsigned index, accessed_dirty, pte_pkey;
- unsigned nested_access;
+ u64 nested_access;
gpa_t pte_gpa;
bool have_ad;
int offset;
@@ -540,7 +534,7 @@ error:
}
static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
+ struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
{
return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
access);
@@ -668,7 +662,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
goto out_gpte_changed;
for (shadow_walk_init(&it, vcpu, fault->addr);
@@ -904,12 +898,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
r = FNAME(fetch)(vcpu, fault, &walker);
- kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
@@ -990,7 +982,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gpa_t addr, u32 access,
+ gpa_t addr, u64 access,
struct x86_exception *exception)
{
struct guest_walker walker;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 73cfe62fdad1..4739b53c9734 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -192,6 +192,65 @@ out:
return wrprot;
}
+static u64 make_spte_executable(u64 spte)
+{
+ bool is_access_track = is_access_track_spte(spte);
+
+ if (is_access_track)
+ spte = restore_acc_track_spte(spte);
+
+ spte &= ~shadow_nx_mask;
+ spte |= shadow_x_mask;
+
+ if (is_access_track)
+ spte = mark_spte_for_access_track(spte);
+
+ return spte;
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given huge page SPTE where
+ * `index` identifies which sub-page.
+ *
+ * This is used during huge page splitting to build the SPTEs that make up the
+ * new page table.
+ */
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
+{
+ u64 child_spte;
+ int child_level;
+
+ if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
+ return 0;
+
+ if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
+ return 0;
+
+ child_spte = huge_spte;
+ child_level = huge_level - 1;
+
+ /*
+ * The child_spte already has the base address of the huge page being
+ * split. So we just have to OR in the offset to the page at the next
+ * lower level for the given index.
+ */
+ child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
+
+ if (child_level == PG_LEVEL_4K) {
+ child_spte &= ~PT_PAGE_SIZE_MASK;
+
+ /*
+ * When splitting to a 4K page, mark the page executable as the
+ * NX hugepage mitigation no longer applies.
+ */
+ if (is_nx_huge_page_enabled())
+ child_spte = make_spte_executable(child_spte);
+ }
+
+ return child_spte;
+}
+
+
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
u64 spte = SPTE_MMU_PRESENT_MASK;
@@ -250,14 +309,7 @@ u64 mark_spte_for_access_track(u64 spte)
if (is_access_track_spte(spte))
return spte;
- /*
- * Making an Access Tracking PTE will result in removal of write access
- * from the PTE. So, verify that we will be able to restore the write
- * access in the fast page fault path later on.
- */
- WARN_ONCE((spte & PT_WRITABLE_MASK) &&
- !spte_can_locklessly_be_made_writable(spte),
- "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+ check_spte_writable_invariants(spte);
WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
@@ -368,8 +420,8 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_acc_track_mask = 0;
shadow_me_mask = sme_me_mask;
- shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
- shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE;
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE;
/*
* Set a reserved PA bit in MMIO SPTEs to generate page faults with
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index be6a007a4af3..73f12615416f 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -75,33 +75,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
/*
- * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
- * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
- * that map guest pages in read-only memslots and read-only VMAs.
- *
- * Invariants:
- * - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
- *
- *
- * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
- * allows writes to the guest page mapped by the SPTE. This bit is cleared when
- * the guest page mapped by the SPTE contains a page table that is being
- * monitored for shadow paging. In this case the SPTE can only be made writable
- * by unsyncing the shadow page under the mmu_lock.
- *
- * Invariants:
- * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
- * - If MMU-writable is set, Host-writable must be set.
- *
- * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
- * to track writes for dirty logging. For such SPTEs, KVM will locklessly set
- * PT_WRITABLE_MASK upon the next write from the guest and record the write in
- * the dirty log (see fast_page_fault()).
+ * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
+ * SPTE is write-protected. See is_writable_pte() for details.
*/
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
-#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
-#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
+#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10)
/*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care
@@ -339,15 +319,86 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
__is_rsvd_bits_set(rsvd_check, spte, level);
}
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+/*
+ * An shadow-present leaf SPTE may be non-writable for 3 possible reasons:
+ *
+ * 1. To intercept writes for dirty logging. KVM write-protects huge pages
+ * so that they can be split be split down into the dirty logging
+ * granularity (4KiB) whenever the guest writes to them. KVM also
+ * write-protects 4KiB pages so that writes can be recorded in the dirty log
+ * (e.g. if not using PML). SPTEs are write-protected for dirty logging
+ * during the VM-iotcls that enable dirty logging.
+ *
+ * 2. To intercept writes to guest page tables that KVM is shadowing. When a
+ * guest writes to its page table the corresponding shadow page table will
+ * be marked "unsync". That way KVM knows which shadow page tables need to
+ * be updated on the next TLB flush, INVLPG, etc. and which do not.
+ *
+ * 3. To prevent guest writes to read-only memory, such as for memory in a
+ * read-only memslot or guest memory backed by a read-only VMA. Writes to
+ * such pages are disallowed entirely.
+ *
+ * To keep track of why a given SPTE is write-protected, KVM uses 2
+ * software-only bits in the SPTE:
+ *
+ * shadow_mmu_writable_mask, aka MMU-writable -
+ * Cleared on SPTEs that KVM is currently write-protecting for shadow paging
+ * purposes (case 2 above).
+ *
+ * shadow_host_writable_mask, aka Host-writable -
+ * Cleared on SPTEs that are not host-writable (case 3 above)
+ *
+ * Note, not all possible combinations of PT_WRITABLE_MASK,
+ * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
+ * SPTE can be in only one of the following states, which map to the
+ * aforementioned 3 cases:
+ *
+ * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
+ * ------------------------- | ------------------------ | ----------------
+ * 1 | 1 | 1 (writable)
+ * 1 | 1 | 0 (case 1)
+ * 1 | 0 | 0 (case 2)
+ * 0 | 0 | 0 (case 3)
+ *
+ * The valid combinations of these bits are checked by
+ * check_spte_writable_invariants() whenever an SPTE is modified.
+ *
+ * Clearing the MMU-writable bit is always done under the MMU lock and always
+ * accompanied by a TLB flush before dropping the lock to avoid corrupting the
+ * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
+ * (which does not clear the MMU-writable bit), does not flush TLBs before
+ * dropping the lock, as it only needs to synchronize guest writes with the
+ * dirty bitmap.
+ *
+ * So, there is the problem: clearing the MMU-writable bit can encounter a
+ * write-protected SPTE while CPUs still have writable mappings for that SPTE
+ * cached in their TLB. To address this, KVM always flushes TLBs when
+ * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
+ *
+ * The Host-writable bit is not modified on present SPTEs, it is only set or
+ * cleared when an SPTE is first faulted in from non-present and then remains
+ * immutable.
+ */
+static inline bool is_writable_pte(unsigned long pte)
{
- if (spte & shadow_mmu_writable_mask) {
- WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
- return true;
- }
+ return pte & PT_WRITABLE_MASK;
+}
+
+/* Note: spte must be a shadow-present leaf SPTE. */
+static inline void check_spte_writable_invariants(u64 spte)
+{
+ if (spte & shadow_mmu_writable_mask)
+ WARN_ONCE(!(spte & shadow_host_writable_mask),
+ "kvm: MMU-writable SPTE is not Host-writable: %llx",
+ spte);
+ else
+ WARN_ONCE(is_writable_pte(spte),
+ "kvm: Writable SPTE is not MMU-writable: %llx", spte);
+}
- WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
- return false;
+static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+{
+ return spte & shadow_mmu_writable_mask;
}
static inline u64 get_mmio_spte_generation(u64 spte)
@@ -364,9 +415,25 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool can_unsync,
bool host_writable, u64 *new_spte);
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
+
+/* Restore an acc-track PTE back to a regular PTE */
+static inline u64 restore_acc_track_spte(u64 spte)
+{
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
+
+ spte &= ~shadow_acc_track_mask;
+ spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
+ spte |= saved_bits;
+
+ return spte;
+}
+
u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
void kvm_mmu_reset_all_pte_masks(void);
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index caa96c270b95..6d3b3e5a5533 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
static gfn_t round_gfn_for_level(gfn_t gfn, int level)
@@ -40,17 +40,19 @@ void tdp_iter_restart(struct tdp_iter *iter)
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
-void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn)
{
+ int root_level = root->role.level;
+
WARN_ON(root_level < 1);
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
iter->next_last_level_gfn = next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
- iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
- iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
+ iter->as_id = kvm_mmu_page_as_id(root);
tdp_iter_restart(iter);
}
@@ -87,7 +89,7 @@ static bool try_step_down(struct tdp_iter *iter)
* Reread the SPTE before stepping down to avoid traversing into page
* tables that are no longer linked from this entry.
*/
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
child_pt = spte_to_child_pt(iter->old_spte, iter->level);
if (!child_pt)
@@ -121,7 +123,7 @@ static bool try_step_side(struct tdp_iter *iter)
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
iter->next_last_level_gfn = iter->gfn;
iter->sptep++;
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
return true;
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index e19cabbcb65c..b1eaf6ec0e0b 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -7,7 +7,20 @@
#include "mmu.h"
-typedef u64 __rcu *tdp_ptep_t;
+/*
+ * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
+ * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
+ * batched without having to collect the list of zapped SPs. Flows that can
+ * remove SPs must service pending TLB flushes prior to dropping RCU protection.
+ */
+static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
+{
+ return READ_ONCE(*rcu_dereference(sptep));
+}
+static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+{
+ WRITE_ONCE(*rcu_dereference(sptep), val);
+}
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
@@ -57,17 +70,17 @@ struct tdp_iter {
* Iterates over every SPTE mapping the GFN range [start, end) in a
* preorder traversal.
*/
-#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \
- for (tdp_iter_start(&iter, root, root_level, min_level, start); \
+#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start); \
iter.valid && iter.gfn < end; \
tdp_iter_next(&iter))
-#define for_each_tdp_pte(iter, root, root_level, start, end) \
- for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
+#define for_each_tdp_pte(iter, root, start, end) \
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
-void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index bc9e3553fba2..d71d177ae6b8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -14,28 +14,36 @@ static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
/* Initializes the TDP MMU for the VM, if enabled. */
-bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
+ struct workqueue_struct *wq;
+
if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
- return false;
+ return 0;
+
+ wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
+ if (!wq)
+ return -ENOMEM;
/* This should not be changed for the lifetime of the VM. */
kvm->arch.tdp_mmu_enabled = true;
-
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
-
- return true;
+ kvm->arch.tdp_mmu_zap_wq = wq;
+ return 1;
}
-static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+/* Arbitrarily returns true so that this may be used in if statements. */
+static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
bool shared)
{
if (shared)
lockdep_assert_held_read(&kvm->mmu_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return true;
}
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
@@ -43,20 +51,20 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;
+ flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
* Ensure that all the outstanding RCU callbacks to free shadow pages
- * can run before the VM is torn down.
+ * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
+ * can call kvm_tdp_mmu_put_root and create new callbacks.
*/
rcu_barrier();
}
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush,
- bool shared);
-
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
free_page((unsigned long)sp->spt);
@@ -79,6 +87,56 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
tdp_mmu_free_sp(sp);
}
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
+
+static void tdp_mmu_zap_root_work(struct work_struct *work)
+{
+ struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
+ tdp_mmu_async_work);
+ struct kvm *kvm = root->tdp_mmu_async_data;
+
+ read_lock(&kvm->mmu_lock);
+
+ /*
+ * A TLB flush is not necessary as KVM performs a local TLB flush when
+ * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
+ * to a different pCPU. Note, the local TLB flush on reuse also
+ * invalidates any paging-structure-cache entries, i.e. TLB entries for
+ * intermediate paging structures, that may be zapped, as such entries
+ * are associated with the ASID on both VMX and SVM.
+ */
+ tdp_mmu_zap_root(kvm, root, true);
+
+ /*
+ * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
+ * avoiding an infinite loop. By design, the root is reachable while
+ * it's being asynchronously zapped, thus a different task can put its
+ * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
+ * asynchronously zapped root is unavoidable.
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+
+ read_unlock(&kvm->mmu_lock);
+}
+
+static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+ root->tdp_mmu_async_data = kvm;
+ INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
+ queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
+}
+
+static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
+{
+ union kvm_mmu_page_role role = page->role;
+ role.invalid = true;
+
+ /* No need to use cmpxchg, only the invalid bit can change. */
+ role.word = xchg(&page->role.word, role.word);
+ return role.invalid;
+}
+
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
@@ -89,25 +147,63 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
WARN_ON(!root->tdp_mmu_page);
+ /*
+ * The root now has refcount=0. It is valid, but readers already
+ * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
+ * rejects it. This remains true for the rest of the execution
+ * of this function, because readers visit valid roots only
+ * (except for tdp_mmu_zap_root_work(), which however
+ * does not acquire any reference itself).
+ *
+ * Even though there are flows that need to visit all roots for
+ * correctness, they all take mmu_lock for write, so they cannot yet
+ * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
+ * since the root still has refcount=0.
+ *
+ * However, tdp_mmu_zap_root can yield, and writers do not expect to
+ * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
+ * So the root temporarily gets an extra reference, going to refcount=1
+ * while staying invalid. Readers still cannot acquire any reference;
+ * but writers are now allowed to run if tdp_mmu_zap_root yields and
+ * they might take an extra reference if they themselves yield.
+ * Therefore, when the reference is given back by the worker,
+ * there is no guarantee that the refcount is still 1. If not, whoever
+ * puts the last reference will free the page, but they will not have to
+ * zap the root because a root cannot go from invalid to valid.
+ */
+ if (!kvm_tdp_root_mark_invalid(root)) {
+ refcount_set(&root->tdp_mmu_root_count, 1);
+
+ /*
+ * Zapping the root in a worker is not just "nice to have";
+ * it is required because kvm_tdp_mmu_invalidate_all_roots()
+ * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
+ * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
+ * might return with some roots not zapped yet.
+ */
+ tdp_mmu_schedule_zap_root(kvm, root);
+ return;
+ }
+
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-
- zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
-
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
/*
- * Finds the next valid root after root (or the first valid root if root
- * is NULL), takes a reference on it, and returns that next root. If root
- * is not NULL, this thread should have already taken a reference on it, and
- * that reference will be dropped. If no valid root is found, this
- * function will return NULL.
+ * Returns the next root after @prev_root (or the first root if @prev_root is
+ * NULL). A reference to the returned root is acquired, and the reference to
+ * @prev_root is released (the caller obviously must hold a reference to
+ * @prev_root if it's non-NULL).
+ *
+ * If @only_valid is true, invalid roots are skipped.
+ *
+ * Returns NULL if the end of tdp_mmu_roots was reached.
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
- bool shared)
+ bool shared, bool only_valid)
{
struct kvm_mmu_page *next_root;
@@ -121,9 +217,14 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
typeof(*next_root), link);
- while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+ while (next_root) {
+ if ((!only_valid || !next_root->role.invalid) &&
+ kvm_tdp_mmu_get_root(next_root))
+ break;
+
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
&next_root->link, typeof(*next_root), link);
+ }
rcu_read_unlock();
@@ -143,71 +244,91 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* mode. In the unlikely event that this thread must free a root, the lock
* will be temporarily dropped and reacquired in write mode.
*/
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \
- _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _shared)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
+ kvm_mmu_page_as_id(_root) != _as_id) { \
} else
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
- lockdep_is_held_type(&kvm->mmu_lock, 0) || \
- lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
- } else
-
-static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
- int level)
-{
- union kvm_mmu_page_role role;
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
- role = vcpu->arch.mmu->mmu_role.base;
- role.level = level;
- role.direct = true;
- role.has_4_byte_gpte = false;
- role.access = ACC_ALL;
- role.ad_disabled = !shadow_accessed_mask;
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
- return role;
-}
+/*
+ * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
+ * the implication being that any flow that holds mmu_lock for read is
+ * inherently yield-friendly and should use the yield-safe variant above.
+ * Holding mmu_lock for write obviates the need for RCU protection as the list
+ * is guaranteed to be stable.
+ */
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
-static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- int level)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+
+ return sp;
+}
+
+static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
+ gfn_t gfn, union kvm_mmu_page_role role)
+{
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- sp->role.word = page_role_for_level(vcpu, level).word;
+ sp->role = role;
sp->gfn = gfn;
+ sp->ptep = sptep;
sp->tdp_mmu_page = true;
trace_kvm_mmu_get_page(sp, true);
+}
- return sp;
+static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *parent_sp;
+ union kvm_mmu_page_role role;
+
+ parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
+
+ role = parent_sp->role;
+ role.level--;
+
+ tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_page_role role;
+ union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
-
- /* Check for an existing root before allocating a new one. */
+ /*
+ * Check for an existing root before allocating a new one. Note, the
+ * role check prevents consuming an invalid root.
+ */
for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
if (root->role.word == role.word &&
- kvm_tdp_mmu_get_root(kvm, root))
+ kvm_tdp_mmu_get_root(root))
goto out;
}
- root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
+ root = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_sp(root, NULL, 0, role);
+
refcount_set(&root->tdp_mmu_root_count, 1);
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
@@ -252,25 +373,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
/**
- * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
- *
- * @kvm: kvm instance
- * @sp: the new page
- * @account_nx: This page replaces a NX large page and should be marked for
- * eventual reclaim.
- */
-static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool account_nx)
-{
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
- if (account_nx)
- account_huge_nx_page(kvm, sp);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-}
-
-/**
- * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
*
* @kvm: kvm instance
* @sp: the page to be removed
@@ -278,8 +381,8 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
*/
-static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool shared)
+static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared)
{
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
@@ -295,7 +398,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
}
/**
- * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
+ * handle_removed_pt() - handle a page table removed from the TDP structure
*
* @kvm: kvm instance
* @pt: the page removed from the paging structure
@@ -311,8 +414,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
* this thread will be responsible for ensuring the page is freed. Hence the
* early rcu_dereferences in the function.
*/
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
- bool shared)
+static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
@@ -321,7 +423,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
trace_kvm_mmu_prepare_zap_page(sp);
- tdp_mmu_unlink_page(kvm, sp, shared);
+ tdp_mmu_unlink_sp(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
u64 *sptep = rcu_dereference(pt) + i;
@@ -372,9 +474,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
shared);
}
- kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
- KVM_PAGES_PER_HPAGE(level + 1));
-
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -435,6 +534,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ if (is_leaf)
+ check_spte_writable_invariants(new_spte);
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -469,11 +571,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
/*
* Recursively handle child PTs if the change removed a subtree from
- * the paging structure.
+ * the paging structure. Note the WARN on the PFN changing without the
+ * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
+ * pages are kernel allocations and should never be migrated.
*/
- if (was_present && !was_leaf && (pfn_changed || !is_present))
- handle_removed_tdp_mmu_page(kvm,
- spte_to_child_pt(old_spte, level), shared);
+ if (was_present && !was_leaf &&
+ (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
+ handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -492,53 +596,72 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* and handle the associated bookkeeping. Do not mark the page dirty
* in KVM's dirty bitmaps.
*
+ * If setting the SPTE fails because it has changed, iter->old_spte will be
+ * refreshed to the current value of the spte.
+ *
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @new_spte: The value the SPTE should be set to
- * Returns: true if the SPTE was set, false if it was not. If false is returned,
- * this function will have no side-effects.
+ * Return:
+ * * 0 - If the SPTE was set.
+ * * -EBUSY - If the SPTE cannot be set. In this case this function will have
+ * no side-effects other than setting iter->old_spte to the last
+ * known value of the spte.
*/
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
- WARN_ON_ONCE(iter->yielded);
-
- lockdep_assert_held_read(&kvm->mmu_lock);
+ u64 *sptep = rcu_dereference(iter->sptep);
+ u64 old_spte;
/*
- * Do not change removed SPTEs. Only the thread that froze the SPTE
- * may modify it.
+ * The caller is responsible for ensuring the old SPTE is not a REMOVED
+ * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
+ * and pre-checking before inserting a new SPTE is advantageous as it
+ * avoids unnecessary work.
*/
- if (is_removed_spte(iter->old_spte))
- return false;
+ WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
/*
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
* does not hold the mmu_lock.
*/
- if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
- new_spte) != iter->old_spte)
- return false;
+ old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
+ if (old_spte != iter->old_spte) {
+ /*
+ * The page table entry was modified by a different logical
+ * CPU. Refresh iter->old_spte with the current value so the
+ * caller operates on fresh data, e.g. if it retries
+ * tdp_mmu_set_spte_atomic().
+ */
+ iter->old_spte = old_spte;
+ return -EBUSY;
+ }
__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
new_spte, iter->level, true);
handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
- return true;
+ return 0;
}
-static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter)
+static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
+ int ret;
+
/*
* Freeze the SPTE by setting it to a special,
* non-present value. This will stop other threads from
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
- return false;
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
+ if (ret)
+ return ret;
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
KVM_PAGES_PER_HPAGE(iter->level));
@@ -551,17 +674,21 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* here since the SPTE is going from non-present
* to non-present.
*/
- WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
+ kvm_tdp_mmu_write_spte(iter->sptep, 0);
- return true;
+ return 0;
}
/*
* __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
- * @kvm: kvm instance
- * @iter: a tdp_iter instance currently on the SPTE that should be set
- * @new_spte: The value the SPTE should be set to
+ * @kvm: KVM instance
+ * @as_id: Address space ID, i.e. regular vs. SMM
+ * @sptep: Pointer to the SPTE
+ * @old_spte: The current value of the SPTE
+ * @new_spte: The new value that will be set for the SPTE
+ * @gfn: The base GFN that was (or will be) mapped by the SPTE
+ * @level: The level _containing_ the SPTE (its parent PT's level)
* @record_acc_track: Notify the MM subsystem of changes to the accessed state
* of the page. Should be set unless handling an MMU
* notifier for access tracking. Leaving record_acc_track
@@ -573,58 +700,65 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* Leaving record_dirty_log unset in that case prevents page
* writes from being double counted.
*/
-static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte, bool record_acc_track,
- bool record_dirty_log)
+static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+ bool record_acc_track, bool record_dirty_log)
{
- WARN_ON_ONCE(iter->yielded);
-
lockdep_assert_held_write(&kvm->mmu_lock);
/*
- * No thread should be using this function to set SPTEs to the
+ * No thread should be using this function to set SPTEs to or from the
* temporary removed SPTE value.
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
- WARN_ON(is_removed_spte(iter->old_spte));
+ WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
- WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
+ kvm_tdp_mmu_write_spte(sptep, new_spte);
+
+ __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
- __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, false);
if (record_acc_track)
- handle_changed_spte_acc_track(iter->old_spte, new_spte,
- iter->level);
+ handle_changed_spte_acc_track(old_spte, new_spte, level);
if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
- iter->old_spte, new_spte,
- iter->level);
+ handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
+ new_spte, level);
+}
+
+static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte, bool record_acc_track,
+ bool record_dirty_log)
+{
+ WARN_ON_ONCE(iter->yielded);
+
+ __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
+ new_spte, iter->gfn, iter->level,
+ record_acc_track, record_dirty_log);
}
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
- for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
+ for_each_tdp_pte(_iter, _root, _start, _end)
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
tdp_root_for_each_pte(_iter, _root, _start, _end) \
@@ -634,8 +768,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
else
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
- _mmu->shadow_root_level, _start, _end)
+ for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
/*
* Yield if the MMU lock is contended or this thread needs to return control
@@ -662,11 +795,11 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return false;
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- rcu_read_unlock();
-
if (flush)
kvm_flush_remote_tlbs(kvm);
+ rcu_read_unlock();
+
if (shared)
cond_resched_rwlock_read(&kvm->mmu_lock);
else
@@ -682,85 +815,141 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return iter->yielded;
}
+static inline gfn_t tdp_mmu_max_gfn_host(void)
+{
+ /*
+ * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
+ * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
+ * and so KVM will never install a SPTE for such addresses.
+ */
+ return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+}
+
+static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared, int zap_level)
+{
+ struct tdp_iter iter;
+
+ gfn_t end = tdp_mmu_max_gfn_host();
+ gfn_t start = 0;
+
+ for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ if (iter.level > zap_level)
+ continue;
+
+ if (!shared)
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ goto retry;
+ }
+}
+
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+
+ /*
+ * The root must have an elevated refcount so that it's reachable via
+ * mmu_notifier callbacks, which allows this path to yield and drop
+ * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
+ * must drop all references to relevant pages prior to completing the
+ * callback. Dropping mmu_lock with an unreachable root would result
+ * in zapping SPTEs after a relevant mmu_notifier callback completes
+ * and lead to use-after-free as zapping a SPTE triggers "writeback" of
+ * dirty accessed bits to the SPTE's associated struct page.
+ */
+ WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ rcu_read_lock();
+
+ /*
+ * To avoid RCU stalls due to recursively removing huge swaths of SPs,
+ * split the zap into two passes. On the first pass, zap at the 1gb
+ * level, and then zap top-level SPs on the second pass. "1gb" is not
+ * arbitrary, as KVM must be able to zap a 1gb shadow page without
+ * inducing a stall to allow in-place replacement with a 1gb hugepage.
+ *
+ * Because zapping a SP recurses on its children, stepping down to
+ * PG_LEVEL_4K in the iterator itself is unnecessary.
+ */
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
+ __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
+
+ rcu_read_unlock();
+}
+
+bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 old_spte;
+
+ /*
+ * This helper intentionally doesn't allow zapping a root shadow page,
+ * which doesn't have a parent page table and thus no associated entry.
+ */
+ if (WARN_ON_ONCE(!sp->ptep))
+ return false;
+
+ old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
+ if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
+ return false;
+
+ __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
+ sp->gfn, sp->role.level + 1, true, true);
+
+ return true;
+}
+
/*
- * Tears down the mappings for the range of gfns, [start, end), and frees the
- * non-root pages mapping GFNs strictly within that range. Returns true if
- * SPTEs have been cleared and a TLB flush is needed before releasing the
- * MMU lock.
+ * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
+ * have been cleared and a TLB flush is needed before releasing the MMU lock.
*
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
* operation can cause a soft lockup.
- *
- * If shared is true, this thread holds the MMU lock in read mode and must
- * account for the possibility that other threads are modifying the paging
- * structures concurrently. If shared is false, this thread should hold the
- * MMU lock in write mode.
*/
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush,
- bool shared)
+static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush)
{
- gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- bool zap_all = (start == 0 && end >= max_gfn_host);
struct tdp_iter iter;
- /*
- * No need to try to step down in the iterator when zapping all SPTEs,
- * zapping the top-level non-leaf SPTEs will recurse on their children.
- */
- int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
-
- /*
- * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
- * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
- * and so KVM will never install a SPTE for such addresses.
- */
- end = min(end, max_gfn_host);
+ end = min(end, tdp_mmu_max_gfn_host());
- kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+ lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, start, end) {
-retry:
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false;
continue;
}
- if (!is_shadow_present_pte(iter.old_spte))
- continue;
-
- /*
- * If this is a non-last-level SPTE that covers a larger range
- * than should be zapped, continue, and zap the mappings at a
- * lower level, except when zapping all SPTEs.
- */
- if (!zap_all &&
- (iter.gfn < start ||
- iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
+ if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
- if (!shared) {
- tdp_mmu_set_spte(kvm, &iter, 0);
- flush = true;
- } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
- goto retry;
- }
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ flush = true;
}
rcu_read_unlock();
+
+ /*
+ * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
+ * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
+ */
return flush;
}
@@ -770,112 +959,62 @@ retry:
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
*/
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
- gfn_t end, bool can_yield, bool flush)
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
+ bool can_yield, bool flush)
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
- flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
- false);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
+ flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
return flush;
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
- bool flush = false;
+ struct kvm_mmu_page *root;
int i;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
-
- if (flush)
- kvm_flush_remote_tlbs(kvm);
-}
-
-static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
- struct kvm_mmu_page *prev_root)
-{
- struct kvm_mmu_page *next_root;
-
- if (prev_root)
- next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
- &prev_root->link,
- typeof(*prev_root), link);
- else
- next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
- typeof(*next_root), link);
-
- while (next_root && !(next_root->role.invalid &&
- refcount_read(&next_root->tdp_mmu_root_count)))
- next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
- &next_root->link,
- typeof(*next_root), link);
-
- return next_root;
+ /*
+ * Zap all roots, including invalid roots, as all SPTEs must be dropped
+ * before returning to the caller. Zap directly even if the root is
+ * also being zapped by a worker. Walking zapped top-level SPTEs isn't
+ * all that expensive and mmu_lock is already held, which means the
+ * worker has yielded, i.e. flushing the work instead of zapping here
+ * isn't guaranteed to be any faster.
+ *
+ * A TLB flush is unnecessary, KVM zaps everything if and only the VM
+ * is being destroyed or the userspace VMM has exited. In both cases,
+ * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
+ */
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root, i)
+ tdp_mmu_zap_root(kvm, root, false);
+ }
}
/*
- * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
- * invalidated root, they will not be freed until this function drops the
- * reference. Before dropping that reference, tear down the paging
- * structure so that whichever thread does drop the last reference
- * only has to do a trivial amount of work. Since the roots are invalid,
- * no new SPTEs should be created under them.
+ * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
+ * zap" completes.
*/
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
- struct kvm_mmu_page *next_root;
- struct kvm_mmu_page *root;
- bool flush = false;
-
- lockdep_assert_held_read(&kvm->mmu_lock);
-
- rcu_read_lock();
-
- root = next_invalidated_root(kvm, NULL);
-
- while (root) {
- next_root = next_invalidated_root(kvm, root);
-
- rcu_read_unlock();
-
- flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
-
- /*
- * Put the reference acquired in
- * kvm_tdp_mmu_invalidate_roots
- */
- kvm_tdp_mmu_put_root(kvm, root, true);
-
- root = next_root;
-
- rcu_read_lock();
- }
-
- rcu_read_unlock();
-
- if (flush)
- kvm_flush_remote_tlbs(kvm);
+ flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}
/*
- * Mark each TDP MMU root as invalid so that other threads
- * will drop their references and allow the root count to
- * go to 0.
+ * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+ * is about to be zapped, e.g. in response to a memslots update. The actual
+ * zapping is performed asynchronously, so a reference is taken on all roots.
+ * Using a separate workqueue makes it easy to ensure that the destruction is
+ * performed before the "fast zap" completes, without keeping a separate list
+ * of invalidated roots; the list is effectively the list of work items in
+ * the workqueue.
*
- * Also take a reference on all roots so that this thread
- * can do the bulk of the work required to free the roots
- * once they are invalidated. Without this reference, a
- * vCPU thread might drop the last reference to a root and
- * get stuck with tearing down the entire paging structure.
- *
- * Roots which have a zero refcount should be skipped as
- * they're already being torn down.
- * Already invalid roots should be referenced again so that
- * they aren't freed before kvm_tdp_mmu_zap_all_fast is
- * done with them.
+ * Get a reference even if the root is already invalid, the asynchronous worker
+ * assumes it was gifted a reference to the root it processes. Because mmu_lock
+ * is held for write, it should be impossible to observe a root with zero refcount,
+ * i.e. the list of roots cannot be stale.
*
* This has essentially the same effect for the TDP MMU
* as updating mmu_valid_gen does for the shadow MMU.
@@ -885,9 +1024,13 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
- if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!root->role.invalid &&
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
root->role.invalid = true;
+ tdp_mmu_schedule_zap_root(kvm, root);
+ }
+ }
}
/*
@@ -913,8 +1056,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
+ else if (is_shadow_present_pte(iter->old_spte) &&
+ !is_last_spte(iter->old_spte, iter->level))
+ kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
+ KVM_PAGES_PER_HPAGE(iter->level + 1));
/*
* If the page fault was caused by a write but the page is write
@@ -947,6 +1094,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
/*
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @account_nx: True if this page table is being installed to split a
+ * non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
+ *
+ * Returns: 0 if the new page table was installed. Non-0 if the page table
+ * could not be installed (e.g. the atomic compare-exchange failed).
+ */
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool account_nx,
+ bool shared)
+{
+ u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
+ int ret = 0;
+
+ if (shared) {
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+ if (ret)
+ return ret;
+ } else {
+ tdp_mmu_set_spte(kvm, iter, spte);
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+ if (account_nx)
+ account_huge_nx_page(kvm, sp);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+ return 0;
+}
+
+/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
*/
@@ -955,8 +1140,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
struct kvm_mmu *mmu = vcpu->arch.mmu;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
- u64 *child_pt;
- u64 new_spte;
int ret;
kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -979,7 +1162,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
*/
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
- if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
+ if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
break;
/*
@@ -987,10 +1170,13 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* because the new value informs the !present
* path below.
*/
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
}
if (!is_shadow_present_pte(iter.old_spte)) {
+ bool account_nx = fault->huge_page_disallowed &&
+ fault->req_level >= iter.level;
+
/*
* If SPTE has been frozen by another thread, just
* give up and retry, avoiding unnecessary page table
@@ -999,26 +1185,21 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (is_removed_spte(iter.old_spte))
break;
- sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
- child_pt = sp->spt;
-
- new_spte = make_nonleaf_spte(child_pt,
- !shadow_accessed_mask);
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
- if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
- tdp_mmu_link_page(vcpu->kvm, sp,
- fault->huge_page_disallowed &&
- fault->req_level >= iter.level);
-
- trace_kvm_mmu_get_page(sp, true);
- } else {
+ if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
tdp_mmu_free_sp(sp);
break;
}
}
}
- if (iter.level != fault->goal_level) {
+ /*
+ * Force the guest to retry the access if the upper level SPTEs aren't
+ * in place, or if the target leaf SPTE is frozen by another CPU.
+ */
+ if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
rcu_read_unlock();
return RET_PF_RETRY;
}
@@ -1032,13 +1213,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
- struct kvm_mmu_page *root;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
- flush = zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
-
- return flush;
+ return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
+ range->end, range->may_block, flush);
}
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
@@ -1052,18 +1228,18 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
struct tdp_iter iter;
bool ret = false;
- rcu_read_lock();
-
/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code.
*/
for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
ret |= handler(kvm, &iter, range);
- }
- rcu_read_unlock();
+ rcu_read_unlock();
+ }
return ret;
}
@@ -1155,13 +1331,12 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
*/
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
-
- /* FIXME: return 'flush' instead of flushing here. */
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
-
- return false;
+ /*
+ * No need to handle the remote TLB flush under RCU protection, the
+ * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
+ * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
+ */
+ return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}
/*
@@ -1180,8 +1355,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, start, end) {
+ for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1193,14 +1367,9 @@ retry:
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
goto retry;
- }
+
spte_set = true;
}
@@ -1221,13 +1390,197 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
return spte_set;
}
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+{
+ struct kvm_mmu_page *sp;
+
+ gfp |= __GFP_ZERO;
+
+ sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ if (!sp)
+ return NULL;
+
+ sp->spt = (void *)__get_free_page(gfp);
+ if (!sp->spt) {
+ kmem_cache_free(mmu_page_header_cache, sp);
+ return NULL;
+ }
+
+ return sp;
+}
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool shared)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since we are allocating while under the MMU lock we have to be
+ * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
+ * reclaim and to avoid making any filesystem callbacks (which can end
+ * up invoking KVM MMU notifiers, resulting in a deadlock).
+ *
+ * If this allocation fails we drop the lock and retry with reclaim
+ * allowed.
+ */
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+ if (sp)
+ return sp;
+
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ iter->yielded = true;
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ return sp;
+}
+
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ const u64 huge_spte = iter->old_spte;
+ const int level = iter->level;
+ int ret, i;
+
+ tdp_mmu_init_child_sp(sp, iter);
+
+ /*
+ * No need for atomics when writing to sp->spt since the page table has
+ * not been linked in yet and thus is not reachable from any other CPU.
+ */
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++)
+ sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
+
+ /*
+ * Replace the huge spte with a pointer to the populated lower level
+ * page table. Since we are making this change without a TLB flush vCPUs
+ * will see a mix of the split mappings and the original huge mapping,
+ * depending on what's currently in their TLB. This is fine from a
+ * correctness standpoint since the translation will be the same either
+ * way.
+ */
+ ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
+ if (ret)
+ goto out;
+
+ /*
+ * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
+ * are overwriting from the page stats. But we have to manually update
+ * the page stats with the new present child pages.
+ */
+ kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
+
+out:
+ trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
+ return ret;
+}
+
+static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ /*
+ * Traverse the page table splitting all huge pages above the target
+ * level into one lower level. For example, if we encounter a 1GB page
+ * we split it into 512 2MB pages.
+ *
+ * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+ * to visit an SPTE before ever visiting its children, which means we
+ * will correctly recursively split huge pages that are more than one
+ * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
+ * and then splitting each of those to 512 4KB pages).
+ */
+ for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
+ continue;
+
+ if (!sp) {
+ sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ if (!sp) {
+ ret = -ENOMEM;
+ trace_kvm_mmu_split_huge_page(iter.gfn,
+ iter.old_spte,
+ iter.level, ret);
+ break;
+ }
+
+ if (iter.yielded)
+ continue;
+ }
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
+ goto retry;
+
+ sp = NULL;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * It's possible to exit the loop having never used the last sp if, for
+ * example, a vCPU doing HugePage NX splitting wins the race and
+ * installs its own sp in place of the last sp we tried to split.
+ */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+
+ return ret;
+}
+
+
+/*
+ * Try to split all huge pages mapped by the TDP MMU down to the target level.
+ */
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *root;
+ int r = 0;
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+ r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
+ if (r) {
+ kvm_tdp_mmu_put_root(kvm, root, shared);
+ break;
+ }
+ }
+}
+
/*
* Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
* AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
@@ -1249,6 +1602,9 @@ retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
if (spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -1261,14 +1617,9 @@ retry:
continue;
}
- if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
goto retry;
- }
+
spte_set = true;
}
@@ -1291,7 +1642,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
@@ -1392,14 +1743,8 @@ retry:
continue;
/* Note, a successful atomic zap also does a remote TLB flush. */
- if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_zap_spte_atomic(kvm, &iter))
goto retry;
- }
}
rcu_read_unlock();
@@ -1416,7 +1761,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
zap_collapsible_spte_range(kvm, root, slot);
}
@@ -1436,8 +1781,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 3899004a5d91..c163f7cc23ca 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,43 +7,17 @@
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
- struct kvm_mmu_page *root)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
- if (root->role.invalid)
- return false;
-
return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared);
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
gfn_t end, bool can_yield, bool flush);
-static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
- gfn_t start, gfn_t end, bool flush)
-{
- return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
-}
-static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1);
-
- /*
- * Don't allow yielding, as the caller may have a flush pending. Note,
- * if mmu_lock is held for write, zapping will never yield in this case,
- * but explicitly disallow it for safety. The TDP MMU does not yield
- * until it has made forward progress (steps sideways), and when zapping
- * a single shadow page that it's guaranteed to see (thus the mmu_lock
- * requirement), its "step sideways" will always step beyond the bounds
- * of the shadow page's gfn range and stop iterating before yielding.
- */
- lockdep_assert_held_write(&kvm->mmu_lock);
- return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
- sp->gfn, end, false, false);
-}
-
+bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
@@ -71,6 +45,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
int min_level);
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared);
+
static inline void kvm_tdp_mmu_walk_lockless_begin(void)
{
rcu_read_lock();
@@ -87,14 +66,14 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
u64 *spte);
#ifdef CONFIG_X86_64
-bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
{
struct kvm_mmu_page *sp;
- hpa_t hpa = mmu->root_hpa;
+ hpa_t hpa = mmu->root.hpa;
if (WARN_ON(!VALID_PAGE(hpa)))
return false;
@@ -108,7 +87,7 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
return sp && is_tdp_mmu_page(sp) && sp->root_count;
}
#else
-static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; }
+static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; }
static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index f614f95acc6b..eca39f56c231 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -95,9 +95,8 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
}
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
- unsigned config, bool exclude_user,
- bool exclude_kernel, bool intr,
- bool in_tx, bool in_tx_cp)
+ u64 config, bool exclude_user,
+ bool exclude_kernel, bool intr)
{
struct perf_event *event;
struct perf_event_attr attr = {
@@ -116,16 +115,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
attr.sample_period = get_sample_period(pmc, pmc->counter);
- if (in_tx)
- attr.config |= HSW_IN_TX;
- if (in_tx_cp) {
+ if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
+ guest_cpuid_is_intel(pmc->vcpu)) {
/*
* HSW_IN_TX_CHECKPOINTED is not supported with nonzero
* period. Just clear the sample period so at least
* allocating the counter doesn't fail.
*/
attr.sample_period = 0;
- attr.config |= HSW_IN_TX_CHECKPOINTED;
}
event = perf_event_create_kernel_counter(&attr, -1, current,
@@ -181,9 +178,11 @@ static int cmp_u64(const void *a, const void *b)
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
- unsigned config, type = PERF_TYPE_RAW;
+ u64 config;
+ u32 type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
+ struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -220,7 +219,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
}
if (type == PERF_TYPE_RAW)
- config = eventsel & X86_RAW_EVENT_MASK;
+ config = eventsel & pmu->raw_event_mask;
if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
return;
@@ -231,9 +230,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
pmc_reprogram_counter(pmc, type, config,
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT,
- (eventsel & HSW_IN_TX),
- (eventsel & HSW_IN_TX_CHECKPOINTED));
+ eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);
@@ -269,7 +266,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
!(en_field & 0x2), /* exclude user */
!(en_field & 0x1), /* exclude kernel */
- pmi, false, false);
+ pmi);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7a7b8d5b775e..9e66fba1d6a3 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -15,8 +15,6 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
-#define MAX_FIXED_COUNTERS 3
-
struct kvm_event_hw_type_mapping {
u8 eventsel;
u8 unit_mask;
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 90364d02f22a..a1cf9c31273b 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -27,20 +27,6 @@
#include "irq.h"
#include "svm.h"
-#define SVM_AVIC_DOORBELL 0xc001011b
-
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
-/*
- * 0xff is broadcast, so the max index allowed for physical APIC ID
- * table is 0xfe. APIC IDs above 0xff are reserved.
- */
-#define AVIC_MAX_PHYSICAL_ID_COUNT 255
-
-#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
-#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
-#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
-
/* AVIC GATAG is encoded using VM and VCPU IDs */
#define AVIC_VCPU_ID_BITS 8
#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
@@ -73,12 +59,6 @@ struct amd_svm_iommu_ir {
void *data; /* Storing pointer to struct amd_ir_data */
};
-enum avic_ipi_failure_cause {
- AVIC_IPI_FAILURE_INVALID_INT_TYPE,
- AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
- AVIC_IPI_FAILURE_INVALID_TARGET,
- AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
-};
/* Note:
* This function is called from IOMMU driver to notify
@@ -289,6 +269,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+void avic_ring_doorbell(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any point,
+ * which could result in signalling the wrong/previous pCPU. But if
+ * that happens the vCPU is guaranteed to do a VMRUN (after being
+ * migrated) and thus will process pending interrupts, i.e. a doorbell
+ * is not needed (and the spurious one is harmless).
+ */
+ int cpu = READ_ONCE(vcpu->cpu);
+
+ if (cpu != get_cpu())
+ wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ put_cpu();
+}
+
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh)
{
@@ -304,8 +300,13 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
kvm_for_each_vcpu(i, vcpu, kvm) {
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK))
- kvm_vcpu_wake_up(vcpu);
+ icrl & APIC_DEST_MASK)) {
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ }
}
}
@@ -323,18 +324,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
switch (id) {
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
/*
- * AVIC hardware handles the generation of
- * IPIs when the specified Message Type is Fixed
- * (also known as fixed delivery mode) and
- * the Trigger Mode is edge-triggered. The hardware
- * also supports self and broadcast delivery modes
- * specified via the Destination Shorthand(DSH)
- * field of the ICRL. Logical and physical APIC ID
- * formats are supported. All other IPI types cause
- * a #VMEXIT, which needs to emulated.
+ * Emulate IPIs that are not handled by AVIC hardware, which
+ * only virtualizes Fixed, Edge-Triggered INTRs. The exit is
+ * a trap, e.g. ICR holds the correct value and RIP has been
+ * advanced, KVM is responsible only for emulating the IPI.
+ * Sadly, hardware may sometimes leave the BUSY flag set, in
+ * which case KVM needs to emulate the ICR write as well in
+ * order to clear the BUSY flag.
*/
- kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
- kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ if (icrl & APIC_ICR_BUSY)
+ kvm_apic_write_nodecode(vcpu, APIC_ICR);
+ else
+ kvm_apic_send_ipi(apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
/*
@@ -345,8 +346,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_INVALID_TARGET:
- WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, vcpu->vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -478,30 +477,28 @@ static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
svm->dfr_reg = dfr;
}
-static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
- u32 offset = svm->vmcb->control.exit_info_1 &
+ u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
switch (offset) {
case APIC_ID:
- if (avic_handle_apic_id_update(&svm->vcpu))
+ if (avic_handle_apic_id_update(vcpu))
return 0;
break;
case APIC_LDR:
- if (avic_handle_ldr_update(&svm->vcpu))
+ if (avic_handle_ldr_update(vcpu))
return 0;
break;
case APIC_DFR:
- avic_handle_dfr_update(&svm->vcpu);
+ avic_handle_dfr_update(vcpu);
break;
default:
break;
}
- kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
-
+ kvm_apic_write_nodecode(vcpu, offset);
return 1;
}
@@ -551,7 +548,7 @@ int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
if (trap) {
/* Handling Trap */
WARN_ONCE(!write, "svm: Handling trap read.\n");
- ret = avic_unaccel_trap_write(svm);
+ ret = avic_unaccel_trap_write(vcpu);
} else {
/* Handling Fault */
ret = kvm_emulate_instruction(vcpu, 0);
@@ -579,7 +576,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
return ret;
}
-void avic_post_state_restore(struct kvm_vcpu *vcpu)
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
if (avic_handle_apic_id_update(vcpu) != 0)
return;
@@ -587,20 +584,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
-void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
-}
-
-void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
-{
-}
-
-static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
unsigned long flags;
@@ -632,94 +616,6 @@ out:
return ret;
}
-void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb01.ptr;
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
- if (!enable_apicv)
- return;
-
- if (activated) {
- /**
- * During AVIC temporary deactivation, guest could update
- * APIC ID, DFR and LDR registers, which would not be trapped
- * by avic_unaccelerated_access_interception(). In this case,
- * we need to check and update the AVIC logical APIC ID table
- * accordingly before re-activating.
- */
- avic_post_state_restore(vcpu);
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- } else {
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
- }
- vmcb_mark_dirty(vmcb, VMCB_AVIC);
-
- if (activated)
- avic_vcpu_load(vcpu, vcpu->cpu);
- else
- avic_vcpu_put(vcpu);
-
- svm_set_pi_irte_mode(vcpu, activated);
-}
-
-void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
-{
- return;
-}
-
-int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
-{
- if (!vcpu->arch.apicv_active)
- return -1;
-
- kvm_lapic_set_irr(vec, vcpu->arch.apic);
-
- /*
- * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
- * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
- * the read of guest_mode, which guarantees that either VMRUN will see
- * and process the new vIRR entry, or that the below code will signal
- * the doorbell if the vCPU is already running in the guest.
- */
- smp_mb__after_atomic();
-
- /*
- * Signal the doorbell to tell hardware to inject the IRQ if the vCPU
- * is in the guest. If the vCPU is not in the guest, hardware will
- * automatically process AVIC interrupts at VMRUN.
- */
- if (vcpu->mode == IN_GUEST_MODE) {
- int cpu = READ_ONCE(vcpu->cpu);
-
- /*
- * Note, the vCPU could get migrated to a different pCPU at any
- * point, which could result in signalling the wrong/previous
- * pCPU. But if that happens the vCPU is guaranteed to do a
- * VMRUN (after being migrated) and thus will process pending
- * interrupts, i.e. a doorbell is not needed (and the spurious
- * one is harmless).
- */
- if (cpu != get_cpu())
- wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
- put_cpu();
- } else {
- /*
- * Wake the vCPU if it was blocking. KVM will then detect the
- * pending IRQ when checking if the vCPU has a wake event.
- */
- kvm_vcpu_wake_up(vcpu);
- }
-
- return 0;
-}
-
-bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
unsigned long flags;
@@ -817,7 +713,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
}
/*
- * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ * avic_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
@@ -825,12 +721,12 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
-int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
- int idx, ret = -EINVAL;
+ int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP))
@@ -841,7 +737,13 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
idx = srcu_read_lock(&kvm->irq_srcu);
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
struct vcpu_data vcpu_info;
@@ -926,7 +828,7 @@ out:
return ret;
}
-bool svm_check_apicv_inhibit_reasons(ulong bit)
+bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
BIT(APICV_INHIBIT_REASON_ABSENT) |
@@ -937,7 +839,7 @@ bool svm_check_apicv_inhibit_reasons(ulong bit)
BIT(APICV_INHIBIT_REASON_X2APIC) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
- return supported & BIT(bit);
+ return supported & BIT(reason);
}
@@ -971,20 +873,15 @@ out:
return ret;
}
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
@@ -1008,7 +905,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
-void avic_vcpu_put(struct kvm_vcpu *vcpu)
+void __avic_vcpu_put(struct kvm_vcpu *vcpu)
{
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1027,13 +924,63 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
+static void avic_vcpu_load(struct kvm_vcpu *vcpu)
+{
+ int cpu = get_cpu();
+
+ WARN_ON(cpu != vcpu->cpu);
+
+ __avic_vcpu_load(vcpu, cpu);
+
+ put_cpu();
+}
+
+static void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+
+ __avic_vcpu_put(vcpu);
+
+ preempt_enable();
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ if (activated) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_apicv_post_state_restore(vcpu);
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ } else {
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ }
+ vmcb_mark_dirty(vmcb, VMCB_AVIC);
+
+ if (activated)
+ avic_vcpu_load(vcpu);
+ else
+ avic_vcpu_put(vcpu);
+
+ avic_set_pi_irte_mode(vcpu, activated);
+}
+
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
- preempt_disable();
-
/*
* Unload the AVIC when the vCPU is about to block, _before_
* the vCPU actually blocks.
@@ -1048,21 +995,12 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
* the cause of errata #1235).
*/
avic_vcpu_put(vcpu);
-
- preempt_enable();
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- int cpu;
-
if (!kvm_vcpu_apicv_active(vcpu))
return;
- cpu = get_cpu();
- WARN_ON(cpu != vcpu->cpu);
-
- avic_vcpu_load(vcpu, cpu);
-
- put_cpu();
+ avic_vcpu_load(vcpu);
}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
new file mode 100644
index 000000000000..7d6d97968fb9
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM).
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__
+#define __ARCH_X86_KVM_SVM_HYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#include "../hyperv.h"
+
+/*
+ * Hyper-V uses the software reserved 32 bytes in VMCB
+ * control area to expose SVM enlightenments to guests.
+ */
+struct hv_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB
+ */
+#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
+
+#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 1218b5a342fc..96bab464967f 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -28,6 +28,7 @@
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"
+#include "hyperv.h"
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
@@ -165,14 +166,30 @@ void recalc_intercepts(struct vcpu_svm *svm)
vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
+/*
+ * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
+ * is optimized in that it only merges the parts where KVM MSR permission bitmap
+ * may contain zero bits.
+ */
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)svm->nested.ctl.reserved_sw;
+ int i;
+
/*
- * This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is optimized in that it only merges the parts where
- * the kvm msr permission bitmap may contain zero bits
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) is using Hyper-V emulation interface and
+ * tells KVM (L0) there were no changes in MSR bitmap for L2.
*/
- int i;
+ if (!svm->nested.force_msr_bitmap_recalc &&
+ kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
@@ -193,6 +210,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
svm->nested.msrpm[p] = svm->msrpm[p] | value;
}
+ svm->nested.force_msr_bitmap_recalc = false;
+
+set_msrpm_base_pa:
svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
return true;
@@ -298,7 +318,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
}
static
-void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to,
+void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *to,
struct vmcb_control_area *from)
{
unsigned int i;
@@ -331,12 +352,19 @@ void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to,
to->asid = from->asid;
to->msrpm_base_pa &= ~0x0fffULL;
to->iopm_base_pa &= ~0x0fffULL;
+
+ /* Hyper-V extensions (Enlightened VMCB) */
+ if (kvm_hv_hypercall_enabled(vcpu)) {
+ to->clean = from->clean;
+ memcpy(to->reserved_sw, from->reserved_sw,
+ sizeof(struct hv_enlightenments));
+ }
}
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
struct vmcb_control_area *control)
{
- __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control);
+ __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
}
static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
@@ -464,14 +492,14 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
CC(!load_pdptrs(vcpu, cr3)))
return -EINVAL;
- if (!nested_npt)
- kvm_mmu_new_pgd(vcpu, cr3);
-
vcpu->arch.cr3 = cr3;
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
+ if (!nested_npt)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
return 0;
}
@@ -494,6 +522,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
new_vmcb12 = true;
svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ svm->nested.force_msr_bitmap_recalc = true;
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
@@ -1302,6 +1331,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
dst->virt_ext = from->virt_ext;
dst->pause_filter_count = from->pause_filter_count;
dst->pause_filter_thresh = from->pause_filter_thresh;
+ /* 'clean' and 'reserved_sw' are not changed by KVM */
}
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
@@ -1434,7 +1464,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
ret = -EINVAL;
- __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl);
+ __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
goto out_free;
@@ -1457,18 +1487,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
!__nested_vmcb_check_save(vcpu, &save_cached))
goto out_free;
- /*
- * While the nested guest CR3 is already checked and set by
- * KVM_SET_SREGS, it was set when nested state was yet loaded,
- * thus MMU might not be initialized correctly.
- * Set it again to fix this.
- */
-
- ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
- nested_npt_enabled(svm), false);
- if (WARN_ON_ONCE(ret))
- goto out_free;
-
/*
* All checks done, we can enter guest mode. Userspace provides
@@ -1494,6 +1512,21 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm_switch_vmcb(svm, &svm->nested.vmcb02);
nested_vmcb02_prepare_control(svm);
+
+ /*
+ * While the nested guest CR3 is already checked and set by
+ * KVM_SET_SREGS, it was set when nested state was yet loaded,
+ * thus MMU might not be initialized correctly.
+ * Set it again to fix this.
+ */
+
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+ svm->nested.force_msr_bitmap_recalc = true;
+
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
out_free:
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 5aa45f13b16d..24eb935b6f85 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
- if (!enable_pmu)
+ if (!vcpu->kvm->arch.enable_pmu)
return NULL;
switch (msr) {
@@ -262,12 +262,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* MSR_EVNTSELn */
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
- if (data == pmc->eventsel)
- return 0;
- if (!(data & pmu->reserved_bits)) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel)
reprogram_gp_counter(pmc, data);
- return 0;
- }
+ return 0;
}
return 1;
@@ -284,6 +282,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
pmu->reserved_bits = 0xfffffff000280000ull;
+ pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 17b53457d866..75fa6dd268f0 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free;
INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);
return 0;
@@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
}
}
-static void sev_migrate_from(struct kvm_sev_info *dst,
- struct kvm_sev_info *src)
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
+ struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_sev_info *mirror;
+
dst->active = true;
dst->asid = src->asid;
dst->handle = src->handle;
@@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
src->enc_context_owner = NULL;
list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info =
+ &to_kvm_svm(dst->enc_context_owner)->sev_info;
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
}
static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1681,7 +1709,7 @@ static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
return 0;
}
-int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
@@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
src_sev = &to_kvm_svm(source_kvm)->sev_info;
- /*
- * VMs mirroring src's encryption context rely on it to keep the
- * ASID allocated, but below we are clearing src_sev->asid.
- */
- if (src_sev->num_mirrored_vms) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_source_vcpu;
}
- sev_migrate_from(dst_sev, src_sev);
+
+ sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;
@@ -1761,7 +1781,7 @@ out_fput:
return ret;
}
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
int r;
@@ -1858,8 +1878,8 @@ out:
return r;
}
-int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct enc_region *region;
@@ -1932,8 +1952,8 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
kfree(region);
}
-int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
{
struct enc_region *region;
int ret;
@@ -1972,7 +1992,7 @@ failed:
return ret;
}
-int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
@@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
*/
source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
- source_sev->num_mirrored_vms++;
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
- mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
mirror_sev->asid = source_sev->asid;
@@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->es_active = source_sev->es_active;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
ret = 0;
/*
@@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
- WARN_ON(sev->num_mirrored_vms);
-
if (!sev_guest(kvm))
return;
+ WARN_ON(!list_empty(&sev->mirror_vms));
+
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
- struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
mutex_lock(&owner_kvm->lock);
- if (!WARN_ON(!owner_sev->num_mirrored_vms))
- owner_sev->num_mirrored_vms--;
+ list_del(&sev->mirror_entry);
mutex_unlock(&owner_kvm->lock);
kvm_put_kvm(owner_kvm);
return;
@@ -2173,7 +2192,7 @@ out:
#endif
}
-void sev_hardware_teardown(void)
+void sev_hardware_unsetup(void)
{
if (!sev_enabled)
return;
@@ -2358,7 +2377,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu;
struct ghcb *ghcb;
@@ -2463,7 +2482,7 @@ static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
goto vmgexit_err;
}
- return true;
+ return 0;
vmgexit_err:
vcpu = &svm->vcpu;
@@ -2486,7 +2505,8 @@ vmgexit_err:
ghcb_set_sw_exit_info_1(ghcb, 2);
ghcb_set_sw_exit_info_2(ghcb, reason);
- return false;
+ /* Resume the guest to "return" the error code. */
+ return 1;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
@@ -2545,7 +2565,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
}
#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
-static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct ghcb *ghcb = svm->sev_es.ghcb;
@@ -2598,14 +2618,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
}
scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
- goto e_scratch;
+ return -ENOMEM;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
/* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
kvfree(scratch_va);
- goto e_scratch;
+ return -EFAULT;
}
/*
@@ -2621,13 +2641,13 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
svm->sev_es.ghcb_sa = scratch_va;
svm->sev_es.ghcb_sa_len = len;
- return true;
+ return 0;
e_scratch:
ghcb_set_sw_exit_info_1(ghcb, 2);
ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
- return false;
+ return 1;
}
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
@@ -2765,17 +2785,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
exit_code = ghcb_get_sw_exit_code(ghcb);
- if (!sev_es_validate_vmgexit(svm))
- return 1;
+ ret = sev_es_validate_vmgexit(svm);
+ if (ret)
+ return ret;
sev_es_sync_from_ghcb(svm);
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
- ret = 1;
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
- if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
break;
ret = kvm_sev_es_mmio_read(vcpu,
@@ -2784,7 +2805,8 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_MMIO_WRITE:
- if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
+ ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
+ if (ret)
break;
ret = kvm_sev_es_mmio_write(vcpu,
@@ -2817,6 +2839,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
}
+ ret = 1;
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
@@ -2836,6 +2859,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
{
int count;
int bytes;
+ int r;
if (svm->vmcb->control.exit_info_2 > INT_MAX)
return -EINVAL;
@@ -2844,8 +2868,9 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
if (unlikely(check_mul_overflow(count, size, &bytes)))
return -EINVAL;
- if (!setup_vmgexit_scratch(svm, in, bytes))
- return 1;
+ r = setup_vmgexit_scratch(svm, in, bytes);
+ if (r)
+ return r;
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
count, in);
@@ -2907,20 +2932,16 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
sev_enc_bit));
}
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
+void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- struct vmcb_save_area *hostsa;
-
/*
* As an SEV-ES guest, hardware will restore the host state on VMEXIT,
- * of which one step is to perform a VMLOAD. Since hardware does not
- * perform a VMSAVE on VMRUN, the host savearea must be updated.
+ * of which one step is to perform a VMLOAD. KVM performs the
+ * corresponding VMSAVE in svm_prepare_guest_switch for both
+ * traditional and SEV-ES guests.
*/
- vmsave(__sme_page_pa(sd->save_area));
/* XCR0 is restored on VMEXIT, save the current host value */
- hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
/* PKRU is restored on VMEXIT, save the current host value */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6d97629655e3..bd4c64b362d2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -62,20 +62,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_TSC_RATE (1 << 4)
-#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
-#define SVM_FEATURE_FLUSH_ASID (1 << 6)
-#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
-
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-#define TSC_RATIO_RSVD 0xffffff0000000000ULL
-#define TSC_RATIO_MIN 0x0000000000000001ULL
-#define TSC_RATIO_MAX 0x000000ffffffffffULL
-
static bool erratum_383_found __read_mostly;
u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
@@ -87,7 +75,6 @@ u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
static uint64_t osvw_len = 4, osvw_status;
static DEFINE_PER_CPU(u64, current_tsc_ratio);
-#define TSC_RATIO_DEFAULT 0x0100000000ULL
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
@@ -263,7 +250,7 @@ u32 svm_msrpm_offset(u32 msr)
return MSR_INVALID;
}
-#define MAX_INST_SIZE 15
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
static int get_npt_level(void)
{
@@ -353,7 +340,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
-static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -401,7 +388,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- (void)skip_emulated_instruction(vcpu);
+ (void)svm_skip_emulated_instruction(vcpu);
rip = kvm_rip_read(vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
@@ -480,7 +467,7 @@ static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
if (tsc_scaling)
- wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+ wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
cpu_svm_disable();
@@ -526,8 +513,8 @@ static int svm_hardware_enable(void)
* Set the default value, even if we don't use TSC scaling
* to avoid having stale value in the msr
*/
- wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
- __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
+ wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
+ __this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT);
}
@@ -668,6 +655,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
u32 msr, int read, int write)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u8 bit_read, bit_write;
unsigned long tmp;
u32 offset;
@@ -698,7 +686,7 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
msrpm[offset] = tmp;
svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
-
+ svm->nested.force_msr_bitmap_recalc = true;
}
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
@@ -873,11 +861,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-static void svm_hardware_teardown(void)
+static void svm_hardware_unsetup(void)
{
int cpu;
- sev_hardware_teardown();
+ sev_hardware_unsetup();
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
@@ -1175,7 +1163,7 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
}
-static int svm_create_vcpu(struct kvm_vcpu *vcpu)
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *vmcb01_page;
@@ -1246,7 +1234,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1265,7 +1253,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
@@ -1280,10 +1268,12 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
+ vmsave(__sme_page_pa(sd->save_area));
if (sev_es_guest(vcpu->kvm)) {
- sev_es_prepare_guest_switch(svm, vcpu->cpu);
- } else {
- vmsave(__sme_page_pa(sd->save_area));
+ struct vmcb_save_area *hostsa;
+ hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+
+ sev_es_prepare_switch_to_guest(hostsa);
}
if (tsc_scaling) {
@@ -1315,13 +1305,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
- avic_vcpu_load(vcpu, cpu);
+ __avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
if (kvm_vcpu_apicv_active(vcpu))
- avic_vcpu_put(vcpu);
+ __avic_vcpu_put(vcpu);
svm_prepare_host_switch(vcpu);
@@ -1529,6 +1519,15 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
return save->cpl;
}
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+
static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1563,7 +1562,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
-static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1585,6 +1584,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 hcr0 = cr0;
+ bool old_paging = is_paging(vcpu);
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1601,8 +1601,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
#endif
vcpu->arch.cr0 = cr0;
- if (!npt_enabled)
+ if (!npt_enabled) {
hcr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (old_paging != is_paging(vcpu))
+ svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
/*
* re-enable caching here because the QEMU bios
@@ -1643,11 +1646,15 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = vcpu->arch.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb(vcpu);
+ svm_flush_tlb_current(vcpu);
vcpu->arch.cr4 = cr4;
- if (!npt_enabled)
+ if (!npt_enabled) {
cr4 |= X86_CR4_PAE;
+
+ if (!is_paging(vcpu))
+ cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -2261,7 +2268,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
- if (!skip_emulated_instruction(vcpu))
+ if (!svm_skip_emulated_instruction(vcpu))
return 0;
}
@@ -2685,10 +2692,25 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u64 data = msr->data;
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
- if (!msr->host_initiated && !svm->tsc_scaling_enabled)
- return 1;
- if (data & TSC_RATIO_RSVD)
+ if (!svm->tsc_scaling_enabled) {
+
+ if (!msr->host_initiated)
+ return 1;
+ /*
+ * In case TSC scaling is not enabled, always
+ * leave this MSR at the default value.
+ *
+ * Due to bug in qemu 6.2.0, it would try to set
+ * this msr to 0 if tsc scaling is not enabled.
+ * Ignore this value as well.
+ */
+ if (data != 0 && data != svm->tsc_ratio_msr)
+ return 1;
+ break;
+ }
+
+ if (data & SVM_TSC_RATIO_RSVD)
return 1;
svm->tsc_ratio_msr = data;
@@ -2883,7 +2905,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
* In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it.
*/
- kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
+ kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
++vcpu->stat.irq_window_exits;
return 1;
@@ -3126,7 +3148,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
-static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(u64 exit_code)
{
return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
svm_exit_handlers[exit_code]);
@@ -3146,7 +3168,7 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (!svm_check_exit_valid(vcpu, exit_code))
+ if (!svm_check_exit_valid(exit_code))
return svm_handle_invalid_exit(vcpu, exit_code);
#ifdef CONFIG_RETPOLINE
@@ -3181,7 +3203,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
*error_code = 0;
}
-static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
@@ -3278,7 +3300,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
++vcpu->stat.nmi_injections;
}
-static void svm_set_irq(struct kvm_vcpu *vcpu)
+static void svm_inject_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3291,6 +3313,55 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vector)
+{
+ /*
+ * vcpu->arch.apicv_active must be read after vcpu->mode.
+ * Pairs with smp_store_release in vcpu_enter_guest.
+ */
+ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
+
+ if (!READ_ONCE(vcpu->arch.apicv_active)) {
+ /* Process the interrupt via inject_pending_event */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+ if (in_guest_mode) {
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ. If
+ * the vCPU exits the guest before the doorbell chimes, hardware
+ * will automatically process AVIC interrupts at the next VMRUN.
+ */
+ avic_ring_doorbell(vcpu);
+ } else {
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
+
+static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ kvm_lapic_set_irr(vector, apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode. This guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that svm_complete_interrupt_delivery
+ * will signal the doorbell if the CPU has already entered the guest.
+ */
+ smp_mb__after_atomic();
+ svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
+}
+
static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3338,11 +3409,13 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_nmi_blocked(vcpu))
+ return 0;
+
/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return -EBUSY;
-
- return !svm_nmi_blocked(vcpu);
+ return 1;
}
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -3394,9 +3467,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_svm *svm = to_svm(vcpu);
+
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_interrupt_blocked(vcpu))
+ return 0;
+
/*
* An IRQ must not be injected into L2 if it's supposed to VM-Exit,
* e.g. if the IRQ arrived asynchronously after checking nested events.
@@ -3404,7 +3481,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
return -EBUSY;
- return !svm_interrupt_blocked(vcpu);
+ return 1;
}
static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -3426,7 +3503,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
* via AVIC. In such case, we need to temporarily disable AVIC,
* and fallback to injecting IRQ via V_IRQ.
*/
- kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
svm_set_vintr(svm);
}
}
@@ -3453,17 +3530,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- return 0;
-}
-
-static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
-{
- return 0;
-}
-
-void svm_flush_tlb(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3615,7 +3682,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long vmcb_pa = svm->current_vmcb->pa;
- kvm_guest_enter_irqoff();
+ guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm)) {
__svm_sev_es_vcpu_run(vmcb_pa);
@@ -3635,7 +3702,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
vmload(__sme_page_pa(sd->save_area));
}
- kvm_guest_exit_irqoff();
+ guest_state_exit_irqoff();
}
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
@@ -3837,11 +3904,6 @@ static int __init svm_check_processor_compat(void)
return 0;
}
-static bool svm_cpu_has_accelerated_tpr(void)
-{
- return false;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -3873,6 +3935,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_cpuid_entry2 *best;
+ struct kvm *kvm = vcpu->kvm;
vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -3899,16 +3962,14 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* is exposed to the guest, disable AVIC.
*/
if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
- kvm_request_apicv_update(vcpu->kvm, false,
- APICV_INHIBIT_REASON_X2APIC);
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
/*
* Currently, AVIC does not work with nested virtualization.
* So, we disable AVIC when cpuid for SVM is set in the L1 guest.
*/
if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- kvm_request_apicv_update(vcpu->kvm, false,
- APICV_INHIBIT_REASON_NESTED);
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED);
}
init_vmcb_after_set_cpuid(vcpu);
}
@@ -4135,11 +4196,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_smi_blocked(vcpu))
+ return 0;
+
/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
return -EBUSY;
- return !svm_smi_blocked(vcpu);
+ return 1;
}
static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
@@ -4173,7 +4237,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
* by 0x400 (matches the offset of 'struct vmcb_save_area'
* within 'struct vmcb'). Note: HSAVE area may also be used by
* L1 hypervisor to save additional host context (e.g. KVM does
- * that, see svm_prepare_guest_switch()) which must be
+ * that, see svm_prepare_switch_to_guest()) which must be
* preserved.
*/
if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
@@ -4233,11 +4297,18 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
* Enter the nested guest now
*/
+ vmcb_mark_all_dirty(svm->vmcb01.ptr);
+
vmcb12 = map.hva;
nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+ if (ret)
+ goto unmap_save;
+
+ svm->nested.nested_run_pending = 1;
+
unmap_save:
kvm_vcpu_unmap(vcpu, &map_save, true);
unmap_map:
@@ -4441,21 +4512,20 @@ static int svm_vm_init(struct kvm *kvm)
static struct kvm_x86_ops svm_x86_ops __initdata = {
.name = "kvm_amd",
- .hardware_unsetup = svm_hardware_teardown,
+ .hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
- .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
.has_emulated_msr = svm_has_emulated_msr,
- .vcpu_create = svm_create_vcpu,
- .vcpu_free = svm_free_vcpu,
+ .vcpu_create = svm_vcpu_create,
+ .vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
.vm_size = sizeof(struct kvm_svm),
.vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
- .prepare_guest_switch = svm_prepare_guest_switch,
+ .prepare_switch_to_guest = svm_prepare_switch_to_guest,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
.vcpu_blocking = avic_vcpu_blocking,
@@ -4469,9 +4539,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.get_segment = svm_get_segment,
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
- .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .get_cs_db_l_bits = svm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
- .post_set_cr3 = svm_post_set_cr3,
+ .post_set_cr3 = sev_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
@@ -4486,21 +4556,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
- .tlb_flush_all = svm_flush_tlb,
- .tlb_flush_current = svm_flush_tlb,
- .tlb_flush_gva = svm_flush_tlb_gva,
- .tlb_flush_guest = svm_flush_tlb,
+ .flush_tlb_all = svm_flush_tlb_current,
+ .flush_tlb_current = svm_flush_tlb_current,
+ .flush_tlb_gva = svm_flush_tlb_gva,
+ .flush_tlb_guest = svm_flush_tlb_current,
.vcpu_pre_run = svm_vcpu_pre_run,
- .run = svm_vcpu_run,
- .handle_exit = handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
+ .vcpu_run = svm_vcpu_run,
+ .handle_exit = svm_handle_exit,
+ .skip_emulated_instruction = svm_skip_emulated_instruction,
.update_emulated_instruction = NULL,
.set_interrupt_shadow = svm_set_interrupt_shadow,
.get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
- .set_irq = svm_set_irq,
- .set_nmi = svm_inject_nmi,
+ .inject_irq = svm_inject_irq,
+ .inject_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
@@ -4510,18 +4580,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
- .set_virtual_apic_mode = svm_set_virtual_apic_mode,
- .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
- .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
- .load_eoi_exitmap = svm_load_eoi_exitmap,
- .hwapic_irr_update = svm_hwapic_irr_update,
- .hwapic_isr_update = svm_hwapic_isr_update,
- .apicv_post_state_restore = avic_post_state_restore,
-
- .set_tss_addr = svm_set_tss_addr,
- .set_identity_map_addr = svm_set_identity_map_addr,
- .get_mt_mask = svm_get_mt_mask,
+ .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+ .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
+ .apicv_post_state_restore = avic_apicv_post_state_restore,
+ .get_mt_mask = svm_get_mt_mask,
.get_exit_info = svm_get_exit_info,
.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
@@ -4545,9 +4608,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.pmu_ops = &amd_pmu_ops,
.nested_ops = &svm_nested_ops,
- .deliver_posted_interrupt = svm_deliver_avic_intr,
- .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
- .update_pi_irte = svm_update_pi_irte,
+ .deliver_interrupt = svm_deliver_interrupt,
+ .pi_update_irte = avic_pi_update_irte,
.setup_mce = svm_setup_mce,
.smi_allowed = svm_smi_allowed,
@@ -4555,12 +4617,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.leave_smm = svm_leave_smm,
.enable_smi_window = svm_enable_smi_window,
- .mem_enc_op = svm_mem_enc_op,
- .mem_enc_reg_region = svm_register_enc_region,
- .mem_enc_unreg_region = svm_unregister_enc_region,
+ .mem_enc_ioctl = sev_mem_enc_ioctl,
+ .mem_enc_register_region = sev_mem_enc_register_region,
+ .mem_enc_unregister_region = sev_mem_enc_unregister_region,
- .vm_copy_enc_context_from = svm_vm_copy_asid_from,
- .vm_move_enc_context_from = svm_vm_migrate_from,
+ .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
+ .vm_move_enc_context_from = sev_vm_move_enc_context_from,
.can_emulate_instruction = svm_can_emulate_instruction,
@@ -4622,6 +4684,7 @@ static __init void svm_set_cpu_caps(void)
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
+ kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
@@ -4689,10 +4752,10 @@ static __init int svm_hardware_setup(void)
} else {
pr_info("TSC scaling supported\n");
kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 32;
}
}
+ kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
@@ -4804,7 +4867,7 @@ static __init int svm_hardware_setup(void)
return 0;
err:
- svm_hardware_teardown();
+ svm_hardware_unsetup();
return r;
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 73525353e424..f77a7d2d39dd 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -22,6 +22,8 @@
#include <asm/svm.h>
#include <asm/sev-common.h>
+#include "kvm_cache_regs.h"
+
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
#define IOPM_SIZE PAGE_SIZE * 3
@@ -79,7 +81,8 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
- unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
+ struct list_head mirror_vms; /* List of VMs mirroring */
+ struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};
@@ -137,6 +140,8 @@ struct vmcb_ctrl_area_cached {
u32 event_inj_err;
u64 nested_cr3;
u64 virt_ext;
+ u32 clean;
+ u8 reserved_sw[32];
};
struct svm_nested_state {
@@ -163,6 +168,15 @@ struct svm_nested_state {
struct vmcb_save_area_cached save;
bool initialized;
+
+ /*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
};
struct vcpu_sev_es_state {
@@ -321,7 +335,7 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
/*
* Only the PDPTRs are loaded on demand into the shadow MMU. All other
- * fields are synchronized in handle_exit, because accessing the VMCB is cheap.
+ * fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
*
* CR3 might be out of date in the VMCB but it is not marked dirty; instead,
* KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
@@ -480,7 +494,6 @@ void svm_vcpu_free_msrpm(u32 *msrpm);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void svm_flush_tlb(struct kvm_vcpu *vcpu);
void disable_nmi_singlestep(struct vcpu_svm *svm);
bool svm_smi_blocked(struct kvm_vcpu *vcpu);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
@@ -489,6 +502,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value);
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write);
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vec);
/* nested.c */
@@ -556,17 +571,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
-#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
-#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
-#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
-#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
-#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
-
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -574,21 +578,20 @@ void avic_init_vmcb(struct vcpu_svm *svm);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void avic_vcpu_put(struct kvm_vcpu *vcpu);
-void avic_post_state_restore(struct kvm_vcpu *vcpu);
-void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-bool svm_check_apicv_inhibit_reasons(ulong bit);
-void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
-void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
-void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
-int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
-bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
+void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void __avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
+void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
+void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_ring_doorbell(struct kvm_vcpu *vcpu);
/* sev.c */
@@ -599,17 +602,17 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
extern unsigned int max_sev_asid;
void sev_vm_destroy(struct kvm *kvm);
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp);
-int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range);
-int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range);
-int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
-int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd);
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
-void sev_hardware_teardown(void);
+void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
void sev_free_vcpu(struct kvm_vcpu *vcpu);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
@@ -617,7 +620,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
+void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
index 98aa981c04ec..8cdc62c74a96 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.c
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -4,7 +4,6 @@
*/
#include <linux/kvm_host.h>
-#include "kvm_cache_regs.h"
#include <asm/mshyperv.h>
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 489ca56212c6..e2fc59380465 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -7,35 +7,12 @@
#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
#if IS_ENABLED(CONFIG_HYPERV)
-#include <asm/mshyperv.h>
-#include "hyperv.h"
#include "kvm_onhyperv.h"
+#include "svm/hyperv.h"
static struct kvm_x86_ops svm_x86_ops;
-/*
- * Hyper-V uses the software reserved 32 bytes in VMCB
- * control area to expose SVM enlightenments to guests.
- */
-struct hv_enlightenments {
- struct __packed hv_enlightenments_control {
- u32 nested_flush_hypercall:1;
- u32 msr_bitmap:1;
- u32 enlightened_npt_tlb: 1;
- u32 reserved:29;
- } __packed hv_enlightenments_control;
- u32 hv_vp_id;
- u64 hv_vm_id;
- u64 partition_assist_page;
- u64 reserved;
-} __packed;
-
-/*
- * Hyper-V uses the software reserved clean bit in VMCB
- */
-#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
-
int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 92e6f6702f00..e3a24b8f04be 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -64,9 +64,9 @@ TRACE_EVENT(kvm_hypercall,
* Tracepoint for hypercall.
*/
TRACE_EVENT(kvm_hv_hypercall,
- TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
- __u64 ingpa, __u64 outgpa),
- TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+ TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt,
+ __u16 rep_idx, __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa),
TP_STRUCT__entry(
__field( __u16, rep_cnt )
@@ -74,6 +74,7 @@ TRACE_EVENT(kvm_hv_hypercall,
__field( __u64, ingpa )
__field( __u64, outgpa )
__field( __u16, code )
+ __field( __u16, var_cnt )
__field( bool, fast )
),
@@ -83,13 +84,14 @@ TRACE_EVENT(kvm_hv_hypercall,
__entry->ingpa = ingpa;
__entry->outgpa = outgpa;
__entry->code = code;
+ __entry->var_cnt = var_cnt;
__entry->fast = fast;
),
- TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
__entry->code, __entry->fast ? "fast" : "slow",
- __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
- __entry->outgpa)
+ __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx,
+ __entry->ingpa, __entry->outgpa)
);
TRACE_EVENT(kvm_hv_hypercall_done,
@@ -251,13 +253,13 @@ TRACE_EVENT(kvm_cpuid,
* Tracepoint for apic access.
*/
TRACE_EVENT(kvm_apic,
- TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_PROTO(unsigned int rw, unsigned int reg, u64 val),
TP_ARGS(rw, reg, val),
TP_STRUCT__entry(
__field( unsigned int, rw )
__field( unsigned int, reg )
- __field( unsigned int, val )
+ __field( u64, val )
),
TP_fast_assign(
@@ -266,7 +268,7 @@ TRACE_EVENT(kvm_apic,
__entry->val = val;
),
- TP_printk("apic_%s %s = 0x%x",
+ TP_printk("apic_%s %s = 0x%llx",
__entry->rw ? "write" : "read",
__print_symbolic(__entry->reg, kvm_trace_symbol_apic),
__entry->val)
@@ -1337,23 +1339,25 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
__entry->vcpu_id, __entry->timer_index)
);
-TRACE_EVENT(kvm_apicv_update_request,
- TP_PROTO(bool activate, unsigned long bit),
- TP_ARGS(activate, bit),
+TRACE_EVENT(kvm_apicv_inhibit_changed,
+ TP_PROTO(int reason, bool set, unsigned long inhibits),
+ TP_ARGS(reason, set, inhibits),
TP_STRUCT__entry(
- __field(bool, activate)
- __field(unsigned long, bit)
+ __field(int, reason)
+ __field(bool, set)
+ __field(unsigned long, inhibits)
),
TP_fast_assign(
- __entry->activate = activate;
- __entry->bit = bit;
+ __entry->reason = reason;
+ __entry->set = set;
+ __entry->inhibits = inhibits;
),
- TP_printk("%s bit=%lu",
- __entry->activate ? "activate" : "deactivate",
- __entry->bit)
+ TP_printk("%s reason=%u, inhibits=0x%lx",
+ __entry->set ? "set" : "cleared",
+ __entry->reason, __entry->inhibits)
);
TRACE_EVENT(kvm_apicv_accept_irq,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ba34e94049c7..f18744f7ff82 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -246,8 +246,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
src = &prev->host_state;
dest = &vmx->loaded_vmcs->host_state;
- vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel,
- src->fs_base, src->gs_base);
+ vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
dest->ds_sel = src->ds_sel;
@@ -321,7 +320,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
vmx->nested.pi_desc = NULL;
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
nested_release_evmcs(vcpu);
@@ -1126,15 +1125,15 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
return -EINVAL;
}
- if (!nested_ept)
- kvm_mmu_new_pgd(vcpu, cr3);
-
vcpu->arch.cr3 = cr3;
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
+ if (!nested_ept)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
return 0;
}
@@ -3056,7 +3055,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr4;
+ unsigned long cr3, cr4;
bool vm_fail;
if (!nested_early_check)
@@ -3079,6 +3078,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
*/
vmcs_writel(GUEST_RFLAGS, 0);
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -4797,7 +4802,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
return 0;
}
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+ bool vcpu_has_perf_global_ctrl)
{
struct vcpu_vmx *vmx;
@@ -4805,7 +4811,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
return;
vmx = to_vmx(vcpu);
- if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+ if (vcpu_has_perf_global_ctrl) {
vmx->nested.msrs.entry_ctls_high |=
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
vmx->nested.msrs.exit_ctls_high |=
@@ -5006,7 +5012,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
vmx->nested.current_vmptr >> PAGE_SHIFT,
vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
vmx->nested.current_vmptr = INVALID_GPA;
}
@@ -5465,7 +5471,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
roots_to_free = 0;
- if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
+ if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
operand.eptp))
roots_to_free |= KVM_MMU_ROOT_CURRENT;
@@ -5485,7 +5491,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
}
if (roots_to_free)
- kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
return nested_vmx_succeed(vcpu);
}
@@ -5574,7 +5580,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
* TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
*/
if (!enable_ept)
- kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu);
+ kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
return nested_vmx_succeed(vcpu);
}
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index b69a80f43b37..c92cea0b8ccc 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+ bool vcpu_has_perf_global_ctrl);
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
int size);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 466d18fc0c5d..bc3f8512bb64 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -389,6 +389,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
struct kvm_pmc *pmc;
u32 msr = msr_info->index;
u64 data = msr_info->data;
+ u64 reserved_bits;
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
@@ -443,7 +444,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
return 0;
- if (!(data & pmu->reserved_bits)) {
+ reserved_bits = pmu->reserved_bits;
+ if ((pmc->idx == 2) &&
+ (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
+ reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
+ if (!(data & reserved_bits)) {
reprogram_gp_counter(pmc, data);
return 0;
}
@@ -485,9 +490,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry || !enable_pmu)
+ if (!entry || !vcpu->kvm->arch.enable_pmu)
return;
eax.full = entry->eax;
edx.full = entry->edx;
@@ -533,15 +539,18 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
entry = kvm_find_cpuid_entry(vcpu, 7, 0);
if (entry &&
(boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
- (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
- pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) {
+ pmu->reserved_bits ^= HSW_IN_TX;
+ pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+ }
bitmap_set(pmu->all_valid_pmc_idx,
0, pmu->nr_arch_gp_counters);
bitmap_set(pmu->all_valid_pmc_idx,
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
- nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+ nested_vmx_pmu_refresh(vcpu,
+ intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL));
if (intel_pmu_lbr_is_compatible(vcpu))
x86_perf_get_lbr(&lbr_desc->records);
@@ -565,7 +574,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->gp_counters[i].current_config = 0;
}
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
@@ -591,7 +600,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmc->counter = pmc->eventsel = 0;
}
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmc = &pmu->fixed_counters[i];
pmc_stop_counter(pmc);
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index aa1fe9085d77..3834bb30ce54 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -244,7 +244,7 @@ void vmx_pi_start_assignment(struct kvm *kvm)
}
/*
- * pi_update_irte - set IRTE for Posted-Interrupts
+ * vmx_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
@@ -252,8 +252,8 @@ void vmx_pi_start_assignment(struct kvm *kvm)
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
- bool set)
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index eb14e76b84ef..9a45d5c9f116 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -97,8 +97,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
- bool set);
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index aca3ae2a02f3..04d170c4b61e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -541,11 +541,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static inline bool report_flexpriority(void)
-{
- return flexpriority_enabled;
-}
-
static int possible_passthrough_msr_slot(u32 msr)
{
u32 i;
@@ -645,10 +640,10 @@ static void __loaded_vmcs_clear(void *arg)
/*
* Ensure all writes to loaded_vmcs, including deleting it from its
- * current percpu list, complete before setting loaded_vmcs->vcpu to
- * -1, otherwise a different cpu can see vcpu == -1 first and add
- * loaded_vmcs to its percpu list before it's deleted from this cpu's
- * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+ * current percpu list, complete before setting loaded_vmcs->cpu to
+ * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+ * and add loaded_vmcs to its percpu list before it's deleted from this
+ * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
@@ -1080,14 +1075,9 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
-void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
- u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base)
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base)
{
- if (unlikely(cr3 != host->cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- host->cr3 = cr3;
- }
if (unlikely(fs_sel != host->fs_sel)) {
if (!(fs_sel & 7))
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
@@ -1182,9 +1172,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = segment_base(gs_sel);
#endif
- vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(),
- fs_sel, gs_sel, fs_base, gs_base);
-
+ vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
vmx->guest_state_loaded = true;
}
@@ -2341,7 +2329,7 @@ fault:
return -EFAULT;
}
-static int hardware_enable(void)
+static int vmx_hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2382,7 +2370,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void hardware_disable(void)
+static void vmx_hardware_disable(void)
{
vmclear_local_loaded_vmcss();
@@ -2878,21 +2866,17 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
/* Nothing to do if hardware doesn't support EFER. */
- if (!msr)
+ if (!vmx_find_uret_msr(vmx, MSR_EFER))
return 0;
vcpu->arch.efer = efer;
- if (efer & EFER_LMA) {
- vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
- msr->data = efer;
- } else {
- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+ if (efer & EFER_LMA)
+ vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
+ else
+ vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
- msr->data = efer & ~EFER_LME;
- }
vmx_setup_uret_msrs(vmx);
return 0;
}
@@ -2918,7 +2902,6 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
static void exit_lmode(struct kvm_vcpu *vcpu)
{
- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
@@ -2957,7 +2940,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 root_hpa = mmu->root_hpa;
+ u64 root_hpa = mmu->root.hpa;
/* No flush required if the current context is invalid. */
if (!VALID_PAGE(root_hpa))
@@ -3937,31 +3920,33 @@ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
#ifdef CONFIG_SMP
if (vcpu->mode == IN_GUEST_MODE) {
/*
- * The vector of interrupt to be delivered to vcpu had
- * been set in PIR before this function.
+ * The vector of the virtual has already been set in the PIR.
+ * Send a notification event to deliver the virtual interrupt
+ * unless the vCPU is the currently running vCPU, i.e. the
+ * event is being sent from a fastpath VM-Exit handler, in
+ * which case the PIR will be synced to the vIRR before
+ * re-entering the guest.
*
- * Following cases will be reached in this block, and
- * we always send a notification event in all cases as
- * explained below.
+ * When the target is not the running vCPU, the following
+ * possibilities emerge:
*
- * Case 1: vcpu keeps in non-root mode. Sending a
- * notification event posts the interrupt to vcpu.
+ * Case 1: vCPU stays in non-root mode. Sending a notification
+ * event posts the interrupt to the vCPU.
*
- * Case 2: vcpu exits to root mode and is still
- * runnable. PIR will be synced to vIRR before the
- * next vcpu entry. Sending a notification event in
- * this case has no effect, as vcpu is not in root
- * mode.
+ * Case 2: vCPU exits to root mode and is still runnable. The
+ * PIR will be synced to the vIRR before re-entering the guest.
+ * Sending a notification event is ok as the host IRQ handler
+ * will ignore the spurious event.
*
- * Case 3: vcpu exits to root mode and is blocked.
- * vcpu_block() has already synced PIR to vIRR and
- * never blocks vcpu if vIRR is not cleared. Therefore,
- * a blocked vcpu here does not wait for any requested
- * interrupts in PIR, and sending a notification event
- * which has no effect is safe here.
+ * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+ * has already synced PIR to vIRR and never blocks the vCPU if
+ * the vIRR is not empty. Therefore, a blocked vCPU here does
+ * not wait for any requested interrupts in PIR, and sending a
+ * notification event also results in a benign, spurious event.
*/
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ if (vcpu != kvm_get_running_vcpu())
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return;
}
#endif
@@ -4041,6 +4026,21 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
return 0;
}
+static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ if (vmx_deliver_posted_interrupt(vcpu, vector)) {
+ kvm_lapic_set_irr(vector, apic);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ }
+}
+
/*
* Set up the vmcs's constant host-state fields, i.e., host-state fields that
* will not change in the lifetime of the guest.
@@ -5169,7 +5169,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!kvm_require_dr(vcpu, dr))
return 1;
- if (kvm_x86_ops.get_cpl(vcpu) > 0)
+ if (vmx_get_cpl(vcpu) > 0)
goto out;
dr7 = vmcs_readl(GUEST_DR7);
@@ -5302,9 +5302,16 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
- u32 offset = exit_qualification & 0xfff;
- /* APIC-write VM exit is trap-like and thus no need to adjust IP */
+ /*
+ * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
+ * hardware has done any necessary aliasing, offset adjustments, etc...
+ * for the access. I.e. the correct value has already been written to
+ * the vAPIC page for the correct 16-byte chunk. KVM needs only to
+ * retrieve the register value and emulate the access.
+ */
+ u32 offset = exit_qualification & 0xff0;
+
kvm_apic_write_nodecode(vcpu, offset);
return 1;
}
@@ -6754,7 +6761,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
- kvm_guest_enter_irqoff();
+ guest_state_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6770,13 +6777,13 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = native_read_cr2();
- kvm_guest_exit_irqoff();
+ guest_state_exit_irqoff();
}
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr4;
+ unsigned long cr3, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -6819,6 +6826,19 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+ * it switches back to the current->mm, which can occur in KVM context
+ * when switching to a temporary mm to patch kernel code, e.g. if KVM
+ * toggles a static key while handling a VM-Exit.
+ */
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -6954,7 +6974,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return vmx_exit_handlers_fastpath(vcpu);
}
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6965,7 +6985,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
free_loaded_vmcs(vmx->loaded_vmcs);
}
-static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
+static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
@@ -7327,11 +7347,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_secondary_exec_control(vmx));
if (nested_vmx_allowed(vcpu))
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ vmx->msr_ia32_feature_control_valid_bits &=
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
@@ -7644,6 +7664,7 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
if (ret)
return ret;
+ vmx->nested.nested_run_pending = 1;
vmx->nested.smm.guest_mode = false;
}
return 0;
@@ -7669,7 +7690,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void hardware_unsetup(void)
+static void vmx_hardware_unsetup(void)
{
kvm_set_posted_intr_wakeup_handler(NULL);
@@ -7679,34 +7700,33 @@ static void hardware_unsetup(void)
free_kvm_area();
}
-static bool vmx_check_apicv_inhibit_reasons(ulong bit)
+static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
- return supported & BIT(bit);
+ return supported & BIT(reason);
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.name = "kvm_intel",
- .hardware_unsetup = hardware_unsetup,
+ .hardware_unsetup = vmx_hardware_unsetup,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
+ .hardware_enable = vmx_hardware_enable,
+ .hardware_disable = vmx_hardware_disable,
.has_emulated_msr = vmx_has_emulated_msr,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
.vcpu_reset = vmx_vcpu_reset,
- .prepare_guest_switch = vmx_prepare_switch_to_guest,
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
@@ -7734,21 +7754,21 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.set_rflags = vmx_set_rflags,
.get_if_flag = vmx_get_if_flag,
- .tlb_flush_all = vmx_flush_tlb_all,
- .tlb_flush_current = vmx_flush_tlb_current,
- .tlb_flush_gva = vmx_flush_tlb_gva,
- .tlb_flush_guest = vmx_flush_tlb_guest,
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
.vcpu_pre_run = vmx_vcpu_pre_run,
- .run = vmx_vcpu_run,
+ .vcpu_run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
.update_emulated_instruction = vmx_update_emulated_instruction,
.set_interrupt_shadow = vmx_set_interrupt_shadow,
.get_interrupt_shadow = vmx_get_interrupt_shadow,
.patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
.cancel_injection = vmx_cancel_injection,
.interrupt_allowed = vmx_interrupt_allowed,
@@ -7768,7 +7788,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .deliver_interrupt = vmx_deliver_interrupt,
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
.set_tss_addr = vmx_set_tss_addr,
@@ -7801,8 +7821,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
- .update_pi_irte = pi_update_irte,
- .start_assignment = vmx_pi_start_assignment,
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
@@ -7955,12 +7975,11 @@ static __init int hardware_setup(void)
if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
- if (cpu_has_vmx_tsc_scaling()) {
+ if (cpu_has_vmx_tsc_scaling())
kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 48;
- }
+ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 48;
kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
@@ -8037,7 +8056,7 @@ static __init int hardware_setup(void)
vmx_set_cpu_caps();
r = alloc_kvm_area();
- if (r)
+ if (r && nested)
nested_vmx_hardware_unsetup();
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
@@ -8117,7 +8136,6 @@ static int __init vmx_init(void)
ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
(ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
KVM_EVMCS_VERSION) {
- int cpu;
/* Check that we have assist pages on all online CPUs */
for_each_online_cpu(cpu) {
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 7f2c82e7f38f..9c6bfcd84008 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -374,9 +374,8 @@ int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
-void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
- u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base);
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 74b53a16f38a..0c0ca599a353 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -90,6 +90,8 @@
u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
+#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
+
#define emul_to_vcpu(ctxt) \
((struct kvm_vcpu *)(ctxt)->vcpu)
@@ -108,6 +110,8 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
+
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -124,16 +128,15 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_x86_ops);
#define KVM_X86_OP(func) \
DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
*(((struct kvm_x86_ops *)0)->func));
-#define KVM_X86_OP_NULL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
-EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
@@ -192,6 +195,9 @@ bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);
+bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -758,7 +764,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
if ((fault->error_code & PFERR_PRESENT_MASK) &&
!(fault->error_code & PFERR_RSVD_MASK))
kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
- fault_mmu->root_hpa);
+ fault_mmu->root.hpa);
fault_mmu->inject_page_fault(vcpu, fault);
return fault->nested_page_fault;
@@ -851,7 +857,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
* Shadow page roots need to be reconstructed instead.
*/
if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
- kvm_mmu_free_roots(vcpu, mmu, KVM_MMU_ROOT_CURRENT);
+ kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
@@ -867,6 +873,13 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
+
+ /*
+ * Clearing CR0.PG is defined to flush the TLB from the guest's
+ * perspective.
+ */
+ if (!(cr0 & X86_CR0_PG))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
@@ -982,6 +995,18 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
+static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_fpu.fpstate->user_xfeatures;
+}
+
+#ifdef CONFIG_X86_64
+static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
+{
+ return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC;
+}
+#endif
+
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
@@ -1001,7 +1026,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
* saving. However, xcr0 bit 0 is always set, even if the
* emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
*/
- valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
+ valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
return 1;
@@ -1053,28 +1078,43 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
/*
- * If any role bit is changed, the MMU needs to be reset.
- *
- * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed.
* If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
* according to the SDM; however, stale prev_roots could be reused
* incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
- * free them all. KVM_REQ_MMU_RELOAD is fit for the both cases; it
- * is slow, but changing CR4.PCIDE is a rare case.
- *
- * If CR4.PGE is changed, the guest TLB must be flushed.
+ * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
+ * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
+ * so fall through.
+ */
+ if (!tdp_enabled &&
+ (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
+ kvm_mmu_unload(vcpu);
+
+ /*
+ * The TLB has to be flushed for all PCIDs if any of the following
+ * (architecturally required) changes happen:
+ * - CR4.PCIDE is changed from 1 to 0
+ * - CR4.PGE is toggled
*
- * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and
- * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence
- * the usage of "else if".
+ * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
*/
- if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
- kvm_mmu_reset_context(vcpu);
- else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE)
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
- else if ((cr4 ^ old_cr4) & X86_CR4_PGE)
+ if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+
+ /*
+ * The TLB has to be flushed for the current PCID if any of the
+ * following (architecturally required) changes happen:
+ * - CR4.SMEP is changed from 0 to 1
+ * - CR4.PAE is toggled
+ */
+ else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
+ ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
@@ -1152,7 +1192,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
- kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
}
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
@@ -1642,8 +1682,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return r;
}
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
return 0;
@@ -1709,9 +1748,6 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
{
struct msr_data msr;
- if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
- return KVM_MSR_RET_FILTERED;
-
switch (index) {
case MSR_FS_BASE:
case MSR_GS_BASE:
@@ -1735,7 +1771,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
- data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+ data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
break;
case MSR_TSC_AUX:
if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
@@ -1793,9 +1829,6 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
struct msr_data msr;
int ret;
- if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
- return KVM_MSR_RET_FILTERED;
-
switch (index) {
case MSR_TSC_AUX:
if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
@@ -1832,6 +1865,20 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
return ret;
}
+static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return KVM_MSR_RET_FILTERED;
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
+}
+
+static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return KVM_MSR_RET_FILTERED;
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
+}
+
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
return kvm_get_msr_ignored_check(vcpu, index, data, false);
@@ -1914,7 +1961,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
u64 data;
int r;
- r = kvm_get_msr(vcpu, ecx, &data);
+ r = kvm_get_msr_with_filter(vcpu, ecx, &data);
if (!r) {
trace_kvm_msr_read(ecx, data);
@@ -1939,7 +1986,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
u64 data = kvm_read_edx_eax(vcpu);
int r;
- r = kvm_set_msr(vcpu, ecx, data);
+ r = kvm_set_msr_with_filter(vcpu, ecx, data);
if (!r) {
trace_kvm_msr_write(ecx, data);
@@ -2012,17 +2059,10 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
return 1;
if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
- ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
- ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
- ((u32)(data >> 32) != X2APIC_BROADCAST)) {
-
- data &= ~(1 << 12);
- kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
- kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
- kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
- trace_kvm_apic_write(APIC_ICR, (u32)data);
- return 0;
- }
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+ ((u32)(data >> 32) != X2APIC_BROADCAST))
+ return kvm_x2apic_icr_write(vcpu->arch.apic, data);
return 1;
}
@@ -2349,10 +2389,12 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
+#ifdef CONFIG_X86_64
static inline int gtod_is_based_on_tsc(int mode)
{
return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
}
+#endif
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
@@ -2397,7 +2439,7 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
}
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
+u64 kvm_scale_tsc(u64 tsc, u64 ratio)
{
u64 _tsc = tsc;
@@ -2412,7 +2454,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
- tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
return target_tsc - tsc;
}
@@ -2420,7 +2462,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
return vcpu->arch.l1_tsc_offset +
- kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
+ kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
@@ -2623,7 +2665,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
WARN_ON(adjustment < 0);
- adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
+ adjustment = kvm_scale_tsc((u64) adjustment,
vcpu->arch.l1_tsc_scaling_ratio);
adjust_tsc_offset_guest(vcpu, adjustment);
}
@@ -3043,7 +3085,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
- tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
+ tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
@@ -3266,7 +3308,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_all)(vcpu);
+ static_call(kvm_x86_flush_tlb_all)(vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -3284,14 +3326,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_prev_roots(vcpu);
}
- static_call(kvm_x86_tlb_flush_guest)(vcpu);
+ static_call(kvm_x86_flush_tlb_guest)(vcpu);
}
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
}
/*
@@ -3704,8 +3746,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_XFD))
return 1;
- if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
- vcpu->arch.guest_supported_xcr0))
+ if (data & ~kvm_guest_supported_xfd(vcpu))
return 1;
fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
@@ -3715,8 +3756,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_XFD))
return 1;
- if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
- vcpu->arch.guest_supported_xcr0))
+ if (data & ~kvm_guest_supported_xfd(vcpu))
return 1;
vcpu->arch.guest_fpu.xfd_err = data;
@@ -3855,7 +3895,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
ratio = vcpu->arch.tsc_scaling_ratio;
}
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
+ msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
break;
}
case MSR_MTRRcap:
@@ -4231,6 +4271,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_SYS_ATTRIBUTES:
+ case KVM_CAP_VAPIC:
+ case KVM_CAP_ENABLE_CAP:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4271,9 +4313,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
- case KVM_CAP_VAPIC:
- r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
- break;
case KVM_CAP_NR_VCPUS:
r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
break;
@@ -4328,7 +4367,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
if (r < sizeof(struct kvm_xsave))
r = sizeof(struct kvm_xsave);
break;
+ case KVM_CAP_PMU_CAPABILITY:
+ r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
+ break;
}
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = KVM_X86_VALID_QUIRKS;
+ break;
default:
break;
}
@@ -4340,7 +4385,7 @@ static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
void __user *uaddr = (void __user*)(unsigned long)attr->addr;
if ((u64)(unsigned long)uaddr != attr->addr)
- return ERR_PTR(-EFAULT);
+ return ERR_PTR_USR(-EFAULT);
return uaddr;
}
@@ -5130,7 +5175,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
kvm->arch.last_tsc_offset == offset);
- tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
ns = get_kvmclock_base_ns();
__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
@@ -5875,6 +5920,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return -EINVAL;
switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+ break;
+ fallthrough;
case KVM_CAP_DISABLE_QUIRKS:
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
@@ -5896,7 +5946,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
- kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -5975,15 +6025,18 @@ split_irqchip_unlock:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
r = -EINVAL;
- if (kvm_x86_ops.vm_copy_enc_context_from)
- r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
- return r;
+ if (!kvm_x86_ops.vm_copy_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+ break;
case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
r = -EINVAL;
- if (kvm_x86_ops.vm_move_enc_context_from)
- r = kvm_x86_ops.vm_move_enc_context_from(
- kvm, cap->args[0]);
- return r;
+ if (!kvm_x86_ops.vm_move_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+ break;
case KVM_CAP_EXIT_HYPERCALL:
if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
r = -EINVAL;
@@ -5999,6 +6052,18 @@ split_irqchip_unlock:
kvm->arch.exit_on_emulation_error = cap->args[0];
r = 0;
break;
+ case KVM_CAP_PMU_CAPABILITY:
+ r = -EINVAL;
+ if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
default:
r = -EINVAL;
break;
@@ -6278,7 +6343,7 @@ set_identity_unlock:
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
- kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -6458,8 +6523,10 @@ set_pit2_out:
break;
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_op)
- r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
+ if (!kvm_x86_ops.mem_enc_ioctl)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -6470,8 +6537,10 @@ set_pit2_out:
goto out;
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_reg_region)
- r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
+ if (!kvm_x86_ops.mem_enc_register_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -6482,8 +6551,10 @@ set_pit2_out:
goto out;
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_unreg_region)
- r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
+ if (!kvm_x86_ops.mem_enc_unregister_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -6514,7 +6585,7 @@ static void kvm_init_msr_list(void)
u32 dummy[2];
unsigned i;
- BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
+ BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
"Please update the fixed PMCs in msrs_to_saved_all[]");
perf_get_x86_pmu_capability(&x86_pmu);
@@ -6663,7 +6734,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
static_call(kvm_x86_get_segment)(vcpu, var, seg);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
struct x86_exception *exception)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -6683,7 +6754,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
@@ -6693,7 +6764,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
@@ -6703,7 +6774,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
@@ -6719,7 +6790,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 access,
+ struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -6756,7 +6827,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -6781,7 +6852,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -6800,9 +6871,11 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception, bool system)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = 0;
+ u64 access = 0;
- if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ if (system)
+ access |= PFERR_IMPLICIT_ACCESS;
+ else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -6818,7 +6891,7 @@ static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
}
static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 access,
+ struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -6852,9 +6925,11 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
bool system)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = PFERR_WRITE_MASK;
+ u64 access = PFERR_WRITE_MASK;
- if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ if (system)
+ access |= PFERR_IMPLICIT_ACCESS;
+ else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -6921,7 +6996,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
bool write)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
/*
@@ -7564,13 +7639,13 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
return;
}
-static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 *pdata)
+static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r;
- r = kvm_get_msr(vcpu, msr_index, pdata);
+ r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
complete_emulated_rdmsr, r)) {
@@ -7581,13 +7656,13 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
return r;
}
-static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 data)
+static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r;
- r = kvm_set_msr(vcpu, msr_index, data);
+ r = kvm_set_msr_with_filter(vcpu, msr_index, data);
if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
complete_emulated_msr_access, r)) {
@@ -7598,6 +7673,18 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return r;
}
+static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
+{
+ return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+}
+
+static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
+{
+ return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+}
+
static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
@@ -7661,6 +7748,11 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
}
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
@@ -7731,6 +7823,8 @@ static const struct x86_emulate_ops emulate_ops = {
.set_dr = emulator_set_dr,
.get_smbase = emulator_get_smbase,
.set_smbase = emulator_set_smbase,
+ .set_msr_with_filter = emulator_set_msr_with_filter,
+ .get_msr_with_filter = emulator_get_msr_with_filter,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
@@ -7743,6 +7837,7 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_long_mode = emulator_guest_has_long_mode,
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
+ .guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.exiting_smm = emulator_exiting_smm,
@@ -8411,8 +8506,7 @@ writeback:
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
- if (kvm_x86_ops.update_emulated_instruction)
- static_call(kvm_x86_update_emulated_instruction)(vcpu);
+ static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -8811,6 +8905,12 @@ int kvm_arch_init(void *opaque)
goto out;
}
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+
r = -ENOMEM;
x86_emulator_cache = kvm_alloc_emulator_cache();
@@ -8838,7 +8938,7 @@ int kvm_arch_init(void *opaque)
}
if (pi_inject_timer == -1)
- pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
+ pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
#ifdef CONFIG_X86_64
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
@@ -8940,6 +9040,13 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
return -KVM_EOPNOTSUPP;
+ /*
+ * When tsc is in permanent catchup mode guests won't be able to use
+ * pvclock_read_retry loop to get consistent view of pvclock
+ */
+ if (vcpu->arch.tsc_always_catchup)
+ return -KVM_EOPNOTSUPP;
+
if (!kvm_get_walltime_and_clockread(&ts, &cycle))
return -KVM_EOPNOTSUPP;
@@ -8963,7 +9070,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
*
* @apicid - apicid of vcpu to be kicked.
*/
-static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
{
struct kvm_lapic_irq lapic_irq;
@@ -8983,15 +9090,29 @@ bool kvm_apicv_activated(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_apicv_activated);
+
+static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
+ enum kvm_apicv_inhibit reason, bool set)
+{
+ if (set)
+ __set_bit(reason, inhibits);
+ else
+ __clear_bit(reason, inhibits);
+
+ trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
+}
+
static void kvm_apicv_init(struct kvm *kvm)
{
+ unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
+
init_rwsem(&kvm->arch.apicv_update_lock);
- set_bit(APICV_INHIBIT_REASON_ABSENT,
- &kvm->arch.apicv_inhibit_reasons);
+ set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
+
if (!enable_apicv)
- set_bit(APICV_INHIBIT_REASON_DISABLE,
- &kvm->arch.apicv_inhibit_reasons);
+ set_or_clear_apicv_inhibit(inhibits,
+ APICV_INHIBIT_REASON_ABSENT, true);
}
static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
@@ -9082,7 +9203,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
break;
- kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+ kvm_pv_kick_cpu_op(vcpu->kvm, a1);
kvm_sched_yield(vcpu, a1);
ret = 0;
break;
@@ -9158,6 +9279,7 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
likely(!pic_in_kernel(vcpu->kvm));
}
+/* Called within kvm->srcu read side. */
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
@@ -9166,16 +9288,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
- /*
- * The call to kvm_ready_for_interrupt_injection() may end up in
- * kvm_xen_has_interrupt() which may require the srcu lock to be
- * held, to protect against changes in the vcpu_info address.
- */
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM;
@@ -9252,10 +9367,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
*/
else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected) {
- static_call(kvm_x86_set_nmi)(vcpu);
+ static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
- static_call(kvm_x86_set_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu);
can_inject = false;
}
}
@@ -9335,7 +9450,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- static_call(kvm_x86_set_nmi)(vcpu);
+ static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
}
@@ -9349,7 +9464,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
goto out;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- static_call(kvm_x86_set_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu);
WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
}
if (kvm_cpu_has_injectable_intr(vcpu))
@@ -9671,25 +9786,21 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
-void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set)
{
unsigned long old, new;
lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
- if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
- !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
+ if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
return;
old = new = kvm->arch.apicv_inhibit_reasons;
- if (activate)
- __clear_bit(bit, &new);
- else
- __set_bit(bit, &new);
+ set_or_clear_apicv_inhibit(&new, reason, set);
if (!!old != !!new) {
- trace_kvm_apicv_update_request(activate, bit);
/*
* Kick all vCPUs before setting apicv_inhibit_reasons to avoid
* false positives in the sanity check WARN in svm_vcpu_run().
@@ -9708,18 +9819,22 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
kvm_zap_gfn_range(kvm, gfn, gfn+1);
}
- } else
+ } else {
kvm->arch.apicv_inhibit_reasons = new;
+ }
}
-EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
-void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set)
{
+ if (!enable_apicv)
+ return;
+
down_write(&kvm->arch.apicv_update_lock);
- __kvm_request_apicv_update(kvm, activate, bit);
+ __kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
up_write(&kvm->arch.apicv_update_lock);
}
-EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
+EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
@@ -9753,11 +9868,11 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
- static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
return;
}
- static_call(kvm_x86_load_eoi_exitmap)(
+ static_call_cond(kvm_x86_load_eoi_exitmap)(
vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
@@ -9780,10 +9895,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- if (!kvm_x86_ops.set_apic_access_page_addr)
- return;
-
- static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
+ static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -9793,6 +9905,7 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
/*
+ * Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
* userspace.
@@ -9827,8 +9940,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
}
- if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
- kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
@@ -9973,7 +10086,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- static_call(kvm_x86_prepare_guest_switch)(vcpu);
+ static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -9981,7 +10094,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* result in virtual interrupt delivery.
*/
local_irq_disable();
- vcpu->mode = IN_GUEST_MODE;
+
+ /* Store vcpu->apicv_active before vcpu->mode. */
+ smp_store_release(&vcpu->mode, IN_GUEST_MODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -10041,6 +10156,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(0, 7);
}
+ guest_timing_enter_irqoff();
+
for (;;) {
/*
* Assert that vCPU vs. VM APICv state is consistent. An APICv
@@ -10050,7 +10167,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
- exit_fastpath = static_call(kvm_x86_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -10125,7 +10242,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* of accounting via context tracking, but the loss of accuracy is
* acceptable for all known use cases.
*/
- vtime_account_guest_exit();
+ guest_timing_exit_irqoff();
if (lapic_in_kernel(vcpu)) {
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
@@ -10167,6 +10284,7 @@ out:
return r;
}
+/* Called within kvm->srcu read side. */
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
bool hv_timer;
@@ -10226,12 +10344,12 @@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
!vcpu->arch.apf.halted);
}
+/* Called within kvm->srcu read side. */
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
@@ -10259,14 +10377,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (__xfer_to_guest_mode_work_pending()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
r = xfer_to_guest_mode_handle_work(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (r)
return r;
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-
return r;
}
@@ -10353,10 +10469,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- /*
- * Exclude PKRU from restore as restored separately in
- * kvm_x86_ops.run().
- */
+ /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
}
@@ -10372,6 +10485,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
+ struct kvm *kvm = vcpu->kvm;
int r;
vcpu_load(vcpu);
@@ -10379,6 +10493,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
@@ -10389,7 +10504,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* use before KVM has ever run the vCPU.
*/
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
+
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
if (kvm_apic_accept_events(vcpu) < 0) {
r = 0;
goto out;
@@ -10449,8 +10568,9 @@ out:
if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
- kvm_sigset_deactivate(vcpu);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_sigset_deactivate(vcpu);
vcpu_put(vcpu);
return r;
}
@@ -10539,16 +10659,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
return 0;
}
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- struct kvm_segment cs;
-
- kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
- *db = cs.db;
- *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
@@ -10872,7 +10982,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
{
- bool inhibit = false;
+ bool set = false;
struct kvm_vcpu *vcpu;
unsigned long i;
@@ -10880,11 +10990,11 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
- inhibit = true;
+ set = true;
break;
}
}
- __kvm_request_apicv_update(kvm, !inhibit, APICV_INHIBIT_REASON_BLOCKIRQ);
+ __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
up_write(&kvm->arch.apicv_update_lock);
}
@@ -11321,15 +11431,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static_call(kvm_x86_update_exception_bitmap)(vcpu);
/*
- * Reset the MMU context if paging was enabled prior to INIT (which is
- * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the
- * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
- * checked because it is unconditionally cleared on INIT and all other
- * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
- * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
+ * On the standard CR0/CR4/EFER modification paths, there are several
+ * complex conditions determining whether the MMU has to be reset and/or
+ * which PCIDs have to be flushed. However, CR0.WP and the paging-related
+ * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
+ * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
+ * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here.
*/
- if (old_cr0 & X86_CR0_PG)
+ if (old_cr0 & X86_CR0_PG) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
kvm_mmu_reset_context(vcpu);
+ }
/*
* Intel's SDM states that all TLB entries are flushed on INIT. AMD's
@@ -11490,10 +11602,8 @@ int kvm_arch_hardware_setup(void *opaque)
u64 max = min(0x7fffffffULL,
__scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
kvm_max_guest_tsc_khz = max;
-
- kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
}
-
+ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
kvm_init_msr_list();
return 0;
}
@@ -11562,12 +11672,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
ret = kvm_page_track_init(kvm);
if (ret)
- return ret;
+ goto out;
+
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_page_track;
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
- INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
- INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -11587,6 +11698,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.guest_can_read_msr_platform_info = true;
+ kvm->arch.enable_pmu = enable_pmu;
#if IS_ENABLED(CONFIG_HYPERV)
spin_lock_init(&kvm->arch.hv_root_tdp_lock);
@@ -11598,10 +11710,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
- kvm_mmu_init_vm(kvm);
kvm_xen_init_vm(kvm);
return static_call(kvm_x86_vm_init)(kvm);
+
+out_page_track:
+ kvm_page_track_cleanup(kvm);
+out:
+ return ret;
}
int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -11639,8 +11755,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
kvm_free_pit(kvm);
}
-#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
-
/**
* __x86_set_memory_region: Setup KVM internal memory slot
*
@@ -11786,7 +11900,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
if (slot->arch.rmap[i])
continue;
- slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+ slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
if (!slot->arch.rmap[i]) {
memslot_rmap_free(slot);
return -ENOMEM;
@@ -11823,7 +11937,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
lpages = __kvm_mmu_slot_lpages(slot, npages, level);
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
+ linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -11973,6 +12087,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
return;
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
+
if (kvm_x86_ops.cpu_dirty_log_size) {
kvm_mmu_slot_leaf_clear_dirty(kvm, new);
kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
@@ -12017,8 +12134,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
- kvm_x86_ops.guest_apic_has_interrupt &&
- static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
+ static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -12271,14 +12387,28 @@ static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+
+ if (!kvm_pv_async_pf_enabled(vcpu))
return false;
- if (!kvm_pv_async_pf_enabled(vcpu) ||
- (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0))
+ if (vcpu->arch.apf.send_user_only &&
+ static_call(kvm_x86_get_cpl)(vcpu) == 0)
return false;
- return true;
+ if (is_guest_mode(vcpu)) {
+ /*
+ * L1 needs to opt into the special #PF vmexits that are
+ * used to deliver async page faults.
+ */
+ return vcpu->arch.apf.delivery_as_pf_vmexit;
+ } else {
+ /*
+ * Play it safe in case the guest temporarily disables paging.
+ * The real mode IDT in particular is unlikely to have a #PF
+ * exception setup.
+ */
+ return is_paging(vcpu);
+ }
}
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -12373,7 +12503,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- static_call_cond(kvm_x86_start_assignment)(kvm);
+ static_call_cond(kvm_x86_pi_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -12421,7 +12551,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
@@ -12446,7 +12576,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -12457,7 +12587,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
+ return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
@@ -12511,7 +12641,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
struct x86_exception fault;
- u32 access = error_code &
+ u64 access = error_code &
(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
if (!(error_code & PFERR_PRESENT_MASK) ||
@@ -12851,7 +12981,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 635b75f9e145..588792f00334 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -10,51 +10,6 @@
void kvm_spurious_fault(void);
-static __always_inline void kvm_guest_enter_irqoff(void)
-{
- /*
- * VMENTER enables interrupts (host state), but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about
- * it. This is the same logic as for exit_to_user_mode().
- *
- * This ensures that e.g. latency analysis on the host observes
- * guest mode as interrupt enabled.
- *
- * guest_enter_irqoff() informs context tracking about the
- * transition to guest mode and if enabled adjusts RCU state
- * accordingly.
- */
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- guest_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-static __always_inline void kvm_guest_exit_irqoff(void)
-{
- /*
- * VMEXIT disables interrupts (host state), but tracing and lockdep
- * have them in state 'on' as recorded before entering guest mode.
- * Same as enter_from_user_mode().
- *
- * context_tracking_guest_exit() restores host context and reinstates
- * RCU if enabled and required.
- *
- * This needs to be done immediately after VM-Exit, before any code
- * that might contain tracepoints or call out to the greater world,
- * e.g. before x86_spec_ctrl_restore_host().
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- context_tracking_guest_exit();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
-}
-
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \
@@ -211,14 +166,9 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
}
-static inline u64 get_canonical(u64 la, u8 vaddr_bits)
-{
- return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
-}
-
static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
{
- return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
+ return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu));
}
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
@@ -352,6 +302,8 @@ extern int pi_inject_timer;
extern bool report_ignored_msrs;
+extern bool eager_page_split;
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index bad57535fad0..bf6cc25eee76 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -39,8 +39,8 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
}
do {
- ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true,
- gpa, PAGE_SIZE, false);
+ ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
+ gpa, PAGE_SIZE);
if (ret)
goto out;
@@ -133,32 +133,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
+ struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
+ struct kvm_memslots *slots = kvm_memslots(v->kvm);
+ bool atomic = (state == RUNSTATE_runnable);
uint64_t state_entry_time;
- unsigned int offset;
+ int __user *user_state;
+ uint64_t __user *user_times;
kvm_xen_update_runstate(v, state);
if (!vx->runstate_set)
return;
- BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+ if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
+ kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
+ return;
+
+ /* We made sure it fits in a single page */
+ BUG_ON(!ghc->memslot);
+
+ if (atomic)
+ pagefault_disable();
- offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
-#ifdef CONFIG_X86_64
/*
- * The only difference is alignment of uint64_t in 32-bit.
- * So the first field 'state' is accessed directly using
- * offsetof() (where its offset happens to be zero), while the
- * remaining fields which are all uint64_t, start at 'offset'
- * which we tweak here by adding 4.
+ * The only difference between 32-bit and 64-bit versions of the
+ * runstate struct us the alignment of uint64_t in 32-bit, which
+ * means that the 64-bit version has an additional 4 bytes of
+ * padding after the first field 'state'.
+ *
+ * So we use 'int __user *user_state' to point to the state field,
+ * and 'uint64_t __user *user_times' for runstate_entry_time. So
+ * the actual array of time[] in each state starts at user_times[1].
*/
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+ user_state = (int __user *)ghc->hva;
+
+ BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+ user_times = (uint64_t __user *)(ghc->hva +
+ offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time));
+#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
if (v->kvm->arch.xen.long_mode)
- offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+ user_times = (uint64_t __user *)(ghc->hva +
+ offsetof(struct vcpu_runstate_info,
+ state_entry_time));
#endif
/*
* First write the updated state_entry_time at the appropriate
@@ -172,10 +197,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
sizeof(state_entry_time));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &state_entry_time, offset,
- sizeof(state_entry_time)))
- return;
+ if (__put_user(state_entry_time, user_times))
+ goto out;
smp_wmb();
/*
@@ -189,11 +212,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &vx->current_runstate,
- offsetof(struct vcpu_runstate_info, state),
- sizeof(vx->current_runstate)))
- return;
+ if (__put_user(vx->current_runstate, user_state))
+ goto out;
/*
* Write the actual runstate times immediately after the
@@ -208,24 +228,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &vx->runstate_times[0],
- offset + sizeof(u64),
- sizeof(vx->runstate_times)))
- return;
-
+ if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
+ goto out;
smp_wmb();
/*
* Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
* runstate_entry_time field.
*/
-
state_entry_time &= ~XEN_RUNSTATE_UPDATE;
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &state_entry_time, offset,
- sizeof(state_entry_time)))
- return;
+ __put_user(state_entry_time, user_times);
+ smp_wmb();
+
+ out:
+ mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+ if (atomic)
+ pagefault_enable();
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -443,6 +462,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_info_cache,
data->u.gpa,
@@ -460,6 +485,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_time_info_cache,
data->u.gpa,
@@ -481,6 +512,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.runstate_cache,
data->u.gpa,
@@ -695,7 +732,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
instructions[0] = 0xb8;
/* vmcall / vmmcall */
- kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
/* ret */
instructions[8] = 0xc3;
@@ -830,7 +867,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
- vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+ vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
vcpu->run->xen.u.hcall.input = input;
vcpu->run->xen.u.hcall.params[0] = params[0];
vcpu->run->xen.u.hcall.params[1] = params[1];
@@ -988,8 +1025,7 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm
break;
idx = srcu_read_lock(&kvm->srcu);
- rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
- PAGE_SIZE, false);
+ rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);