summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c20
-rw-r--r--arch/x86/kvm/emulate.c2
-rw-r--r--arch/x86/kvm/kvm_emulate.h1
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu/mmu.c20
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c17
-rw-r--r--arch/x86/kvm/svm/nested.c23
-rw-r--r--arch/x86/kvm/svm/sev.c32
-rw-r--r--arch/x86/kvm/svm/svm.c105
-rw-r--r--arch/x86/kvm/svm/svm.h39
-rw-r--r--arch/x86/kvm/vmx/capabilities.h3
-rw-r--r--arch/x86/kvm/vmx/nested.c29
-rw-r--r--arch/x86/kvm/vmx/vmx.c275
-rw-r--r--arch/x86/kvm/vmx/vmx.h12
-rw-r--r--arch/x86/kvm/x86.c164
-rw-r--r--arch/x86/kvm/x86.h45
16 files changed, 437 insertions, 352 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 19606a341888..9a48f138832d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -458,7 +458,7 @@ void kvm_set_cpu_caps(void)
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
- F(SGX_LC)
+ F(SGX_LC) | F(BUS_LOCK_DETECT)
);
/* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57))
@@ -567,6 +567,21 @@ void kvm_set_cpu_caps(void)
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
F(PMM) | F(PMM_EN)
);
+
+ /*
+ * Hide RDTSCP and RDPID if either feature is reported as supported but
+ * probing MSR_TSC_AUX failed. This is purely a sanity check and
+ * should never happen, but the guest will likely crash if RDTSCP or
+ * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
+ * the past. For example, the sanity check may fire if this instance of
+ * KVM is running as L1 on top of an older, broken KVM.
+ */
+ if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
+ kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
+ !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
@@ -637,7 +652,8 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
case 7:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
entry->eax = 0;
- entry->ecx = F(RDPID);
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ entry->ecx = F(RDPID);
++array->nent;
default:
break;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 77e1c89a95a7..8a0ccdb56076 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4502,7 +4502,7 @@ static const struct opcode group8[] = {
* from the register case of group9.
*/
static const struct gprefix pfx_0f_c7_7 = {
- N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
};
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 0d359115429a..f016838faedd 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -468,6 +468,7 @@ enum x86_intercept {
x86_intercept_clgi,
x86_intercept_skinit,
x86_intercept_rdtscp,
+ x86_intercept_rdpid,
x86_intercept_icebp,
x86_intercept_wbinvd,
x86_intercept_monitor,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 152591f9243a..c0ebef560bd1 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1913,8 +1913,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
if (!apic->lapic_timer.hv_timer_in_use)
goto out;
WARN_ON(rcuwait_active(&vcpu->wait));
- cancel_hv_timer(apic);
apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
advance_periodic_target_expiration(apic);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4b3ee244ebe0..0144c40d09c7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3310,12 +3310,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
- if (WARN_ON_ONCE(!mmu->lm_root)) {
+ if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO;
goto out_unlock;
}
- mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+ mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
}
for (i = 0; i < 4; ++i) {
@@ -3335,7 +3335,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
- mmu->root_hpa = __pa(mmu->lm_root);
+ mmu->root_hpa = __pa(mmu->pml4_root);
else
mmu->root_hpa = __pa(mmu->pae_root);
@@ -3350,7 +3350,7 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 *lm_root, *pae_root;
+ u64 *pml4_root, *pae_root;
/*
* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3369,14 +3369,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
return -EIO;
- if (mmu->pae_root && mmu->lm_root)
+ if (mmu->pae_root && mmu->pml4_root)
return 0;
/*
* The special roots should always be allocated in concert. Yell and
* bail if KVM ends up in a state where only one of the roots is valid.
*/
- if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
return -EIO;
/*
@@ -3387,14 +3387,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (!pae_root)
return -ENOMEM;
- lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!lm_root) {
+ pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml4_root) {
free_page((unsigned long)pae_root);
return -ENOMEM;
}
mmu->pae_root = pae_root;
- mmu->lm_root = lm_root;
+ mmu->pml4_root = pml4_root;
return 0;
}
@@ -5261,7 +5261,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
if (!tdp_enabled && mmu->pae_root)
set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
- free_page((unsigned long)mmu->lm_root);
+ free_page((unsigned long)mmu->pml4_root);
}
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 88f69a6cc492..95eeb5ac6a8a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -388,7 +388,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
}
/**
- * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * __handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE
@@ -444,6 +444,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
+ if (is_large_pte(old_spte))
+ atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
+ else
+ atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
+ }
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -1009,6 +1016,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
}
if (!is_shadow_present_pte(iter.old_spte)) {
+ /*
+ * If SPTE has been forzen by another thread, just
+ * give up and retry, avoiding unnecessary page table
+ * allocation and free.
+ */
+ if (is_removed_spte(iter.old_spte))
+ break;
+
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
child_pt = sp->spt;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 540d43ba2cf4..5e8d8443154e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -764,7 +764,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01);
- WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
/*
* On vmexit the GIF is set to false and
@@ -872,6 +871,15 @@ void svm_free_nested(struct vcpu_svm *svm)
__free_page(virt_to_page(svm->nested.vmcb02.ptr));
svm->nested.vmcb02.ptr = NULL;
+ /*
+ * When last_vmcb12_gpa matches the current vmcb12 gpa,
+ * some vmcb12 fields are not loaded if they are marked clean
+ * in the vmcb12, since in this case they are up to date already.
+ *
+ * When the vmcb02 is freed, this optimization becomes invalid.
+ */
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
svm->nested.initialized = false;
}
@@ -884,9 +892,11 @@ void svm_leave_nested(struct vcpu_svm *svm)
if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+
leave_guest_mode(vcpu);
- svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ svm_switch_vmcb(svm, &svm->vmcb01);
nested_svm_uninit_mmu_context(vcpu);
vmcb_mark_all_dirty(svm->vmcb);
@@ -1298,12 +1308,17 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* L2 registers if needed are moved from the current VMCB to VMCB02.
*/
+ if (is_guest_mode(vcpu))
+ svm_leave_nested(svm);
+ else
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+
svm->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
- if (svm->current_vmcb == &svm->vmcb01)
- svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
svm->vmcb01.ptr->save.es = save->es;
svm->vmcb01.ptr->save.cs = save->cs;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 1356ee095cd5..5bc887e9a986 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -763,7 +763,7 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
}
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
- unsigned long __user dst_uaddr,
+ void __user *dst_uaddr,
unsigned long dst_paddr,
int size, int *err)
{
@@ -787,8 +787,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (tpage) {
offset = paddr & 15;
- if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
- page_address(tpage) + offset, size))
+ if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
ret = -EFAULT;
}
@@ -800,9 +799,9 @@ e_free:
}
static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
- unsigned long __user vaddr,
+ void __user *vaddr,
unsigned long dst_paddr,
- unsigned long __user dst_vaddr,
+ void __user *dst_vaddr,
int size, int *error)
{
struct page *src_tpage = NULL;
@@ -810,13 +809,12 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
int ret, len = size;
/* If source buffer is not aligned then use an intermediate buffer */
- if (!IS_ALIGNED(vaddr, 16)) {
+ if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
src_tpage = alloc_page(GFP_KERNEL);
if (!src_tpage)
return -ENOMEM;
- if (copy_from_user(page_address(src_tpage),
- (void __user *)(uintptr_t)vaddr, size)) {
+ if (copy_from_user(page_address(src_tpage), vaddr, size)) {
__free_page(src_tpage);
return -EFAULT;
}
@@ -830,7 +828,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
* - copy the source buffer in an intermediate buffer
* - use the intermediate buffer as source buffer
*/
- if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
int dst_offset;
dst_tpage = alloc_page(GFP_KERNEL);
@@ -855,7 +853,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
page_address(src_tpage), size);
else {
if (copy_from_user(page_address(dst_tpage) + dst_offset,
- (void __user *)(uintptr_t)vaddr, size)) {
+ vaddr, size)) {
ret = -EFAULT;
goto e_free;
}
@@ -935,15 +933,15 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (dec)
ret = __sev_dbg_decrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off,
- dst_vaddr,
+ (void __user *)dst_vaddr,
__sme_page_pa(dst_p[0]) + d_off,
len, &argp->error);
else
ret = __sev_dbg_encrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off,
- vaddr,
+ (void __user *)vaddr,
__sme_page_pa(dst_p[0]) + d_off,
- dst_vaddr,
+ (void __user *)dst_vaddr,
len, &argp->error);
sev_unpin_memory(kvm, src_p, n);
@@ -1764,7 +1762,8 @@ e_mirror_unlock:
e_source_unlock:
mutex_unlock(&source_kvm->lock);
e_source_put:
- fput(source_kvm_file);
+ if (source_kvm_file)
+ fput(source_kvm_file);
return ret;
}
@@ -2198,7 +2197,7 @@ vmgexit_err:
return -EINVAL;
}
-static void pre_sev_es_run(struct vcpu_svm *svm)
+void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
if (!svm->ghcb)
return;
@@ -2234,9 +2233,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int asid = sev_get_asid(svm->vcpu.kvm);
- /* Perform any SEV-ES pre-run actions */
- pre_sev_es_run(svm);
-
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9790c73f2a32..05eca131eaf2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -212,7 +212,7 @@ DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
* RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
* defer the restoration of TSC_AUX until the CPU returns to userspace.
*/
-#define TSC_AUX_URET_SLOT 0
+static int tsc_aux_uret_slot __read_mostly = -1;
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
@@ -447,6 +447,11 @@ static int has_svm(void)
return 0;
}
+ if (pgtable_l5_enabled()) {
+ pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
+ return 0;
+ }
+
return 1;
}
@@ -858,8 +863,8 @@ static __init void svm_adjust_mmio_mask(void)
return;
/* If memory encryption is not enabled, use existing mask */
- rdmsrl(MSR_K8_SYSCFG, msr);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
@@ -959,8 +964,7 @@ static __init int svm_hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 32;
}
- if (boot_cpu_has(X86_FEATURE_RDTSCP))
- kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
+ tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
@@ -1100,7 +1104,9 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
return svm->vmcb->control.tsc_offset;
}
-static void svm_check_invpcid(struct vcpu_svm *svm)
+/* Evaluate instruction intercepts that depend on guest CPUID features. */
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
+ struct vcpu_svm *svm)
{
/*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
@@ -1113,6 +1119,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
else
svm_clr_intercept(svm, INTERCEPT_INVPCID);
}
+
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ else
+ svm_set_intercept(svm, INTERCEPT_RDTSCP);
+ }
}
static void init_vmcb(struct kvm_vcpu *vcpu)
@@ -1235,8 +1248,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
- svm->nested.vmcb12_gpa = 0;
- svm->nested.last_vmcb12_gpa = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
vcpu->arch.hflags = 0;
if (!kvm_pause_in_guest(vcpu->kvm)) {
@@ -1248,7 +1261,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
- svm_check_invpcid(svm);
+ svm_recalc_instruction_intercepts(vcpu, svm);
/*
* If the host supports V_SPEC_CTRL then disable the interception
@@ -1424,6 +1437,9 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_unmap_ghcb(svm);
+
if (svm->guest_state_loaded)
return;
@@ -1445,8 +1461,8 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
}
}
- if (static_cpu_has(X86_FEATURE_RDTSCP))
- kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
+ if (likely(tsc_aux_uret_slot >= 0))
+ kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
svm->guest_state_loaded = true;
}
@@ -2655,11 +2671,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
- if (!boot_cpu_has(X86_FEATURE_RDTSCP))
- return 1;
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
msr_info->data = svm->tsc_aux;
break;
/*
@@ -2876,30 +2887,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
- if (!boot_cpu_has(X86_FEATURE_RDTSCP))
- return 1;
-
- if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
-
- /*
- * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
- * incomplete and conflicting architectural behavior. Current
- * AMD CPUs completely ignore bits 63:32, i.e. they aren't
- * reserved and always read as zeros. Emulate AMD CPU behavior
- * to avoid explosions if the vCPU is migrated from an AMD host
- * to an Intel host.
- */
- data = (u32)data;
-
/*
* TSC_AUX is usually changed only during boot and never read
* directly. Intercept TSC_AUX instead of exposing it to the
* guest via direct_access_msrs, and switch it via user return.
*/
preempt_disable();
- r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
+ r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
preempt_enable();
if (r)
return 1;
@@ -3084,6 +3078,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception,
+ [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
[SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
[SVM_EXIT_MONITOR] = kvm_emulate_monitor,
[SVM_EXIT_MWAIT] = kvm_emulate_mwait,
@@ -3710,25 +3705,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long vmcb_pa = svm->current_vmcb->pa;
- /*
- * VMENTER enables interrupts (host state), but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about
- * it. This is the same logic as for exit_to_user_mode().
- *
- * This ensures that e.g. latency analysis on the host observes
- * guest mode as interrupt enabled.
- *
- * guest_enter_irqoff() informs context tracking about the
- * transition to guest mode and if enabled adjusts RCU state
- * accordingly.
- */
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- guest_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
+ kvm_guest_enter_irqoff();
if (sev_es_guest(vcpu->kvm)) {
__svm_sev_es_vcpu_run(vmcb_pa);
@@ -3748,24 +3725,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
vmload(__sme_page_pa(sd->save_area));
}
- /*
- * VMEXIT disables interrupts (host state), but tracing and lockdep
- * have them in state 'on' as recorded before entering guest mode.
- * Same as enter_from_user_mode().
- *
- * guest_exit_irqoff() restores host context and reinstates RCU if
- * enabled and required.
- *
- * This needs to be done before the below as native_read_msr()
- * contains a tracepoint and x86_spec_ctrl_restore_host() calls
- * into world and some more.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- guest_exit_irqoff();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
+ kvm_guest_exit_irqoff();
}
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
@@ -4007,8 +3967,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
- /* Check again if INVPCID interception if required */
- svm_check_invpcid(svm);
+ svm_recalc_instruction_intercepts(vcpu, svm);
/* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 84b3133c2251..2c9ece618b29 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -20,6 +20,7 @@
#include <linux/bits.h>
#include <asm/svm.h>
+#include <asm/sev-common.h>
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
@@ -525,40 +526,9 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */
-#define GHCB_VERSION_MAX 1ULL
-#define GHCB_VERSION_MIN 1ULL
-
-#define GHCB_MSR_INFO_POS 0
-#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1)
-
-#define GHCB_MSR_SEV_INFO_RESP 0x001
-#define GHCB_MSR_SEV_INFO_REQ 0x002
-#define GHCB_MSR_VER_MAX_POS 48
-#define GHCB_MSR_VER_MAX_MASK 0xffff
-#define GHCB_MSR_VER_MIN_POS 32
-#define GHCB_MSR_VER_MIN_MASK 0xffff
-#define GHCB_MSR_CBIT_POS 24
-#define GHCB_MSR_CBIT_MASK 0xff
-#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
- ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
- (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
- (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
- GHCB_MSR_SEV_INFO_RESP)
-
-#define GHCB_MSR_CPUID_REQ 0x004
-#define GHCB_MSR_CPUID_RESP 0x005
-#define GHCB_MSR_CPUID_FUNC_POS 32
-#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff
-#define GHCB_MSR_CPUID_VALUE_POS 32
-#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff
-#define GHCB_MSR_CPUID_REG_POS 30
-#define GHCB_MSR_CPUID_REG_MASK 0x3
-
-#define GHCB_MSR_TERM_REQ 0x100
-#define GHCB_MSR_TERM_REASON_SET_POS 12
-#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
-#define GHCB_MSR_TERM_REASON_POS 16
-#define GHCB_MSR_TERM_REASON_MASK 0xff
+#define GHCB_VERSION_MAX 1ULL
+#define GHCB_VERSION_MIN 1ULL
+
extern unsigned int max_sev_asid;
@@ -581,6 +551,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d1d77985e889..8dee8a5fbc17 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -398,6 +398,9 @@ static inline u64 vmx_supported_debugctl(void)
{
u64 debugctl = 0;
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
debugctl |= DEBUGCTLMSR_LBR_MASK;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bced76637823..6058a65a6ede 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
if (evmptrld_status == EVMPTRLD_VMFAIL ||
- evmptrld_status == EVMPTRLD_ERROR) {
- pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
- __func__);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ evmptrld_status == EVMPTRLD_ERROR)
return false;
- }
}
return true;
@@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
- if (!nested_get_evmcs_page(vcpu))
+ if (!nested_get_evmcs_page(vcpu)) {
+ pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
return false;
+ }
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
return false;
@@ -4435,7 +4436,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* Similarly, triple faults in L2 should never escape. */
WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ /*
+ * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
+ * Enlightened VMCS after migration and we still need to
+ * do that when something is forcing L2->L1 exit prior to
+ * the first L2 run.
+ */
+ (void)nested_get_evmcs_page(vcpu);
+ }
/* Service the TLB flush request for L2 before switching to L1. */
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cbe0cdade38a..4bceb5ca3a89 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -36,6 +36,7 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
+#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
@@ -454,21 +455,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
-/*
- * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
- * will emulate SYSCALL in legacy mode if the vendor string in guest
- * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
- * support this emulation, IA32_STAR must always be included in
- * vmx_uret_msrs_list[], even in i386 builds.
- */
-static const u32 vmx_uret_msrs_list[] = {
-#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
-#endif
- MSR_EFER, MSR_TSC_AUX, MSR_STAR,
- MSR_IA32_TSX_CTRL,
-};
-
#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -696,21 +682,11 @@ static bool is_valid_passthrough_msr(u32 msr)
return r;
}
-static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- for (i = 0; i < vmx->nr_uret_msrs; ++i)
- if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
- return i;
- return -1;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- i = __vmx_find_uret_msr(vmx, msr);
+ i = kvm_find_user_return_msr(msr);
if (i >= 0)
return &vmx->guest_uret_msrs[i];
return NULL;
@@ -719,13 +695,14 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
struct vmx_uret_msr *msr, u64 data)
{
+ unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
u64 old_msr_data = msr->data;
msr->data = data;
- if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
+ if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
preempt_enable();
if (ret)
msr->data = old_msr_data;
@@ -1077,7 +1054,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
return false;
}
- i = __vmx_find_uret_msr(vmx, MSR_EFER);
+ i = kvm_find_user_return_msr(MSR_EFER);
if (i < 0)
return false;
@@ -1239,11 +1216,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/
if (!vmx->guest_uret_msrs_loaded) {
vmx->guest_uret_msrs_loaded = true;
- for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
- kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (!vmx->guest_uret_msrs[i].load_into_hardware)
+ continue;
+
+ kvm_set_user_return_msr(i,
vmx->guest_uret_msrs[i].data,
vmx->guest_uret_msrs[i].mask);
-
+ }
}
if (vmx->nested.need_vmcs12_to_shadow_sync)
@@ -1750,19 +1730,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+ bool load_into_hardware)
{
- struct vmx_uret_msr tmp;
- int from, to;
+ struct vmx_uret_msr *uret_msr;
- from = __vmx_find_uret_msr(vmx, msr);
- if (from < 0)
+ uret_msr = vmx_find_uret_msr(vmx, msr);
+ if (!uret_msr)
return;
- to = vmx->nr_active_uret_msrs++;
- tmp = vmx->guest_uret_msrs[to];
- vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
- vmx->guest_uret_msrs[from] = tmp;
+ uret_msr->load_into_hardware = load_into_hardware;
}
/*
@@ -1772,29 +1749,42 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
- vmx->guest_uret_msrs_loaded = false;
- vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64
+ bool load_syscall_msrs;
+
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
- if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- vmx_setup_uret_msr(vmx, MSR_STAR);
- vmx_setup_uret_msr(vmx, MSR_LSTAR);
- vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
- }
+ load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+ (vmx->vcpu.arch.efer & EFER_SCE);
+
+ vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
- if (update_transition_efer(vmx))
- vmx_setup_uret_msr(vmx, MSR_EFER);
+ vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
- if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
- vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ /*
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+ * kernel and old userspace. If those guests run on a tsx=off host, do
+ * allow guests to use TSX_CTRL, but don't change the value in hardware
+ * so that TSX remains always disabled.
+ */
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
+
+ /*
+ * The set of MSRs to load may have changed, reload MSRs before the
+ * next VM-Enter.
+ */
+ vmx->guest_uret_msrs_loaded = false;
}
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1992,11 +1982,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- goto find_uret_msr;
case MSR_IA32_DEBUGCTLMSR:
msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
break;
@@ -2030,6 +2015,9 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
if (!intel_pmu_lbr_is_enabled(vcpu))
debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+
return debugctl;
}
@@ -2312,14 +2300,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- /* Check reserved bit, higher 32 bits should be zero */
- if ((data >> 32) != 0)
- return 1;
- goto find_uret_msr;
case MSR_IA32_PERF_CAPABILITIES:
if (data && !vcpu_to_pmu(vcpu)->version)
return 1;
@@ -4368,7 +4348,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
xsaves_enabled, false);
}
- vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
+ /*
+ * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+ * feature is exposed to the guest. This creates a virtualization hole
+ * if both are supported in hardware but only one is exposed to the
+ * guest, but letting the guest execute RDTSCP or RDPID when either one
+ * is advertised is preferable to emulating the advertised instruction
+ * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+ */
+ if (cpu_has_vmx_rdtscp()) {
+ bool rdpid_or_rdtscp_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_ENABLE_RDTSCP,
+ rdpid_or_rdtscp_enabled, false);
+ }
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@ -6415,18 +6411,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+ unsigned long entry)
{
- unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
- gate_desc *desc = (gate_desc *)host_idt_base + vector;
-
kvm_before_interrupt(vcpu);
- vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+ vmx_do_interrupt_nmi_irqoff(entry);
kvm_after_interrupt(vcpu);
}
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
+ const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
@@ -6437,18 +6432,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+ handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
u32 intr_info = vmx_get_intr_info(vcpu);
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+ gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (WARN_ONCE(!is_external_intr(intr_info),
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, intr_info);
+ handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
}
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6662,25 +6659,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
- /*
- * VMENTER enables interrupts (host state), but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about
- * it. This is the same logic as for exit_to_user_mode().
- *
- * This ensures that e.g. latency analysis on the host observes
- * guest mode as interrupt enabled.
- *
- * guest_enter_irqoff() informs context tracking about the
- * transition to guest mode and if enabled adjusts RCU state
- * accordingly.
- */
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- guest_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
+ kvm_guest_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6696,24 +6675,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = native_read_cr2();
- /*
- * VMEXIT disables interrupts (host state), but tracing and lockdep
- * have them in state 'on' as recorded before entering guest mode.
- * Same as enter_from_user_mode().
- *
- * guest_exit_irqoff() restores host context and reinstates RCU if
- * enabled and required.
- *
- * This needs to be done before the below as native_read_msr()
- * contains a tracepoint and x86_spec_ctrl_restore_host() calls
- * into world and some more.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- guest_exit_irqoff();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
+ kvm_guest_exit_irqoff();
}
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -6888,6 +6850,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
+ struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
int i, cpu, err;
@@ -6910,43 +6873,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid;
}
- BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
-
- for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
- u32 index = vmx_uret_msrs_list[i];
- u32 data_low, data_high;
- int j = vmx->nr_uret_msrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
-
- vmx->guest_uret_msrs[j].slot = i;
- vmx->guest_uret_msrs[j].data = 0;
- switch (index) {
- case MSR_IA32_TSX_CTRL:
- /*
- * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
- * interception. Keep the host value unchanged to avoid
- * changing CPUID bits under the host kernel's feet.
- *
- * hle=0, rtm=0, tsx_ctrl=1 can be found with some
- * combinations of new kernel and old userspace. If
- * those guests run on a tsx=off host, do allow guests
- * to use TSX_CTRL, but do not change the value on the
- * host so that TSX remains always disabled.
- */
- if (boot_cpu_has(X86_FEATURE_RTM))
- vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
- else
- vmx->guest_uret_msrs[j].mask = 0;
- break;
- default:
- vmx->guest_uret_msrs[j].mask = -1ull;
- break;
- }
- ++vmx->nr_uret_msrs;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ vmx->guest_uret_msrs[i].data = 0;
+ vmx->guest_uret_msrs[i].mask = -1ull;
+ }
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+ * Keep the host value unchanged to avoid changing CPUID bits
+ * under the host kernel's feet.
+ */
+ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (tsx_ctrl)
+ vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
@@ -7377,9 +7316,11 @@ static __init void vmx_set_cpu_caps(void)
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
- /* CPUID 0x80000001 */
- if (!cpu_has_vmx_rdtscp())
+ /* CPUID 0x80000001 and 0x7 (RDPID) */
+ if (!cpu_has_vmx_rdtscp()) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@@ -7435,8 +7376,9 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/*
* RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here.
+ * Note, RDPID is hidden behind ENABLE_RDTSCP.
*/
- case x86_intercept_rdtscp:
+ case x86_intercept_rdpid:
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
@@ -7802,17 +7744,42 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
+static __init void vmx_setup_user_return_msrs(void)
+{
+
+ /*
+ * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+ * will emulate SYSCALL in legacy mode if the vendor string in guest
+ * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
+ * support this emulation, MSR_STAR is included in the list for i386,
+ * but is never loaded into hardware. MSR_CSTAR is also never loaded
+ * into hardware and is here purely for emulation purposes.
+ */
+ const u32 vmx_uret_msrs_list[] = {
+ #ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+ #endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
+ };
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
+}
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, i, ept_lpage_level;
+ int r, ept_lpage_level;
store_idt(&dt);
host_idt_base = dt.address;
- for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
- kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
+ vmx_setup_user_return_msrs();
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 008cb87ff088..16e4e457ba23 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -36,7 +36,7 @@ struct vmx_msrs {
};
struct vmx_uret_msr {
- unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */
+ bool load_into_hardware;
u64 data;
u64 mask;
};
@@ -245,8 +245,16 @@ struct vcpu_vmx {
u32 idt_vectoring_info;
ulong rflags;
+ /*
+ * User return MSRs are always emulated when enabled in the guest, but
+ * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
+ * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
+ * be loaded into hardware if those conditions aren't met.
+ * nr_active_uret_msrs tracks the number of MSRs that need to be loaded
+ * into hardware when running the guest. guest_uret_msrs[] is resorted
+ * whenever the number of "active" uret MSRs is modified.
+ */
struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
- int nr_uret_msrs;
int nr_active_uret_msrs;
bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cebdaa1e3cf5..bbc4e04e67ad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -184,11 +184,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
*/
#define KVM_MAX_NR_USER_RETURN_MSRS 16
-struct kvm_user_return_msrs_global {
- int nr;
- u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
-};
-
struct kvm_user_return_msrs {
struct user_return_notifier urn;
bool registered;
@@ -198,7 +193,9 @@ struct kvm_user_return_msrs {
} values[KVM_MAX_NR_USER_RETURN_MSRS];
};
-static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+u32 __read_mostly kvm_nr_uret_msrs;
+EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
+static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@@ -330,23 +327,53 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
- for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(user_return_msrs_global.msrs[slot], values->host);
+ wrmsrl(kvm_uret_msrs_list[slot], values->host);
values->curr = values->host;
}
}
}
-void kvm_define_user_return_msr(unsigned slot, u32 msr)
+static int kvm_probe_user_return_msr(u32 msr)
+{
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrl_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrl_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
+}
+
+int kvm_add_user_return_msr(u32 msr)
+{
+ BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+
+ if (kvm_probe_user_return_msr(msr))
+ return -1;
+
+ kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+ return kvm_nr_uret_msrs++;
+}
+EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
+
+int kvm_find_user_return_msr(u32 msr)
{
- BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
- user_return_msrs_global.msrs[slot] = msr;
- if (slot >= user_return_msrs_global.nr)
- user_return_msrs_global.nr = slot + 1;
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (kvm_uret_msrs_list[i] == msr)
+ return i;
+ }
+ return -1;
}
-EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
@@ -355,8 +382,8 @@ static void kvm_user_return_msr_cpu_online(void)
u64 value;
int i;
- for (i = 0; i < user_return_msrs_global.nr; ++i) {
- rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ rdmsrl_safe(kvm_uret_msrs_list[i], &value);
msrs->values[i].host = value;
msrs->values[i].curr = value;
}
@@ -371,7 +398,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
value = (value & mask) | (msrs->values[slot].host & ~mask);
if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
if (err)
return 1;
@@ -1149,6 +1176,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ fixed |= DR6_BUS_LOCK;
return fixed;
}
@@ -1615,6 +1645,30 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* invokes 64-bit SYSENTER.
*/
data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+
+ /*
+ * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+ * incomplete and conflicting architectural behavior. Current
+ * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+ * reserved and always read as zeros. Enforce Intel's reserved
+ * bits check if and only if the guest CPU is Intel, and clear
+ * the bits in all other cases. This ensures cross-vendor
+ * migration will provide consistent behavior for the guest.
+ */
+ if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ return 1;
+
+ data = (u32)data;
+ break;
}
msr.data = data;
@@ -1651,6 +1705,18 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
return KVM_MSR_RET_FILTERED;
+ switch (index) {
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+ break;
+ }
+
msr.index = index;
msr.host_initiated = host_initiated;
@@ -3402,7 +3468,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
- case MSR_K8_SYSCFG:
+ case MSR_AMD64_SYSCFG:
case MSR_K8_TSEG_ADDR:
case MSR_K8_TSEG_MASK:
case MSR_VM_HSAVE_PA:
@@ -5468,14 +5534,18 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range)
{
- struct msr_bitmap_range range;
unsigned long *bitmap = NULL;
size_t bitmap_size;
- int r;
if (!user_range->nmsrs)
return 0;
+ if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+ return -EINVAL;
+
+ if (!user_range->flags)
+ return -EINVAL;
+
bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
return -EINVAL;
@@ -5484,31 +5554,15 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (IS_ERR(bitmap))
return PTR_ERR(bitmap);
- range = (struct msr_bitmap_range) {
+ msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
.flags = user_range->flags,
.base = user_range->base,
.nmsrs = user_range->nmsrs,
.bitmap = bitmap,
};
- if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
- r = -EINVAL;
- goto err;
- }
-
- if (!range.flags) {
- r = -EINVAL;
- goto err;
- }
-
- /* Everything ok, add this range identifier. */
- msr_filter->ranges[msr_filter->count] = range;
msr_filter->count++;
-
return 0;
-err:
- kfree(bitmap);
- return r;
}
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
@@ -5937,7 +5991,8 @@ static void kvm_init_msr_list(void)
continue;
break;
case MSR_TSC_AUX:
- if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
continue;
break;
case MSR_IA32_UMWAIT_CONTROL:
@@ -8040,6 +8095,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+ queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
+/*
* Notification about pvclock gtod data update.
*/
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
@@ -8050,13 +8117,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
update_pvclock_gtod(tk);
- /* disable master clock if host does not trust, or does not
- * use, TSC based clocksource.
+ /*
+ * Disable master clock if host does not trust, or does not use,
+ * TSC based clocksource. Delegate queue_work() to irq_work as
+ * this is invoked with tk_core.seq write held.
*/
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
- queue_work(system_long_wq, &pvclock_gtod_work);
-
+ irq_work_queue(&pvclock_irq_work);
return 0;
}
@@ -8118,6 +8186,7 @@ int kvm_arch_init(void *opaque)
printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
goto out_free_x86_emulator_cache;
}
+ kvm_nr_uret_msrs = 0;
r = kvm_mmu_module_init();
if (r)
@@ -8168,6 +8237,8 @@ void kvm_arch_exit(void)
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ irq_work_sync(&pvclock_irq_work);
+ cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops.hardware_enable = NULL;
kvm_mmu_module_exit();
@@ -9315,6 +9386,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
local_irq_disable();
kvm_after_interrupt(vcpu);
+ /*
+ * Wait until after servicing IRQs to account guest time so that any
+ * ticks that occurred while running the guest are properly accounted
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
+ * of accounting via context tracking, but the loss of accuracy is
+ * acceptable for all known use cases.
+ */
+ vtime_account_guest_exit();
+
if (lapic_in_kernel(vcpu)) {
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
if (delta != S64_MIN) {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 8ddd38146525..521f74e5bbf2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,51 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+static __always_inline void kvm_guest_enter_irqoff(void)
+{
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+static __always_inline void kvm_guest_exit_irqoff(void)
+{
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * context_tracking_guest_exit() restores host context and reinstates
+ * RCU if enabled and required.
+ *
+ * This needs to be done immediately after VM-Exit, before any code
+ * that might contain tracepoints or call out to the greater world,
+ * e.g. before x86_spec_ctrl_restore_host().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ context_tracking_guest_exit();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
+
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \