summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/emulate.c2
-rw-r--r--arch/x86/kvm/hyperv.c27
-rw-r--r--arch/x86/kvm/hyperv.h2
-rw-r--r--arch/x86/kvm/lapic.c40
-rw-r--r--arch/x86/kvm/mmu.c531
-rw-r--r--arch/x86/kvm/mmu.h24
-rw-r--r--arch/x86/kvm/paging_tmpl.h28
-rw-r--r--arch/x86/kvm/svm.c12
-rw-r--r--arch/x86/kvm/vmx.c1120
-rw-r--r--arch/x86/kvm/x86.c108
11 files changed, 1555 insertions, 342 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7e042e3d47fd..7bcfa61375c0 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -621,7 +621,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
(1 << KVM_FEATURE_PV_UNHALT) |
(1 << KVM_FEATURE_PV_TLB_FLUSH) |
- (1 << KVM_FEATURE_ASYNC_PF_VMEXIT);
+ (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+ (1 << KVM_FEATURE_PV_SEND_IPI);
if (sched_info_on())
entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4c4f4263420c..106482da6388 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4191,7 +4191,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
maxphyaddr = 36;
rsvd = rsvd_bits(maxphyaddr, 63);
if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE)
- rsvd &= ~CR3_PCID_INVD;
+ rsvd &= ~X86_CR3_PCID_NOFLUSH;
}
if (new_val & rsvd)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index af8caf965baa..01d209ab5481 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -235,7 +235,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
int ret;
- if (!synic->active)
+ if (!synic->active && !host)
return 1;
trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
@@ -295,11 +295,12 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
return ret;
}
-static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
+ bool host)
{
int ret;
- if (!synic->active)
+ if (!synic->active && !host)
return 1;
ret = 0;
@@ -1014,6 +1015,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
case HV_X64_MSR_TSC_EMULATION_STATUS:
hv->hv_tsc_emulation_status = data;
break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -1101,6 +1107,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
data, host);
}
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -1156,7 +1168,8 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
{
u64 data = 0;
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
@@ -1183,7 +1196,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_SIMP:
case HV_X64_MSR_EOM:
case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
- return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata);
+ return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
case HV_X64_MSR_STIMER0_CONFIG:
case HV_X64_MSR_STIMER1_CONFIG:
case HV_X64_MSR_STIMER2_CONFIG:
@@ -1229,7 +1242,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return kvm_hv_set_msr(vcpu, msr, data, host);
}
-int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1239,7 +1252,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
return r;
} else
- return kvm_hv_get_msr(vcpu, msr, pdata);
+ return kvm_hv_get_msr(vcpu, msr, pdata, host);
}
static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 837465d69c6d..d6aa969e20f1 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -48,7 +48,7 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
}
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
-int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
bool kvm_hv_hypercall_enabled(struct kvm *kvm);
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d536d457517b..0cefba28c864 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -547,6 +547,46 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
irq->level, irq->trig_mode, dest_map);
}
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, int min,
+ unsigned long icr, int op_64_bit)
+{
+ int i;
+ struct kvm_apic_map *map;
+ struct kvm_vcpu *vcpu;
+ struct kvm_lapic_irq irq = {0};
+ int cluster_size = op_64_bit ? 64 : 32;
+ int count = 0;
+
+ irq.vector = icr & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr & APIC_MODE_MASK;
+ irq.level = (icr & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr & APIC_INT_LEVELTRIG;
+
+ if (icr & APIC_DEST_MASK)
+ return -KVM_EINVAL;
+ if (icr & APIC_SHORT_MASK)
+ return -KVM_EINVAL;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ /* Bits above cluster_size are masked in the caller. */
+ for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+
+ min += cluster_size;
+ for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+
+ rcu_read_unlock();
+ return count;
+}
+
static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
{
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a44e568363a4..a282321329b5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -178,7 +178,24 @@ struct kvm_shadow_walk_iterator {
unsigned index;
};
-#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+static const union kvm_mmu_page_role mmu_base_role_mask = {
+ .cr0_wp = 1,
+ .cr4_pae = 1,
+ .nxe = 1,
+ .smep_andnot_wp = 1,
+ .smap_andnot_wp = 1,
+ .smm = 1,
+ .guest_mode = 1,
+ .ad_disabled = 1,
+};
+
+#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
+ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
+ (_root), (_addr)); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
for (shadow_walk_init(&(_walker), _vcpu, _addr); \
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
@@ -221,7 +238,20 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+
static void mmu_spte_set(u64 *sptep, u64 spte);
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
@@ -308,9 +338,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
{
unsigned int gen = kvm_current_mmio_generation(vcpu);
u64 mask = generation_mmio_spte_mask(gen);
+ u64 gpa = gfn << PAGE_SHIFT;
access &= ACC_WRITE_MASK | ACC_USER_MASK;
- mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
+ mask |= shadow_mmio_value | access;
+ mask |= gpa | shadow_nonpresent_or_rsvd_mask;
+ mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ << shadow_nonpresent_or_rsvd_mask_len;
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
@@ -323,8 +357,14 @@ static bool is_mmio_spte(u64 spte)
static gfn_t get_mmio_spte_gfn(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
- return (spte & ~mask) >> PAGE_SHIFT;
+ u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask |
+ shadow_nonpresent_or_rsvd_mask;
+ u64 gpa = spte & ~mask;
+
+ gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+ & shadow_nonpresent_or_rsvd_mask;
+
+ return gpa >> PAGE_SHIFT;
}
static unsigned get_mmio_spte_access(u64 spte)
@@ -381,7 +421,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-static void kvm_mmu_clear_all_pte_masks(void)
+static void kvm_mmu_reset_all_pte_masks(void)
{
shadow_user_mask = 0;
shadow_accessed_mask = 0;
@@ -391,6 +431,18 @@ static void kvm_mmu_clear_all_pte_masks(void)
shadow_mmio_mask = 0;
shadow_present_mask = 0;
shadow_acc_track_mask = 0;
+
+ /*
+ * If the CPU has 46 or less physical address bits, then set an
+ * appropriate mask to guard against L1TF attacks. Otherwise, it is
+ * assumed that the CPU is not vulnerable to L1TF.
+ */
+ if (boot_cpu_data.x86_phys_bits <
+ 52 - shadow_nonpresent_or_rsvd_mask_len)
+ shadow_nonpresent_or_rsvd_mask =
+ rsvd_bits(boot_cpu_data.x86_phys_bits -
+ shadow_nonpresent_or_rsvd_mask_len,
+ boot_cpu_data.x86_phys_bits - 1);
}
static int is_cpuid_PSE36(void)
@@ -1986,7 +2038,7 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
return 0;
}
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
{
}
@@ -2117,12 +2169,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return false;
- }
-
- if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+ if (sp->role.cr4_pae != !!is_pae(vcpu)
+ || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -2392,11 +2440,12 @@ out:
return sp;
}
-static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
- struct kvm_vcpu *vcpu, u64 addr)
+static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, hpa_t root,
+ u64 addr)
{
iterator->addr = addr;
- iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
+ iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu.shadow_root_level;
if (iterator->level == PT64_ROOT_4LEVEL &&
@@ -2405,6 +2454,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
--iterator->level;
if (iterator->level == PT32E_ROOT_LEVEL) {
+ /*
+ * prev_root is currently only used for 64-bit hosts. So only
+ * the active root_hpa is valid here.
+ */
+ BUG_ON(root != vcpu->arch.mmu.root_hpa);
+
iterator->shadow_addr
= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
@@ -2414,6 +2469,13 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
}
}
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+ addr);
+}
+
static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
if (iterator->level < PT_PAGE_TABLE_LEVEL)
@@ -2702,6 +2764,45 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
kvm_unsync_page(vcpu, sp);
}
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+ * before the SPTE is updated to allow writes because
+ * kvm_mmu_sync_roots() checks the unsync flags without holding
+ * the MMU lock and so can race with this. If the SPTE was updated
+ * before the page had been marked as unsync-ed, something like the
+ * following could happen:
+ *
+ * CPU 1 CPU 2
+ * ---------------------------------------------------------------------
+ * 1.2 Host updates SPTE
+ * to be writable
+ * 2.1 Guest writes a GPTE for GVA X.
+ * (GPTE being in the guest page table shadowed
+ * by the SP from CPU 1.)
+ * This reads SPTE during the page table walk.
+ * Since SPTE.W is read as 1, there is no
+ * fault.
+ *
+ * 2.2 Guest issues TLB flush.
+ * That causes a VM Exit.
+ *
+ * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
+ * Since it is false, so it just returns.
+ *
+ * 2.4 Guest accesses GVA X.
+ * Since the mapping in the SP was not updated,
+ * so the old mapping for GVA X incorrectly
+ * gets used.
+ * 1.1 Host marks SP
+ * as unsync
+ * (sp->unsync = true)
+ *
+ * The write barrier below ensures that 1.1 happens before 1.2 and thus
+ * the situation in 2.4 does not arise. The implicit barrier in 2.2
+ * pairs with this write barrier.
+ */
+ smp_wmb();
+
return false;
}
@@ -2724,6 +2825,10 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
return true;
}
+/* Bits which may be returned by set_spte() */
+#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
+#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
+
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int level,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
@@ -2800,7 +2905,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
- ret = 1;
+ ret |= SET_SPTE_WRITE_PROTECTED_PT;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
}
@@ -2816,7 +2921,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
if (mmu_spte_update(sptep, spte))
- kvm_flush_remote_tlbs(vcpu->kvm);
+ ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
done:
return ret;
}
@@ -2827,7 +2932,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
{
int was_rmapped = 0;
int rmap_count;
+ int set_spte_ret;
int ret = RET_PF_RETRY;
+ bool flush = false;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2844,22 +2951,25 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = true;
} else
was_rmapped = 1;
}
- if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
- true, host_writable)) {
+ set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
+ speculative, true, host_writable);
+ if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
if (write_fault)
ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
+ if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
if (unlikely(is_mmio_spte(*sptep)))
ret = RET_PF_EMULATE;
@@ -3358,26 +3468,47 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
*root_hpa = INVALID_PAGE;
}
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
{
int i;
LIST_HEAD(invalid_list);
struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
- if (!VALID_PAGE(mmu->root_hpa))
- return;
+ BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
+
+ /* Before acquiring the MMU lock, see if we need to do any real work. */
+ if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+ VALID_PAGE(mmu->prev_roots[i].hpa))
+ break;
+
+ if (i == KVM_MMU_NUM_PREV_ROOTS)
+ return;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
- mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
- } else {
- for (i = 0; i < 4; ++i)
- if (mmu->pae_root[i] != 0)
- mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
- &invalid_list);
- mmu->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+ mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+ &invalid_list);
+
+ if (free_active_root) {
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
+ mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
+ &invalid_list);
+ } else {
+ for (i = 0; i < 4; ++i)
+ if (mmu->pae_root[i] != 0)
+ mmu_free_root_page(vcpu->kvm,
+ &mmu->pae_root[i],
+ &invalid_list);
+ mmu->root_hpa = INVALID_PAGE;
+ }
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3546,7 +3677,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
return mmu_alloc_shadow_roots(vcpu);
}
-static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
@@ -3558,14 +3689,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
return;
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
+
sp = page_header(root);
+
+ /*
+ * Even if another CPU was marking the SP as unsync-ed
+ * simultaneously, any guest page table changes are not
+ * guaranteed to be visible anyway until this VCPU issues a TLB
+ * flush strictly after those changes are made. We only need to
+ * ensure that the other CPU sets these flags before any actual
+ * changes to the page tables are made. The comments in
+ * mmu_need_write_protect() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ if (!smp_load_acquire(&sp->unsync) &&
+ !smp_load_acquire(&sp->unsync_children))
+ return;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
mmu_sync_children(vcpu, sp);
+
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -3575,13 +3731,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_children(vcpu, sp);
}
}
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-}
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
+ kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
spin_unlock(&vcpu->kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
@@ -3948,16 +4099,107 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
context->update_pte = nonpaging_update_pte;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
context->direct_map = true;
context->nx = false;
}
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
+/*
+ * Find out if a previously cached root matching the new CR3/role is available.
+ * The current root is also inserted into the cache.
+ * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
+ * returned.
+ * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
+ * false is returned. This root should now be freed by the caller.
+ */
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+ struct kvm_mmu_root_info root;
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ root.cr3 = mmu->get_cr3(vcpu);
+ root.hpa = mmu->root_hpa;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ swap(root, mmu->prev_roots[i]);
+
+ if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
+ page_header(root.hpa) != NULL &&
+ new_role.word == page_header(root.hpa)->role.word)
+ break;
+ }
+
+ mmu->root_hpa = root.hpa;
+
+ return i < KVM_MMU_NUM_PREV_ROOTS;
+}
+
+static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role,
+ bool skip_tlb_flush)
{
- kvm_mmu_free_roots(vcpu);
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ /*
+ * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
+ * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
+ * later if necessary.
+ */
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ mmu->root_level >= PT64_ROOT_4LEVEL) {
+ if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
+ return false;
+
+ if (cached_root_available(vcpu, new_cr3, new_role)) {
+ /*
+ * It is possible that the cached previous root page is
+ * obsolete because of a change in the MMU
+ * generation number. However, that is accompanied by
+ * KVM_REQ_MMU_RELOAD, which will free the root that we
+ * have set here and allocate a new one.
+ */
+
+ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
+ if (!skip_tlb_flush) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_x86_ops->tlb_flush(vcpu, true);
+ }
+
+ /*
+ * The last MMIO access's GVA and GPA are cached in the
+ * VCPU. When switching to a new CR3, that GVA->GPA
+ * mapping may no longer be valid. So clear any cached
+ * MMIO info even when we don't need to sync the shadow
+ * page tables.
+ */
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ __clear_sp_write_flooding_count(
+ page_header(mmu->root_hpa));
+
+ return true;
+ }
+ }
+
+ return false;
}
+static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role,
+ bool skip_tlb_flush)
+{
+ if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
+ kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+}
+
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
+{
+ __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
+ skip_tlb_flush);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
+
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
return kvm_read_cr3(vcpu);
@@ -4432,7 +4674,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
context->invlpg = paging64_invlpg;
context->update_pte = paging64_update_pte;
context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
context->direct_map = false;
}
@@ -4462,7 +4703,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
context->invlpg = paging32_invlpg;
context->update_pte = paging32_update_pte;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
context->direct_map = false;
}
@@ -4472,20 +4712,32 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_page_role role = {0};
+
+ role.guest_mode = is_guest_mode(vcpu);
+ role.smm = is_smm(vcpu);
+ role.ad_disabled = (shadow_accessed_mask == 0);
+ role.level = kvm_x86_ops->get_tdp_level(vcpu);
+ role.direct = true;
+ role.access = ACC_ALL;
+
+ return role;
+}
+
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
- context->base_role.word = 0;
- context->base_role.guest_mode = is_guest_mode(vcpu);
- context->base_role.smm = is_smm(vcpu);
- context->base_role.ad_disabled = (shadow_accessed_mask == 0);
+ context->base_role.word = mmu_base_role_mask.word &
+ kvm_calc_tdp_mmu_root_page_role(vcpu).word;
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
- context->root_hpa = INVALID_PAGE;
context->direct_map = true;
context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
context->get_cr3 = get_cr3;
@@ -4520,13 +4772,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+static union kvm_mmu_page_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
{
+ union kvm_mmu_page_role role = {0};
bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
- struct kvm_mmu *context = &vcpu->arch.mmu;
- MMU_WARN_ON(VALID_PAGE(context->root_hpa));
+ role.nxe = is_nx(vcpu);
+ role.cr4_pae = !!is_pae(vcpu);
+ role.cr0_wp = is_write_protection(vcpu);
+ role.smep_andnot_wp = smep && !is_write_protection(vcpu);
+ role.smap_andnot_wp = smap && !is_write_protection(vcpu);
+ role.guest_mode = is_guest_mode(vcpu);
+ role.smm = is_smm(vcpu);
+ role.direct = !is_paging(vcpu);
+ role.access = ACC_ALL;
+
+ if (!is_long_mode(vcpu))
+ role.level = PT32E_ROOT_LEVEL;
+ else if (is_la57_mode(vcpu))
+ role.level = PT64_ROOT_5LEVEL;
+ else
+ role.level = PT64_ROOT_4LEVEL;
+
+ return role;
+}
+
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
if (!is_paging(vcpu))
nonpaging_init_context(vcpu, context);
@@ -4537,26 +4812,34 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
else
paging32_init_context(vcpu, context);
- context->base_role.nxe = is_nx(vcpu);
- context->base_role.cr4_pae = !!is_pae(vcpu);
- context->base_role.cr0_wp = is_write_protection(vcpu);
- context->base_role.smep_andnot_wp
- = smep && !is_write_protection(vcpu);
- context->base_role.smap_andnot_wp
- = smap && !is_write_protection(vcpu);
- context->base_role.guest_mode = is_guest_mode(vcpu);
- context->base_role.smm = is_smm(vcpu);
+ context->base_role.word = mmu_base_role_mask.word &
+ kvm_calc_shadow_mmu_root_page_role(vcpu).word;
reset_shadow_zero_bits_mask(vcpu, context);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+static union kvm_mmu_page_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+
+ role.level = PT64_ROOT_4LEVEL;
+ role.direct = false;
+ role.ad_disabled = !accessed_dirty;
+ role.guest_mode = true;
+ role.access = ACC_ALL;
+
+ return role;
+}
+
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty)
+ bool accessed_dirty, gpa_t new_eptp)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ union kvm_mmu_page_role root_page_role =
+ kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
- MMU_WARN_ON(VALID_PAGE(context->root_hpa));
-
+ __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
context->shadow_root_level = PT64_ROOT_4LEVEL;
context->nx = true;
@@ -4567,10 +4850,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->invlpg = ept_invlpg;
context->update_pte = ept_update_pte;
context->root_level = PT64_ROOT_4LEVEL;
- context->root_hpa = INVALID_PAGE;
context->direct_map = false;
- context->base_role.ad_disabled = !accessed_dirty;
- context->base_role.guest_mode = 1;
+ context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
update_last_nonleaf_level(vcpu, context);
@@ -4633,8 +4914,17 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
update_last_nonleaf_level(vcpu, g_context);
}
-static void init_kvm_mmu(struct kvm_vcpu *vcpu)
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
{
+ if (reset_roots) {
+ uint i;
+
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ }
+
if (mmu_is_nested(vcpu))
init_kvm_nested_mmu(vcpu);
else if (tdp_enabled)
@@ -4642,11 +4932,21 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu)
else
init_kvm_softmmu(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_init_mmu);
+
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
+{
+ if (tdp_enabled)
+ return kvm_calc_tdp_mmu_root_page_role(vcpu);
+ else
+ return kvm_calc_shadow_mmu_root_page_role(vcpu);
+}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
- init_kvm_mmu(vcpu);
+ kvm_init_mmu(vcpu, true);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
@@ -4661,8 +4961,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_sync_roots(vcpu);
if (r)
goto out;
- /* set_cr3() should ensure TLB has been flushed */
- vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+ kvm_mmu_load_cr3(vcpu);
+ kvm_x86_ops->tlb_flush(vcpu, true);
out:
return r;
}
@@ -4670,7 +4970,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- kvm_mmu_free_roots(vcpu);
+ kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
@@ -4823,16 +5123,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 entry, gentry, *spte;
int npte;
bool remote_flush, local_flush;
- union kvm_mmu_page_role mask = { };
-
- mask.cr0_wp = 1;
- mask.cr4_pae = 1;
- mask.nxe = 1;
- mask.smep_andnot_wp = 1;
- mask.smap_andnot_wp = 1;
- mask.smm = 1;
- mask.guest_mode = 1;
- mask.ad_disabled = 1;
/*
* If we don't have indirect shadow pages, it means no page is
@@ -4876,7 +5166,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mmu_page_zap_pte(vcpu->kvm, sp, spte);
if (gentry &&
!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
- & mask.word) && rmap_can_add(vcpu))
+ & mmu_base_role_mask.word) && rmap_can_add(vcpu))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
if (need_remote_flush(entry, *spte))
remote_flush = true;
@@ -5001,12 +5291,67 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- vcpu->arch.mmu.invlpg(vcpu, gva);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ int i;
+
+ /* INVLPG on a * non-canonical address is a NOP according to the SDM. */
+ if (is_noncanonical_address(gva, vcpu))
+ return;
+
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
+
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Since it would take us roughly similar amount
+ * of work to determine whether any of the prev_root mappings of the VA
+ * is marked global, or to just sync it blindly, so we might as well
+ * just always sync it.
+ *
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (VALID_PAGE(mmu->prev_roots[i].hpa))
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+
+ kvm_x86_ops->tlb_flush_gva(vcpu, gva);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ bool tlb_flush = false;
+ uint i;
+
+ if (pcid == kvm_get_active_pcid(vcpu)) {
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ tlb_flush = true;
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ tlb_flush = true;
+ }
+ }
+
+ if (tlb_flush)
+ kvm_x86_ops->tlb_flush_gva(vcpu, gva);
+
+ ++vcpu->stat.invlpg;
+
+ /*
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
+
void kvm_enable_tdp(void)
{
tdp_enabled = true;
@@ -5030,6 +5375,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
struct page *page;
int i;
+ if (tdp_enabled)
+ return 0;
+
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
@@ -5048,11 +5396,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
+ uint i;
+
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
vcpu->arch.mmu.translate_gpa = translate_gpa;
vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
return alloc_mmu_pages(vcpu);
}
@@ -5060,7 +5413,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
- init_kvm_mmu(vcpu);
+ kvm_init_mmu(vcpu, true);
}
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
@@ -5500,7 +5853,7 @@ int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;
- kvm_mmu_clear_all_pte_masks();
+ kvm_mmu_reset_all_pte_masks();
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b408c0ad612..1fab69c0b2f3 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -61,9 +61,10 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty);
+ bool accessed_dirty, gpa_t new_eptp);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
@@ -85,6 +86,27 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
return kvm_mmu_load(vcpu);
}
+static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
+
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)
+ ? cr3 & X86_CR3_PCID_MASK
+ : 0;
+}
+
+static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
+{
+ return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
+}
+
+static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
+{
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa |
+ kvm_get_active_pcid(vcpu));
+}
+
/*
* Currently, we have two sorts of write-protection, a) the first one
* write-protects guest page to sync the guest modification, b) another one is
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6288e9d7068e..14ffd973df54 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -181,7 +181,7 @@ no_present:
* set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
* to signify readability since it isn't used in the EPT case
*/
-static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+static inline unsigned FNAME(gpte_access)(u64 gpte)
{
unsigned access;
#if PTTYPE == PTTYPE_EPT
@@ -394,8 +394,8 @@ retry_walk:
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
- walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
- walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask);
+ walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+ walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
goto error;
@@ -508,7 +508,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
@@ -856,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -871,13 +871,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
*/
mmu_topup_memory_caches(vcpu);
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+ if (!VALID_PAGE(root_hpa)) {
WARN_ON(1);
return;
}
spin_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry(vcpu, gva, iterator) {
+ for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
level = iterator.level;
sptep = iterator.sptep;
@@ -968,6 +968,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
int i, nr_present = 0;
bool host_writable;
gpa_t first_pte_gpa;
+ int set_spte_ret = 0;
/* direct kvm_mmu_page can not be unsync. */
BUG_ON(sp->role.direct);
@@ -1002,7 +1003,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
- pte_access &= FNAME(gpte_access)(vcpu, gpte);
+ pte_access &= FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
@@ -1024,12 +1025,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
- set_spte(vcpu, &sp->spt[i], pte_access,
- PT_PAGE_TABLE_LEVEL, gfn,
- spte_to_pfn(sp->spt[i]), true, false,
- host_writable);
+ set_spte_ret |= set_spte(vcpu, &sp->spt[i],
+ pte_access, PT_PAGE_TABLE_LEVEL,
+ gfn, spte_to_pfn(sp->spt[i]),
+ true, false, host_writable);
}
+ if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
return nr_present;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f059a73f0fd0..73e27a98456f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2884,7 +2884,6 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
svm->vmcb->control.nested_cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_NPT);
- svm_flush_tlb(vcpu, true);
}
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
@@ -5435,6 +5434,13 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
svm->asid_generation--;
}
+static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ invlpga(gva, svm->vmcb->control.asid);
+}
+
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
}
@@ -5766,7 +5772,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
svm->vmcb->save.cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_CR);
- svm_flush_tlb(vcpu, true);
}
static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -5779,8 +5784,6 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
/* Also sync guest cr3 here in case we live migrate */
svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
mark_dirty(svm->vmcb, VMCB_CR);
-
- svm_flush_tlb(vcpu, true);
}
static int is_disabled(void)
@@ -7090,6 +7093,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.set_rflags = svm_set_rflags,
.tlb_flush = svm_flush_tlb,
+ .tlb_flush_gva = svm_flush_tlb_gva,
.run = svm_vcpu_run,
.handle_exit = handle_exit,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 46b428c0990e..1519f030fd73 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -38,6 +38,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
+#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
@@ -332,23 +333,54 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+enum ept_pointers_status {
+ EPT_POINTERS_CHECK = 0,
+ EPT_POINTERS_MATCH = 1,
+ EPT_POINTERS_MISMATCH = 2
+};
+
struct kvm_vmx {
struct kvm kvm;
unsigned int tss_addr;
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
+
+ enum ept_pointers_status ept_pointers_match;
+ spinlock_t ept_pointer_lock;
};
#define NR_AUTOLOAD_MSRS 8
+struct vmcs_hdr {
+ u32 revision_id:31;
+ u32 shadow_vmcs:1;
+};
+
struct vmcs {
- u32 revision_id;
+ struct vmcs_hdr hdr;
u32 abort;
char data[0];
};
/*
+ * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
+ * and whose values change infrequently, but are not constant. I.e. this is
+ * used as a write-through cache of the corresponding VMCS fields.
+ */
+struct vmcs_host_state {
+ unsigned long cr3; /* May not match real cr3 */
+ unsigned long cr4; /* May not match real cr4 */
+ unsigned long gs_base;
+ unsigned long fs_base;
+
+ u16 fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+ u16 ds_sel, es_sel;
+#endif
+};
+
+/*
* Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
* remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
* loaded on this CPU (so we can clear them if the CPU goes down).
@@ -359,14 +391,13 @@ struct loaded_vmcs {
int cpu;
bool launched;
bool nmi_known_unmasked;
- unsigned long vmcs_host_cr3; /* May not match real cr3 */
- unsigned long vmcs_host_cr4; /* May not match real cr4 */
/* Support for vnmi-less CPUs */
int soft_vnmi_blocked;
ktime_t entry_time;
s64 vnmi_blocked_time;
unsigned long *msr_bitmap;
struct list_head loaded_vmcss_on_cpu_link;
+ struct vmcs_host_state host_state;
};
struct shared_msr_entry {
@@ -397,7 +428,7 @@ struct __packed vmcs12 {
/* According to the Intel spec, a VMCS region must start with the
* following two fields. Then follow implementation-specific data.
*/
- u32 revision_id;
+ struct vmcs_hdr hdr;
u32 abort;
u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
@@ -565,7 +596,7 @@ struct __packed vmcs12 {
"Offset of " #field " in struct vmcs12 has changed.")
static inline void vmx_check_vmcs12_offsets(void) {
- CHECK_OFFSET(revision_id, 0);
+ CHECK_OFFSET(hdr, 0);
CHECK_OFFSET(abort, 4);
CHECK_OFFSET(launch_state, 8);
CHECK_OFFSET(io_bitmap_a, 40);
@@ -784,6 +815,12 @@ struct nested_vmx {
*/
struct vmcs12 *cached_vmcs12;
/*
+ * Cache of the guest's shadow VMCS, existing outside of guest
+ * memory. Loaded from guest memory during VM entry. Flushed
+ * to guest memory during VM exit.
+ */
+ struct vmcs12 *cached_shadow_vmcs12;
+ /*
* Indicates if the shadow vmcs must be updated with the
* data hold by vmcs12
*/
@@ -933,25 +970,20 @@ struct vcpu_vmx {
/*
* loaded_vmcs points to the VMCS currently used in this vcpu. For a
* non-nested (L1) guest, it always points to vmcs01. For a nested
- * guest (L2), it points to a different VMCS.
+ * guest (L2), it points to a different VMCS. loaded_cpu_state points
+ * to the VMCS whose state is loaded into the CPU registers that only
+ * need to be switched when transitioning to/from the kernel; a NULL
+ * value indicates that host state is loaded.
*/
struct loaded_vmcs vmcs01;
struct loaded_vmcs *loaded_vmcs;
+ struct loaded_vmcs *loaded_cpu_state;
bool __launched; /* temporary, used in vmx_vcpu_run */
struct msr_autoload {
struct vmx_msrs guest;
struct vmx_msrs host;
} msr_autoload;
- struct {
- int loaded;
- u16 fs_sel, gs_sel, ldt_sel;
-#ifdef CONFIG_X86_64
- u16 ds_sel, es_sel;
-#endif
- int gs_ldt_reload_needed;
- int fs_reload_needed;
- u64 msr_host_bndcfgs;
- } host_state;
+
struct {
int vm86_active;
ulong save_rflags;
@@ -1001,6 +1033,7 @@ struct vcpu_vmx {
*/
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
+ u64 ept_pointer;
};
enum segment_cache_field {
@@ -1220,6 +1253,11 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->nested.cached_vmcs12;
}
+static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
+}
+
static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
@@ -1490,6 +1528,48 @@ static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
* GUEST_IA32_RTIT_CTL = 0x00002814,
*/
}
+
+/* check_ept_pointer() should be under protection of ept_pointer_lock. */
+static void check_ept_pointer_match(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ u64 tmp_eptp = INVALID_PAGE;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!VALID_PAGE(tmp_eptp)) {
+ tmp_eptp = to_vmx(vcpu)->ept_pointer;
+ } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
+ to_kvm_vmx(kvm)->ept_pointers_match
+ = EPT_POINTERS_MISMATCH;
+ return;
+ }
+ }
+
+ to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
+}
+
+static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
+{
+ int ret;
+
+ spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+
+ if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
+ check_ept_pointer_match(kvm);
+
+ if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+ ret = -ENOTSUPP;
+ goto out;
+ }
+
+ ret = hyperv_flush_guest_mapping(
+ to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
+
+out:
+ spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ return ret;
+}
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
@@ -1864,6 +1944,12 @@ static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
CPU_BASED_MONITOR_TRAP_FLAG;
}
+static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
return vmcs12->cpu_based_vm_exec_control & bit;
@@ -1944,6 +2030,11 @@ static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
VMX_VMFUNC_EPTP_SWITCHING);
}
+static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
+}
+
static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1974,11 +2065,12 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
u64 rsvd : 48;
u64 gva;
} operand = { vpid, 0, gva };
+ bool error;
- asm volatile (__ex(ASM_VMX_INVVPID)
- /* CF==1 or ZF==1 --> rc = -1 */
- "; ja 1f ; ud2 ; 1:"
- : : "a"(&operand), "c"(ext) : "cc", "memory");
+ asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
+ : "memory");
+ BUG_ON(error);
}
static inline void __invept(int ext, u64 eptp, gpa_t gpa)
@@ -1986,11 +2078,12 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
struct {
u64 eptp, gpa;
} operand = {eptp, gpa};
+ bool error;
- asm volatile (__ex(ASM_VMX_INVEPT)
- /* CF==1 or ZF==1 --> rc = -1 */
- "; ja 1f ; ud2 ; 1:\n"
- : : "a" (&operand), "c" (ext) : "cc", "memory");
+ asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
+ : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
+ : "memory");
+ BUG_ON(error);
}
static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
@@ -2006,12 +2099,12 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
static void vmcs_clear(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
- u8 error;
+ bool error;
- asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
- : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc", "memory");
- if (error)
+ asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
+ : "memory");
+ if (unlikely(error))
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
vmcs, phys_addr);
}
@@ -2028,15 +2121,15 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
static void vmcs_load(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
- u8 error;
+ bool error;
if (static_branch_unlikely(&enable_evmcs))
return evmcs_load(phys_addr);
- asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc", "memory");
- if (error)
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
+ : "memory");
+ if (unlikely(error))
printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
vmcs, phys_addr);
}
@@ -2114,6 +2207,19 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
__loaded_vmcs_clear, loaded_vmcs, 1);
}
+static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+ if (vpid == 0)
+ return true;
+
+ if (cpu_has_vmx_invvpid_individual_addr()) {
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+ return true;
+ }
+
+ return false;
+}
+
static inline void vpid_sync_vcpu_single(int vpid)
{
if (vpid == 0)
@@ -2248,10 +2354,10 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value)
static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
{
- u8 error;
+ bool error;
- asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
- : "=q"(error) : "a"(value), "d"(field) : "cc");
+ asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(value), "d"(field));
if (unlikely(error))
vmwrite_error(field, value);
}
@@ -2735,121 +2841,150 @@ static unsigned long segment_base(u16 selector)
}
#endif
-static void vmx_save_host_state(struct kvm_vcpu *vcpu)
+static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
- unsigned long fs_base, kernel_gs_base;
#endif
+ unsigned long fs_base, gs_base;
+ u16 fs_sel, gs_sel;
int i;
- if (vmx->host_state.loaded)
+ if (vmx->loaded_cpu_state)
return;
- vmx->host_state.loaded = 1;
+ vmx->loaded_cpu_state = vmx->loaded_vmcs;
+ host_state = &vmx->loaded_cpu_state->host_state;
+
/*
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
* allow segment selectors with cpl > 0 or ti == 1.
*/
- vmx->host_state.ldt_sel = kvm_read_ldt();
- vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+ host_state->ldt_sel = kvm_read_ldt();
#ifdef CONFIG_X86_64
+ savesegment(ds, host_state->ds_sel);
+ savesegment(es, host_state->es_sel);
+
+ gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
save_fsgs_for_kvm();
- vmx->host_state.fs_sel = current->thread.fsindex;
- vmx->host_state.gs_sel = current->thread.gsindex;
+ fs_sel = current->thread.fsindex;
+ gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
- kernel_gs_base = current->thread.gsbase;
+ vmx->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
-#endif
- savesegment(fs, vmx->host_state.fs_sel);
- savesegment(gs, vmx->host_state.gs_sel);
-#ifdef CONFIG_X86_64
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
fs_base = read_msr(MSR_FS_BASE);
- kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
- }
-#endif
- if (!(vmx->host_state.fs_sel & 7)) {
- vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
- vmx->host_state.fs_reload_needed = 0;
- } else {
- vmcs_write16(HOST_FS_SELECTOR, 0);
- vmx->host_state.fs_reload_needed = 1;
+ vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
- if (!(vmx->host_state.gs_sel & 7))
- vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
- else {
- vmcs_write16(HOST_GS_SELECTOR, 0);
- vmx->host_state.gs_ldt_reload_needed = 1;
- }
-
-#ifdef CONFIG_X86_64
- savesegment(ds, vmx->host_state.ds_sel);
- savesegment(es, vmx->host_state.es_sel);
- vmcs_writel(HOST_FS_BASE, fs_base);
- vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
-
- vmx->msr_host_kernel_gs_base = kernel_gs_base;
if (is_long_mode(&vmx->vcpu))
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
- vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
- vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = segment_base(fs_sel);
+ gs_base = segment_base(gs_sel);
#endif
- if (boot_cpu_has(X86_FEATURE_MPX))
- rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
+
+ if (unlikely(fs_sel != host_state->fs_sel)) {
+ if (!(fs_sel & 7))
+ vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+ else
+ vmcs_write16(HOST_FS_SELECTOR, 0);
+ host_state->fs_sel = fs_sel;
+ }
+ if (unlikely(gs_sel != host_state->gs_sel)) {
+ if (!(gs_sel & 7))
+ vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+ else
+ vmcs_write16(HOST_GS_SELECTOR, 0);
+ host_state->gs_sel = gs_sel;
+ }
+ if (unlikely(fs_base != host_state->fs_base)) {
+ vmcs_writel(HOST_FS_BASE, fs_base);
+ host_state->fs_base = fs_base;
+ }
+ if (unlikely(gs_base != host_state->gs_base)) {
+ vmcs_writel(HOST_GS_BASE, gs_base);
+ host_state->gs_base = gs_base;
+ }
+
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
vmx->guest_msrs[i].data,
vmx->guest_msrs[i].mask);
}
-static void __vmx_load_host_state(struct vcpu_vmx *vmx)
+static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
- if (!vmx->host_state.loaded)
+ struct vmcs_host_state *host_state;
+
+ if (!vmx->loaded_cpu_state)
return;
+ WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
+ host_state = &vmx->loaded_cpu_state->host_state;
+
++vmx->vcpu.stat.host_state_reload;
- vmx->host_state.loaded = 0;
+ vmx->loaded_cpu_state = NULL;
+
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu))
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
- if (vmx->host_state.gs_ldt_reload_needed) {
- kvm_load_ldt(vmx->host_state.ldt_sel);
+ if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
+ kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
- load_gs_index(vmx->host_state.gs_sel);
+ load_gs_index(host_state->gs_sel);
#else
- loadsegment(gs, vmx->host_state.gs_sel);
+ loadsegment(gs, host_state->gs_sel);
#endif
}
- if (vmx->host_state.fs_reload_needed)
- loadsegment(fs, vmx->host_state.fs_sel);
+ if (host_state->fs_sel & 7)
+ loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
- if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
- loadsegment(ds, vmx->host_state.ds_sel);
- loadsegment(es, vmx->host_state.es_sel);
+ if (unlikely(host_state->ds_sel | host_state->es_sel)) {
+ loadsegment(ds, host_state->ds_sel);
+ loadsegment(es, host_state->es_sel);
}
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
- if (vmx->host_state.msr_host_bndcfgs)
- wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
load_fixmap_gdt(raw_smp_processor_id());
}
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
+#ifdef CONFIG_X86_64
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
- preempt_disable();
- __vmx_load_host_state(vmx);
- preempt_enable();
+ if (is_long_mode(&vmx->vcpu)) {
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ rdmsrl(MSR_KERNEL_GS_BASE,
+ vmx->msr_guest_kernel_gs_base);
+ preempt_enable();
+ }
+ return vmx->msr_guest_kernel_gs_base;
}
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+ if (is_long_mode(&vmx->vcpu)) {
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ wrmsrl(MSR_KERNEL_GS_BASE, data);
+ preempt_enable();
+ }
+ vmx->msr_guest_kernel_gs_base = data;
+}
+#endif
+
static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -2991,7 +3126,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
- __vmx_load_host_state(to_vmx(vcpu));
+ vmx_prepare_switch_to_host(to_vmx(vcpu));
}
static bool emulation_required(struct kvm_vcpu *vcpu)
@@ -3212,7 +3347,7 @@ static bool vmx_rdtscp_supported(void)
static bool vmx_invpcid_supported(void)
{
- return cpu_has_vmx_invpcid() && enable_ept;
+ return cpu_has_vmx_invpcid();
}
/*
@@ -3455,6 +3590,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING;
+ /*
+ * We can emulate "VMCS shadowing," even if the hardware
+ * doesn't support it.
+ */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_SHADOW_VMCS;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
@@ -3922,8 +4063,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
- vmx_load_host_state(vmx);
- msr_info->data = vmx->msr_guest_kernel_gs_base;
+ msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
break;
#endif
case MSR_EFER:
@@ -4023,8 +4163,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmcs_writel(GUEST_GS_BASE, data);
break;
case MSR_KERNEL_GS_BASE:
- vmx_load_host_state(vmx);
- vmx->msr_guest_kernel_gs_base = data;
+ vmx_write_guest_kernel_gs_base(vmx, data);
break;
#endif
case MSR_IA32_SYSENTER_CS:
@@ -4559,7 +4698,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
return 0;
}
-static struct vmcs *alloc_vmcs_cpu(int cpu)
+static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
{
int node = cpu_to_node(cpu);
struct page *pages;
@@ -4573,10 +4712,12 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
/* KVM supports Enlightened VMCS v1 only */
if (static_branch_unlikely(&enable_evmcs))
- vmcs->revision_id = KVM_EVMCS_VERSION;
+ vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
- vmcs->revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmcs_config.revision_id;
+ if (shadow)
+ vmcs->hdr.shadow_vmcs = 1;
return vmcs;
}
@@ -4600,14 +4741,14 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}
-static struct vmcs *alloc_vmcs(void)
+static struct vmcs *alloc_vmcs(bool shadow)
{
- return alloc_vmcs_cpu(raw_smp_processor_id());
+ return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
}
static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
- loaded_vmcs->vmcs = alloc_vmcs();
+ loaded_vmcs->vmcs = alloc_vmcs(false);
if (!loaded_vmcs->vmcs)
return -ENOMEM;
@@ -4629,6 +4770,9 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
}
+
+ memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+
return 0;
out_vmcs:
@@ -4738,7 +4882,7 @@ static __init int alloc_kvm_area(void)
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
- vmcs = alloc_vmcs_cpu(cpu);
+ vmcs = alloc_vmcs_cpu(false, cpu);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
@@ -4755,7 +4899,7 @@ static __init int alloc_kvm_area(void)
* physical CPU.
*/
if (static_branch_unlikely(&enable_evmcs))
- vmcs->revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmcs_config.revision_id;
per_cpu(vmxarea, cpu) = vmcs;
}
@@ -4912,10 +5056,18 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return;
/*
- * Force kernel_gs_base reloading before EFER changes, as control
- * of this msr depends on is_long_mode().
+ * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
+ * 64-bit mode as a 64-bit kernel may frequently access the
+ * MSR. This means we need to manually save/restore the MSR
+ * when switching between guest and host state, but only if
+ * the guest is in 64-bit mode. Sync our cached value if the
+ * guest is transitioning to 32-bit mode and the CPU contains
+ * guest state, i.e. the cache is stale.
*/
- vmx_load_host_state(to_vmx(vcpu));
+#ifdef CONFIG_X86_64
+ if (!(efer & EFER_LMA))
+ (void)vmx_read_guest_kernel_gs_base(vmx);
+#endif
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
@@ -4972,6 +5124,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
}
+static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ int vpid = to_vmx(vcpu)->vpid;
+
+ if (!vpid_sync_vcpu_addr(vpid, addr))
+ vpid_sync_context(vpid);
+
+ /*
+ * If VPIDs are not supported or enabled, then the above is a no-op.
+ * But we don't really need a TLB flush in that case anyway, because
+ * each VM entry/exit includes an implicit flush when VPID is 0.
+ */
+}
+
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -5153,6 +5319,7 @@ static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm *kvm = vcpu->kvm;
unsigned long guest_cr3;
u64 eptp;
@@ -5160,15 +5327,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(vcpu, cr3);
vmcs_write64(EPT_POINTER, eptp);
+
+ if (kvm_x86_ops->tlb_remote_flush) {
+ spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ to_vmx(vcpu)->ept_pointer = eptp;
+ to_kvm_vmx(kvm)->ept_pointers_match
+ = EPT_POINTERS_CHECK;
+ spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ }
+
if (enable_unrestricted_guest || is_paging(vcpu) ||
is_guest_mode(vcpu))
guest_cr3 = kvm_read_cr3(vcpu);
else
- guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr;
+ guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
- vmx_flush_tlb(vcpu, true);
vmcs_writel(GUEST_CR3, guest_cr3);
}
@@ -6104,19 +6279,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
*/
cr3 = __read_cr3();
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
- vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
/* Save the most likely value for this task's CR4 in the VMCS. */
cr4 = cr4_read_shadow();
vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
- vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
#ifdef CONFIG_X86_64
/*
* Load null selectors, so we can avoid reloading them in
- * __vmx_load_host_state(), in case userspace uses the null selectors
- * too (the expected case).
+ * vmx_prepare_switch_to_host(), in case userspace uses
+ * the null selectors too (the expected case).
*/
vmcs_write16(HOST_DS_SELECTOR, 0);
vmcs_write16(HOST_ES_SELECTOR, 0);
@@ -6241,8 +6416,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
enable_unrestricted_guest = 0;
- /* Enable INVPCID for non-ept guests may cause performance regression. */
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
}
if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -6371,9 +6544,6 @@ static void ept_set_mmio_spte_mask(void)
*/
static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
-#ifdef CONFIG_X86_64
- unsigned long a;
-#endif
int i;
if (enable_shadow_vmcs) {
@@ -6428,15 +6598,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
vmx_set_constant_host_state(vmx);
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_FS_BASE, a);
- vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
- rdmsrl(MSR_GS_BASE, a);
- vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
if (cpu_has_vmx_vmfunc())
vmcs_write64(VM_FUNCTION_CONTROL, 0);
@@ -7670,6 +7833,7 @@ static void vmx_enable_tdp(void)
static __init int hardware_setup(void)
{
+ unsigned long host_bndcfgs;
int r = -ENOMEM, i;
rdmsrl_safe(MSR_EFER, &host_efer);
@@ -7694,6 +7858,11 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
+ if (boot_cpu_has(X86_FEATURE_MPX)) {
+ rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
+ WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ }
+
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
enable_vpid = 0;
@@ -7730,6 +7899,12 @@ static __init int hardware_setup(void)
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
+ && enable_ept)
+ kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
+#endif
+
if (!cpu_has_vmx_ple()) {
ple_gap = 0;
ple_window = 0;
@@ -7756,6 +7931,11 @@ static __init int hardware_setup(void)
else
kvm_disable_tdp();
+ if (!nested) {
+ kvm_x86_ops->get_nested_state = NULL;
+ kvm_x86_ops->set_nested_state = NULL;
+ }
+
/*
* Only enable PML when hardware supports PML feature, and both EPT
* and EPT A/D bit features are enabled -- PML depends on them to work.
@@ -8032,10 +8212,35 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
return 0;
}
+/*
+ * Allocate a shadow VMCS and associate it with the currently loaded
+ * VMCS, unless such a shadow VMCS already exists. The newly allocated
+ * VMCS is also VMCLEARed, so that it is ready for use.
+ */
+static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
+
+ /*
+ * We should allocate a shadow vmcs for vmcs01 only when L1
+ * executes VMXON and free it when L1 executes VMXOFF.
+ * As it is invalid to execute VMXON twice, we shouldn't reach
+ * here when vmcs01 already have an allocated shadow vmcs.
+ */
+ WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+
+ if (!loaded_vmcs->shadow_vmcs) {
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+ }
+ return loaded_vmcs->shadow_vmcs;
+}
+
static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmcs *shadow_vmcs;
int r;
r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
@@ -8046,16 +8251,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
- if (enable_shadow_vmcs) {
- shadow_vmcs = alloc_vmcs();
- if (!shadow_vmcs)
- goto out_shadow_vmcs;
- /* mark vmcs as shadow */
- shadow_vmcs->revision_id |= (1u << 31);
- /* init shadow vmcs */
- vmcs_clear(shadow_vmcs);
- vmx->vmcs01.shadow_vmcs = shadow_vmcs;
- }
+ vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_shadow_vmcs12)
+ goto out_cached_shadow_vmcs12;
+
+ if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
+ goto out_shadow_vmcs;
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED);
@@ -8067,6 +8268,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
return 0;
out_shadow_vmcs:
+ kfree(vmx->nested.cached_shadow_vmcs12);
+
+out_cached_shadow_vmcs12:
kfree(vmx->nested.cached_vmcs12);
out_cached_vmcs12:
@@ -8109,7 +8313,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
/* CPL=0 must be checked manually. */
if (vmx_get_cpl(vcpu)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -8172,15 +8376,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
*/
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
- if (vmx_get_cpl(vcpu)) {
+ if (!to_vmx(vcpu)->nested.vmxon) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 0;
}
- if (!to_vmx(vcpu)->nested.vmxon) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
return 0;
}
+
return 1;
}
@@ -8233,6 +8438,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->vmcs01.shadow_vmcs = NULL;
}
kfree(vmx->nested.cached_vmcs12);
+ kfree(vmx->nested.cached_shadow_vmcs12);
/* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -8318,7 +8524,7 @@ static int handle_vmresume(struct kvm_vcpu *vcpu)
* some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
* 64-bit fields are to be returned).
*/
-static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
+static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
unsigned long field, u64 *ret)
{
short offset = vmcs_field_to_offset(field);
@@ -8327,7 +8533,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
if (offset < 0)
return offset;
- p = ((char *)(get_vmcs12(vcpu))) + offset;
+ p = (char *)vmcs12 + offset;
switch (vmcs_field_width(field)) {
case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
@@ -8349,10 +8555,10 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
}
-static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
+static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
unsigned long field, u64 field_value){
short offset = vmcs_field_to_offset(field);
- char *p = ((char *) get_vmcs12(vcpu)) + offset;
+ char *p = (char *)vmcs12 + offset;
if (offset < 0)
return offset;
@@ -8405,7 +8611,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
for (i = 0; i < max_fields[q]; i++) {
field = fields[q][i];
field_value = __vmcs_readl(field);
- vmcs12_write_any(&vmx->vcpu, field, field_value);
+ vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
}
/*
* Skip the VM-exit information fields if they are read-only.
@@ -8440,7 +8646,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
for (q = 0; q < ARRAY_SIZE(fields); q++) {
for (i = 0; i < max_fields[q]; i++) {
field = fields[q][i];
- vmcs12_read_any(&vmx->vcpu, field, &field_value);
+ vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
__vmcs_writel(field, field_value);
}
}
@@ -8470,6 +8676,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
gva_t gva = 0;
+ struct vmcs12 *vmcs12;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -8477,10 +8684,24 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_vmcs12(vcpu))
return kvm_skip_emulated_instruction(vcpu);
+ if (!is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ else {
+ /*
+ * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
+ * to shadowed-field sets the ALU flags for VMfailInvalid.
+ */
+ if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ vmcs12 = get_shadow_vmcs12(vcpu);
+ }
+
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
- if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
+ if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -8522,6 +8743,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
*/
u64 field_value = 0;
struct x86_exception e;
+ struct vmcs12 *vmcs12;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -8556,23 +8778,44 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
- if (vmcs12_write_any(vcpu, field, field_value) < 0) {
+ if (!is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ else {
+ /*
+ * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
+ * to shadowed-field sets the ALU flags for VMfailInvalid.
+ */
+ if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ vmcs12 = get_shadow_vmcs12(vcpu);
+
+ }
+
+ if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
return kvm_skip_emulated_instruction(vcpu);
}
- switch (field) {
+ /*
+ * Do not track vmcs12 dirty-state if in guest-mode
+ * as we actually dirty shadow vmcs12 instead of vmcs12.
+ */
+ if (!is_guest_mode(vcpu)) {
+ switch (field) {
#define SHADOW_FIELD_RW(x) case x:
#include "vmx_shadow_fields.h"
- /*
- * The fields that can be updated by L1 without a vmexit are
- * always updated in the vmcs02, the others go down the slow
- * path of prepare_vmcs02.
- */
- break;
- default:
- vmx->nested.dirty_vmcs12 = true;
- break;
+ /*
+ * The fields that can be updated by L1 without a vmexit are
+ * always updated in the vmcs02, the others go down the slow
+ * path of prepare_vmcs02.
+ */
+ break;
+ default:
+ vmx->nested.dirty_vmcs12 = true;
+ break;
+ }
}
nested_vmx_succeed(vcpu);
@@ -8623,7 +8866,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
new_vmcs12 = kmap(page);
- if (new_vmcs12->revision_id != VMCS12_REVISION) {
+ if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+ (new_vmcs12->hdr.shadow_vmcs &&
+ !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
kunmap(page);
kvm_release_page_clean(page);
nested_vmx_failValid(vcpu,
@@ -8821,6 +9066,105 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
+static int handle_invpcid(struct kvm_vcpu *vcpu)
+{
+ u32 vmx_instruction_info;
+ unsigned long type;
+ bool pcid_enabled;
+ gva_t gva;
+ struct x86_exception e;
+ unsigned i;
+ unsigned long roots_to_free = 0;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ if (type > 3) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* According to the Intel instruction reference, the memory operand
+ * is read even if it isn't needed (e.g., for type==all)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+ == operand.pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, roots_to_free);
+ /*
+ * If neither the current cr3 nor any of the prev_roots use the
+ * given PCID, then nothing needs to be done here because a
+ * resync will happen anyway before switching to any other CR3.
+ */
+
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ /* fall-through */
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_mmu_unload(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ BUG(); /* We have already checked above that type <= 3 */
+ }
+}
+
static int handle_pml_full(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
@@ -9024,6 +9368,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmfunc,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
@@ -9196,6 +9541,30 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
return false;
}
+static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12, gpa_t bitmap)
+{
+ u32 vmx_instruction_info;
+ unsigned long field;
+ u8 b;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return true;
+
+ /* Decode instruction info and find the field to access */
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+
+ /* Out-of-range fields always cause a VM exit from L2 to L1 */
+ if (field >> 15)
+ return true;
+
+ if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
+ return true;
+
+ return 1 & (b >> (field & 7));
+}
+
/*
* Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
* should handle it ourselves in L0 (and then continue L2). Only call this
@@ -9280,10 +9649,15 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+ case EXIT_REASON_VMREAD:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmread_bitmap);
+ case EXIT_REASON_VMWRITE:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmwrite_bitmap);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
- case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
- case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
+ case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
/*
@@ -10244,15 +10618,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
}
cr4 = cr4_read_shadow();
- if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
+ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
- vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
}
/* When single-stepping over STI and MOV SS, we must clear the
@@ -10448,9 +10822,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* The sysexit path does not restore ds/es, so we must set them to
* a reasonable value ourselves.
*
- * We can't defer this to vmx_load_host_state() since that function
- * may be executed in interrupt context, which saves and restore segments
- * around it, nullifying its effect.
+ * We can't defer this to vmx_prepare_switch_to_host() since that
+ * function may be executed in interrupt context, which saves and
+ * restore segments around it, nullifying its effect.
*/
loadsegment(ds, __USER_DS);
loadsegment(es, __USER_DS);
@@ -10511,8 +10885,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
return;
cpu = get_cpu();
- vmx->loaded_vmcs = vmcs;
vmx_vcpu_put(vcpu);
+ vmx->loaded_vmcs = vmcs;
vmx_vcpu_load(vcpu, cpu);
put_cpu();
}
@@ -10652,6 +11026,8 @@ free_vcpu:
static int vmx_vm_init(struct kvm *kvm)
{
+ spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -10876,11 +11252,11 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
return 1;
- kvm_mmu_unload(vcpu);
kvm_init_shadow_ept_mmu(vcpu,
to_vmx(vcpu)->nested.msrs.ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
- nested_ept_ad_enabled(vcpu));
+ nested_ept_ad_enabled(vcpu),
+ nested_ept_get_cr3(vcpu));
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -10928,9 +11304,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
-static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct page *page;
u64 hpa;
@@ -11171,6 +11547,38 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
return true;
}
+static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vmcs12 *shadow;
+ struct page *page;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == -1ull)
+ return;
+
+ shadow = get_shadow_vmcs12(vcpu);
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
+
+ memcpy(shadow, kmap(page), VMCS12_SIZE);
+
+ kunmap(page);
+ kvm_release_page_clean(page);
+}
+
+static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == -1ull)
+ return;
+
+ kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE);
+}
+
static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -11228,11 +11636,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
unsigned long count_field,
unsigned long addr_field)
{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int maxphyaddr;
u64 count, addr;
- if (vmcs12_read_any(vcpu, count_field, &count) ||
- vmcs12_read_any(vcpu, addr_field, &addr)) {
+ if (vmcs12_read_any(vmcs12, count_field, &count) ||
+ vmcs12_read_any(vmcs12, addr_field, &addr)) {
WARN_ON(1);
return -EINVAL;
}
@@ -11282,6 +11691,19 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
+ !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
+ return -EINVAL;
+
+ return 0;
+}
+
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
@@ -11431,12 +11853,16 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
return 1;
}
}
-
- vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
}
- kvm_mmu_reset_context(vcpu);
+ if (!nested_ept)
+ kvm_mmu_new_cr3(vcpu, cr3, false);
+
+ vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ kvm_init_mmu(vcpu, false);
+
return 0;
}
@@ -11523,7 +11949,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
* Set host-state according to L0's settings (vmcs12 is irrelevant here)
* Some constant fields are set here by vmx_set_constant_host_state().
* Other fields are different per CPU, and will be set later when
- * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+ * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
+ * is called.
*/
vmx_set_constant_host_state(vmx);
@@ -11595,11 +12022,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
- /*
- * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR,
- * HOST_FS_BASE, HOST_GS_BASE.
- */
-
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
@@ -11664,6 +12086,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control |= vmcs12_exec_ctrl;
}
+ /* VMCS shadowing for L2 is emulated for now */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
vmcs_write16(GUEST_INTR_STATUS,
vmcs12->guest_intr_status);
@@ -11883,6 +12308,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (nested_vmx_check_pml_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
vmx->nested.msrs.procbased_ctls_low,
vmx->nested.msrs.procbased_ctls_high) ||
@@ -11983,6 +12411,33 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
return 0;
}
+static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int r;
+ struct page *page;
+ struct vmcs12 *shadow;
+
+ if (vmcs12->vmcs_link_pointer == -1ull)
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
+ return -EINVAL;
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
+ if (is_error_page(page))
+ return -EINVAL;
+
+ r = 0;
+ shadow = kmap(page);
+ if (shadow->hdr.revision_id != VMCS12_REVISION ||
+ shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
+ r = -EINVAL;
+ kunmap(page);
+ kvm_release_page_clean(page);
+ return r;
+}
+
static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
u32 *exit_qual)
{
@@ -11994,8 +12449,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
return 1;
- if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
- vmcs12->vmcs_link_pointer != -1ull) {
+ if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
return 1;
}
@@ -12042,12 +12496,17 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 0;
}
-static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
+/*
+ * If exit_qual is NULL, this is being called from state restore (either RSM
+ * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
+ */
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u32 exit_qual;
- int r;
+ bool from_vmentry = !!exit_qual;
+ u32 dummy_exit_qual;
+ int r = 0;
enter_guest_mode(vcpu);
@@ -12061,17 +12520,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
r = EXIT_REASON_INVALID_STATE;
- if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
goto fail;
- nested_get_vmcs12_pages(vcpu, vmcs12);
+ if (from_vmentry) {
+ nested_get_vmcs12_pages(vcpu);
- r = EXIT_REASON_MSR_LOAD_FAIL;
- exit_qual = nested_vmx_load_msr(vcpu,
- vmcs12->vm_entry_msr_load_addr,
- vmcs12->vm_entry_msr_load_count);
- if (exit_qual)
- goto fail;
+ r = EXIT_REASON_MSR_LOAD_FAIL;
+ *exit_qual = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (*exit_qual)
+ goto fail;
+ } else {
+ /*
+ * The MMU is not initialized to point at the right entities yet and
+ * "get pages" would need to read data from the guest (i.e. we will
+ * need to perform gpa to hpa translation). Request a call
+ * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
+ * have already been set at vmentry time and should not be reset.
+ */
+ kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ }
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -12086,8 +12556,7 @@ fail:
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
leave_guest_mode(vcpu);
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
- return 1;
+ return r;
}
/*
@@ -12110,6 +12579,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmcs12 = get_vmcs12(vcpu);
+ /*
+ * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
+ * that there *is* a valid VMCS pointer, RFLAGS.CF is set
+ * rather than RFLAGS.ZF, and no error number is stored to the
+ * VM-instruction error field.
+ */
+ if (vmcs12->hdr.shadow_vmcs) {
+ nested_vmx_failInvalid(vcpu);
+ goto out;
+ }
+
if (enable_shadow_vmcs)
copy_shadow_to_vmcs12(vmx);
@@ -12164,16 +12644,29 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
*/
vmx->nested.nested_run_pending = 1;
- ret = enter_vmx_non_root_mode(vcpu);
+ ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
if (ret) {
+ nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
vmx->nested.nested_run_pending = 0;
- return ret;
+ return 1;
}
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
/*
+ * Must happen outside of enter_vmx_non_root_mode() as it will
+ * also be used as part of restoring nVMX state for
+ * snapshot restore (migration).
+ *
+ * In this flow, it is assumed that vmcs12 cache was
+ * trasferred as part of captured nVMX state and should
+ * therefore not be read from guest memory (which may not
+ * exist on destination host yet).
+ */
+ nested_cache_shadow_vmcs12(vcpu, vmcs12);
+
+ /*
* If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
* by event injection, halt vcpu.
*/
@@ -12682,6 +13175,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
+ /*
+ * Must happen outside of sync_vmcs12() as it will
+ * also be used to capture vmcs12 cache as part of
+ * capturing nVMX state for snapshot (migration).
+ *
+ * Otherwise, this flush will dirty guest memory at a
+ * point it is already assumed by user-space to be
+ * immutable.
+ */
+ nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
+
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
@@ -13256,7 +13760,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
if (vmx->nested.smm.guest_mode) {
vcpu->arch.hflags &= ~HF_SMM_MASK;
- ret = enter_vmx_non_root_mode(vcpu);
+ ret = enter_vmx_non_root_mode(vcpu, NULL);
vcpu->arch.hflags |= HF_SMM_MASK;
if (ret)
return ret;
@@ -13271,6 +13775,199 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_vmx *vmx;
+ struct vmcs12 *vmcs12;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = 0,
+ .size = sizeof(kvm_state),
+ .vmx.vmxon_pa = -1ull,
+ .vmx.vmcs_pa = -1ull,
+ };
+
+ if (!vcpu)
+ return kvm_state.size + 2 * VMCS12_SIZE;
+
+ vmx = to_vmx(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+ if (nested_vmx_allowed(vcpu) &&
+ (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
+ kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+ kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
+
+ if (vmx->nested.current_vmptr != -1ull) {
+ kvm_state.size += VMCS12_SIZE;
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull)
+ kvm_state.size += VMCS12_SIZE;
+ }
+
+ if (vmx->nested.smm.vmxon)
+ kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
+
+ if (vmx->nested.smm.guest_mode)
+ kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
+
+ if (is_guest_mode(vcpu)) {
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (vmx->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+ }
+ }
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (vmx->nested.current_vmptr == -1ull)
+ goto out;
+
+ /*
+ * When running L2, the authoritative vmcs12 state is in the
+ * vmcs02. When running L1, the authoritative vmcs12 state is
+ * in the shadow vmcs linked to vmcs01, unless
+ * sync_shadow_vmcs is set, in which case, the authoritative
+ * vmcs12 state is in the vmcs12 already.
+ */
+ if (is_guest_mode(vcpu))
+ sync_vmcs12(vcpu, vmcs12);
+ else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+
+ if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull) {
+ if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
+ get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
+ return -EFAULT;
+ }
+
+out:
+ return kvm_state.size;
+}
+
+static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 exit_qual;
+ int ret;
+
+ if (kvm_state->format != 0)
+ return -EINVAL;
+
+ if (!nested_vmx_allowed(vcpu))
+ return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
+
+ if (kvm_state->vmx.vmxon_pa == -1ull) {
+ if (kvm_state->vmx.smm.flags)
+ return -EINVAL;
+
+ if (kvm_state->vmx.vmcs_pa != -1ull)
+ return -EINVAL;
+
+ vmx_leave_nested(vcpu);
+ return 0;
+ }
+
+ if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
+ return -EINVAL;
+
+ if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+ return -EINVAL;
+
+ if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+ !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+ return -EINVAL;
+
+ if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (kvm_state->vmx.smm.flags &
+ ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ vmx_leave_nested(vcpu);
+ if (kvm_state->vmx.vmxon_pa == -1ull)
+ return 0;
+
+ vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+
+ if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
+ vmx->nested.smm.vmxon = true;
+ vmx->nested.vmxon = false;
+
+ if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
+ vmx->nested.smm.guest_mode = true;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (vmcs12->hdr.revision_id != VMCS12_REVISION)
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return 0;
+
+ vmx->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull) {
+ struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
+ if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
+ return -EINVAL;
+
+ if (copy_from_user(shadow_vmcs12,
+ user_kvm_nested_state->data + VMCS12_SIZE,
+ sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+ !shadow_vmcs12->hdr.shadow_vmcs)
+ return -EINVAL;
+ }
+
+ if (check_vmentry_prereqs(vcpu, vmcs12) ||
+ check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+ return -EINVAL;
+
+ if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+ vmx->nested.nested_run_pending = 1;
+
+ vmx->nested.dirty_vmcs12 = true;
+ ret = enter_vmx_non_root_mode(vcpu, NULL);
+ if (ret)
+ return -EINVAL;
+
+ return 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -13290,7 +13987,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.vcpu_free = vmx_free_vcpu,
.vcpu_reset = vmx_vcpu_reset,
- .prepare_guest_switch = vmx_save_host_state,
+ .prepare_guest_switch = vmx_prepare_switch_to_guest,
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
@@ -13323,6 +14020,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.set_rflags = vmx_set_rflags,
.tlb_flush = vmx_flush_tlb,
+ .tlb_flush_gva = vmx_flush_tlb_gva,
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
@@ -13405,6 +14103,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.setup_mce = vmx_setup_mce,
+ .get_nested_state = vmx_get_nested_state,
+ .set_nested_state = vmx_set_nested_state,
+ .get_vmcs12_pages = nested_get_vmcs12_pages,
+
.smi_allowed = vmx_smi_allowed,
.pre_enter_smm = vmx_pre_enter_smm,
.pre_leave_smm = vmx_pre_leave_smm,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a5caa5e5480c..f7dff0457846 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -848,16 +848,21 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ bool skip_tlb_flush = false;
#ifdef CONFIG_X86_64
bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
- if (pcid_enabled)
- cr3 &= ~CR3_PCID_INVD;
+ if (pcid_enabled) {
+ skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
+ cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ }
#endif
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ if (!skip_tlb_flush) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
return 0;
}
@@ -868,9 +873,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
+ kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
- kvm_mmu_new_cr3(vcpu);
+
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@ -2185,10 +2191,11 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.mcg_status = data;
break;
case MSR_IA32_MCG_CTL:
- if (!(mcg_cap & MCG_CTL_P))
+ if (!(mcg_cap & MCG_CTL_P) &&
+ (data || !msr_info->host_initiated))
return 1;
if (data != 0 && data != ~(u64)0)
- return -1;
+ return 1;
vcpu->arch.mcg_ctl = data;
break;
default:
@@ -2576,7 +2583,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
-static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
u64 data;
u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -2591,7 +2598,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data = vcpu->arch.mcg_cap;
break;
case MSR_IA32_MCG_CTL:
- if (!(mcg_cap & MCG_CTL_P))
+ if (!(mcg_cap & MCG_CTL_P) && !host)
return 1;
data = vcpu->arch.mcg_ctl;
break;
@@ -2724,7 +2731,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -2745,7 +2753,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
return kvm_hv_get_msr_common(vcpu,
- msr_info->index, &msr_info->data);
+ msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
@@ -2969,6 +2978,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_X2APIC_API:
r = KVM_X2APIC_API_VALID_FLAGS;
break;
+ case KVM_CAP_NESTED_STATE:
+ r = kvm_x86_ops->get_nested_state ?
+ kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
+ break;
default:
break;
}
@@ -3985,6 +3998,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
break;
}
+ case KVM_GET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ u32 user_data_size;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops->get_nested_state)
+ break;
+
+ BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+ if (get_user(user_data_size, &user_kvm_nested_state->size))
+ return -EFAULT;
+
+ r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
+ user_data_size);
+ if (r < 0)
+ return r;
+
+ if (r > user_data_size) {
+ if (put_user(r, &user_kvm_nested_state->size))
+ return -EFAULT;
+ return -E2BIG;
+ }
+ r = 0;
+ break;
+ }
+ case KVM_SET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ struct kvm_nested_state kvm_state;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops->set_nested_state)
+ break;
+
+ if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (kvm_state.size < sizeof(kvm_state))
+ return -EINVAL;
+
+ if (kvm_state.flags &
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ /* nested_run_pending implies guest_mode. */
+ if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+ return -EINVAL;
+
+ r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -6503,8 +6566,12 @@ static void kvm_set_mmio_spte_mask(void)
* Set the reserved bits and the present bit of an paging-structure
* entry to generate page fault with PFER.RSV = 1.
*/
- /* Mask the reserved physical address bits. */
- mask = rsvd_bits(maxphyaddr, 51);
+
+ /*
+ * Mask the uppermost physical address bit, which would be reserved as
+ * long as the supported physical address width is less than 52.
+ */
+ mask = 1ull << 51;
/* Set the present bit. */
mask |= 1ull;
@@ -6769,6 +6836,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_CLOCK_PAIRING:
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
+ case KVM_HC_SEND_IPI:
+ ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+ break;
#endif
default:
ret = -KVM_ENOSYS;
@@ -7287,6 +7357,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
+ kvm_x86_ops->get_vmcs12_pages(vcpu);
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -7302,6 +7374,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
+ kvm_mmu_load_cr3(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_vcpu_flush_tlb(vcpu, true);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -8013,6 +8087,10 @@ EXPORT_SYMBOL_GPL(kvm_task_switch);
static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ (sregs->cr4 & X86_CR4_OSXSAVE))
+ return -EINVAL;
+
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
/*
* When EFER.LME and CR0.PG are set, the processor is in
@@ -8043,10 +8121,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
struct desc_ptr dt;
int ret = -EINVAL;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- (sregs->cr4 & X86_CR4_OSXSAVE))
- goto out;
-
if (kvm_valid_sregs(vcpu, sregs))
goto out;