diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Makefile | 4 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 34 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_cache_regs.h | 51 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 116 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c (renamed from arch/x86/kvm/mmu.c) | 284 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.c (renamed from arch/x86/kvm/page_track.c) | 0 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h (renamed from arch/x86/kvm/paging_tmpl.h) | 29 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.c | 128 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.h | 29 | ||||
-rw-r--r-- | arch/x86/kvm/pmu_amd.c | 24 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 163 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 316 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.h | 22 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 34 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 12 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 406 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 31 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 334 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 15 |
23 files changed, 1484 insertions, 586 deletions
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31ecf7a76d5a..b19ef421084d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -8,9 +8,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o -kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ +kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o debugfs.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9c5029cf6f3f..cfafa320a8cf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -363,7 +363,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; @@ -504,7 +504,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, r = -E2BIG; - if (*nent >= maxnent) + if (WARN_ON(*nent >= maxnent)) goto out; do_host_cpuid(entry, function, 0); @@ -778,6 +778,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, case 0x8000001a: case 0x8000001e: break; + /* Support memory encryption cpuid if host supports it */ + case 0x8000001F: + if (!boot_cpu_has(X86_FEATURE_SEV)) + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ @@ -810,14 +815,15 @@ out: static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, int *nent, int maxnent, unsigned int type) { + if (*nent >= maxnent) + return -E2BIG; + if (type == KVM_GET_EMULATED_CPUID) return __do_cpuid_func_emulated(entry, func, nent, maxnent); return __do_cpuid_func(entry, func, nent, maxnent); } -#undef F - struct kvm_cpuid_param { u32 func; bool (*qualifier)(const struct kvm_cpuid_param *param); @@ -1015,6 +1021,12 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, *ebx = entry->ebx; *ecx = entry->ecx; *edx = entry->edx; + if (function == 7 && index == 0) { + u64 data; + if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + (data & TSX_CTRL_CPUID_CLEAR)) + *ebx &= ~(F(RTM) | F(HLE)); + } } else { *eax = *ebx = *ecx = *edx = 0; /* diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 698efb8c3897..952d1a4f4d7e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2770,11 +2770,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) return emulate_ud(ctxt); ops->get_msr(ctxt, MSR_EFER, &efer); - setup_syscalls_segments(ctxt, &cs, &ss); - if (!(efer & EFER_SCE)) return emulate_ud(ctxt); + setup_syscalls_segments(ctxt, &cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); @@ -2838,12 +2837,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_PROT64) return X86EMUL_UNHANDLEABLE; - setup_syscalls_segments(ctxt, &cs, &ss); - ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); + setup_syscalls_segments(ctxt, &cs, &ss); ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index d859ae8890d0..9fd2dd89a1c5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -271,8 +271,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; - int old_remote_irr, old_delivery_status; union kvm_ioapic_redirect_entry *e; + unsigned long vcpu_bitmap; + int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -296,6 +297,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) /* Preserve read-only fields */ old_remote_irr = e->fields.remote_irr; old_delivery_status = e->fields.delivery_status; + old_dest_id = e->fields.dest_id; + old_dest_mode = e->fields.dest_mode; if (ioapic->ioregsel & 1) { e->bits &= 0xffffffff; e->bits |= (u64) val << 32; @@ -321,7 +324,34 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); - kvm_make_scan_ioapic_request(ioapic->kvm); + if (e->fields.delivery_mode == APIC_DM_FIXED) { + struct kvm_lapic_irq irq; + + irq.shorthand = 0; + irq.vector = e->fields.vector; + irq.delivery_mode = e->fields.delivery_mode << 8; + irq.dest_id = e->fields.dest_id; + irq.dest_mode = e->fields.dest_mode; + bitmap_zero(&vcpu_bitmap, 16); + kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, + &vcpu_bitmap); + if (old_dest_mode != e->fields.dest_mode || + old_dest_id != e->fields.dest_id) { + /* + * Update vcpu_bitmap with vcpus specified in + * the previous request as well. This is done to + * keep ioapic_handled_vectors synchronized. + */ + irq.dest_id = old_dest_id; + irq.dest_mode = old_dest_mode; + kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, + &vcpu_bitmap); + } + kvm_make_scan_ioapic_request_mask(ioapic->kvm, + &vcpu_bitmap); + } else { + kvm_make_scan_ioapic_request(ioapic->kvm); + } break; } } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1cc6c47dc77e..58767020de41 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -37,22 +37,50 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif -static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, - enum kvm_reg reg) +static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) { - if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + +static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) +{ + if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) + return 0; + + if (!kvm_register_is_available(vcpu, reg)) kvm_x86_ops->cache_reg(vcpu, reg); return vcpu->arch.regs[reg]; } -static inline void kvm_register_write(struct kvm_vcpu *vcpu, - enum kvm_reg reg, +static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg, unsigned long val) { + if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) + return; + vcpu->arch.regs[reg] = val; - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + kvm_register_mark_dirty(vcpu, reg); } static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) @@ -79,9 +107,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { might_sleep(); /* on svm */ - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) - kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -109,8 +136,8 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - kvm_x86_ops->decache_cr3(vcpu); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 87b0fcc23ef8..cf9177b4a07f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -111,11 +111,6 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline u8 kvm_xapic_id(struct kvm_lapic *apic) -{ - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) { return apic->vcpu->vcpu_id; @@ -562,60 +557,53 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, irq->trig_mode, dest_map); } +static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map, + struct kvm_lapic_irq *irq, u32 min) +{ + int i, count = 0; + struct kvm_vcpu *vcpu; + + if (min > map->max_apic_id) + return 0; + + for_each_set_bit(i, ipi_bitmap, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, irq, NULL); + } + } + + return count; +} + int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { - int i; struct kvm_apic_map *map; - struct kvm_vcpu *vcpu; struct kvm_lapic_irq irq = {0}; int cluster_size = op_64_bit ? 64 : 32; - int count = 0; + int count; + + if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK)) + return -KVM_EINVAL; irq.vector = icr & APIC_VECTOR_MASK; irq.delivery_mode = icr & APIC_MODE_MASK; irq.level = (icr & APIC_INT_ASSERT) != 0; irq.trig_mode = icr & APIC_INT_LEVELTRIG; - if (icr & APIC_DEST_MASK) - return -KVM_EINVAL; - if (icr & APIC_SHORT_MASK) - return -KVM_EINVAL; - rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (unlikely(!map)) { - count = -EOPNOTSUPP; - goto out; - } - - if (min > map->max_apic_id) - goto out; - /* Bits above cluster_size are masked in the caller. */ - for_each_set_bit(i, &ipi_bitmap_low, - min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { - if (map->phys_map[min + i]) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); - } - } - - min += cluster_size; - - if (min > map->max_apic_id) - goto out; - - for_each_set_bit(i, &ipi_bitmap_high, - min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { - if (map->phys_map[min + i]) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); - } + count = -EOPNOTSUPP; + if (likely(map)) { + count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min); + min += cluster_size; + count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min); } -out: rcu_read_unlock(); return count; } @@ -1129,6 +1117,50 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } +/* + * This routine identifies the destination vcpus mask meant to receive the + * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find + * out the destination vcpus array and set the bitmap or it traverses to + * each available vcpu to identify the same. + */ +void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, + unsigned long *vcpu_bitmap) +{ + struct kvm_lapic **dest_vcpu = NULL; + struct kvm_lapic *src = NULL; + struct kvm_apic_map *map; + struct kvm_vcpu *vcpu; + unsigned long bitmap; + int i, vcpu_idx; + bool ret; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu, + &bitmap); + if (ret) { + for_each_set_bit(i, &bitmap, 16) { + if (!dest_vcpu[i]) + continue; + vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx; + __set_bit(vcpu_idx, vcpu_bitmap); + } + } else { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + if (!kvm_apic_match_dest(vcpu, NULL, + irq->delivery_mode, + irq->dest_id, + irq->dest_mode)) + continue; + __set_bit(i, vcpu_bitmap); + } + } + rcu_read_unlock(); +} + int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) { return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; @@ -2714,7 +2746,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs * and leave the INIT pending. */ - if (is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu)) { + if (kvm_vcpu_latch_init(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2aad7e226fc0..39925afdfcdc 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -226,6 +226,9 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); +void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, + unsigned long *vcpu_bitmap); + bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, @@ -242,4 +245,9 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base) return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); } +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + #endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 11f8ec89433b..d55674f44a18 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); + +int kvm_mmu_post_init_vm(struct kvm *kvm); +void kvm_mmu_pre_destroy_vm(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu/mmu.c index 24c23c66b226..6f92b40d798c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> +#include <linux/kthread.h> #include <asm/page.h> #include <asm/pat.h> @@ -47,6 +48,35 @@ #include <asm/kvm_page_track.h> #include "trace.h" +extern bool itlb_multihit_kvm_mitigation; + +static int __read_mostly nx_huge_pages = -1; +#ifdef CONFIG_PREEMPT_RT +/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ +static uint __read_mostly nx_huge_pages_recovery_ratio = 0; +#else +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; +#endif + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); + +static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, +}; + +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { + .set = set_nx_huge_pages_recovery_ratio, + .get = param_get_uint, +}; + +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, + &nx_huge_pages_recovery_ratio, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -352,6 +382,11 @@ static inline bool spte_ad_need_write_protect(u64 spte) return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; } +static bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + static inline u64 spte_shadow_accessed_mask(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); @@ -1190,6 +1225,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + if (sp->lpage_disallowed) + return; + + ++kvm->stat.nx_lpage_splits; + list_add_tail(&sp->lpage_disallowed_link, + &kvm->arch.lpage_disallowed_mmu_pages); + sp->lpage_disallowed = true; +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -1207,6 +1253,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; + list_del(&sp->lpage_disallowed_link); +} + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -2792,6 +2845,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, kvm_reload_remote_mmus(kvm); } + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + sp->role.invalid = 1; return list_unstable; } @@ -3013,6 +3069,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -3233,9 +3294,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +{ + int level = *levelp; + u64 spte = *it.sptep; + + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + is_nx_huge_page_enabled() && + is_shadow_present_pte(spte) && + !is_large_pte(spte)) { + /* + * A small SPTE exists for this pfn, but FNAME(fetch) + * and __direct_map would like to create a large PTE + * instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of + * the address. + */ + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + *pfnp |= gfn & page_mask; + (*levelp)--; + } +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, int map_writable, int level, kvm_pfn_t pfn, - bool prefault) + bool prefault, bool lpage_disallowed) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3248,6 +3332,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &level); + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) break; @@ -3258,6 +3348,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -3306,7 +3398,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -3550,11 +3642,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level = false; + bool force_pt_level; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + force_pt_level = lpage_disallowed; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { /* @@ -3588,7 +3683,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, + prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4174,6 +4270,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); @@ -4184,8 +4282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); + force_pt_level = + lpage_disallowed || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { if (level > PT_DIRECTORY_LEVEL && @@ -4214,7 +4313,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4295,7 +4395,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_x86_ops->tlb_flush(vcpu, true); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } /* @@ -5914,9 +6014,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -6155,10 +6255,59 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } +static bool get_nx_auto_mode(void) +{ + /* Return true when CPU has the bug, and mitigations are ON */ + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); +} + +static void __set_nx_huge_pages(bool val) +{ + nx_huge_pages = itlb_multihit_kvm_mitigation = val; +} + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) +{ + bool old_val = nx_huge_pages; + bool new_val; + + /* In "auto" mode deploy workaround only if CPU has the bug. */ + if (sysfs_streq(val, "off")) + new_val = 0; + else if (sysfs_streq(val, "force")) + new_val = 1; + else if (sysfs_streq(val, "auto")) + new_val = get_nx_auto_mode(); + else if (strtobool(val, &new_val) < 0) + return -EINVAL; + + __set_nx_huge_pages(new_val); + + if (new_val != old_val) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + mutex_lock(&kvm->slots_lock); + kvm_mmu_zap_all_fast(kvm); + mutex_unlock(&kvm->slots_lock); + + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + } + mutex_unlock(&kvm_lock); + } + + return 0; +} + int kvm_mmu_module_init(void) { int ret = -ENOMEM; + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); + /* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave @@ -6238,3 +6387,116 @@ void kvm_mmu_module_exit(void) unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } + +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +{ + unsigned int old_val; + int err; + + old_val = nx_huge_pages_recovery_ratio; + err = param_set_uint(val, kp); + if (err) + return err; + + if (READ_ONCE(nx_huge_pages) && + !old_val && nx_huge_pages_recovery_ratio) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + + mutex_unlock(&kvm_lock); + } + + return err; +} + +static void kvm_recover_nx_lpages(struct kvm *kvm) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of lpage_disallowed pages is expected to + * be relatively small compared to the total. + */ + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (to_zap) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} + +static long get_nx_lpage_recovery_timeout(u64 start_time) +{ + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) + ? start_time + 60 * HZ - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; +} + +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +{ + u64 start_time; + long remaining_time; + + while (true) { + start_time = get_jiffies_64(); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop() && remaining_time > 0) { + schedule_timeout(remaining_time); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + return 0; + + kvm_recover_nx_lpages(kvm); + } +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + int err; + + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + "kvm-nx-lpage-recovery", + &kvm->arch.nx_lpage_recovery_thread); + if (!err) + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + + return err; +} + +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_lpage_recovery_thread) + kthread_stop(kvm->arch.nx_lpage_recovery_thread); +} diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/mmu/page_track.c index 3521e2d176f2..3521e2d176f2 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7d5cdb3af594..97b21e7fd013 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault, + bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; - gfn_t base_gfn; + gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - base_gfn = gw->gfn; + /* + * FNAME(page_fault) might have clobbered the bottom bits of + * gw->gfn, restore them from the virtual address. + */ + gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); + base_gfn = gfn; trace_kvm_mmu_spte_requested(addr, gw->level, pfn); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; @@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + bool force_pt_level = lpage_disallowed; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!force_pt_level) transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); + level, pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 46875bbd0419..bcc6a73d6628 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -62,8 +62,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { + if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } @@ -76,8 +75,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { + if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); @@ -137,7 +135,37 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); + pmc_to_pmu(pmc)->event_count++; + clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); +} + +static void pmc_pause_counter(struct kvm_pmc *pmc) +{ + u64 counter = pmc->counter; + + if (!pmc->perf_event) + return; + + /* update counter, reset event value to avoid redundant accumulation */ + counter += perf_event_pause(pmc->perf_event, true); + pmc->counter = counter & pmc_bitmask(pmc); +} + +static bool pmc_resume_counter(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event) + return false; + + /* recalibrate sample period and check if it's accepted by perf core */ + if (perf_event_period(pmc->perf_event, + (-pmc->counter) & pmc_bitmask(pmc))) + return false; + + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ + perf_event_enable(pmc->perf_event); + + clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + return true; } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) @@ -154,7 +182,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) pmc->eventsel = eventsel; - pmc_stop_counter(pmc); + pmc_pause_counter(pmc); if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; @@ -193,6 +221,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (type == PERF_TYPE_RAW) config = eventsel & X86_RAW_EVENT_MASK; + if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) + return; + + pmc_release_perf_event(pmc); + + pmc->current_config = eventsel; pmc_reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), @@ -209,7 +243,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) struct kvm_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - pmc_stop_counter(pmc); + pmc_pause_counter(pmc); if (!en_field || !pmc_is_enabled(pmc)) return; @@ -224,6 +258,12 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) return; } + if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) + return; + + pmc_release_perf_event(pmc); + + pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, kvm_x86_ops->pmu_ops->find_fixed_event(idx), !(en_field & 0x2), /* exclude user */ @@ -253,27 +293,32 @@ EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - u64 bitmask; int bit; - bitmask = pmu->reprogram_pmi; - - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + clear_bit(bit, pmu->reprogram_pmi); continue; } reprogram_counter(pmu, bit); } + + /* + * Unused perf_events are only released if the corresponding MSRs + * weren't accessed during the last vCPU time slice. kvm_arch_sched_in + * triggers KVM_REQ_PMU if cleanup is needed. + */ + if (unlikely(pmu->need_cleanup)) + kvm_pmu_cleanup(vcpu); } /* check if idx is a valid index to access PMU */ -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx); + return kvm_x86_ops->pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -323,7 +368,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask); + pmc = kvm_x86_ops->pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); if (!pmc) return 1; @@ -339,7 +384,17 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); + return kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr) || + kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); +} + +static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr); + + if (pmc) + __set_bit(pmc->idx, pmu->pmc_in_use); } int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) @@ -349,6 +404,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info); } @@ -376,9 +432,45 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); kvm_x86_ops->pmu_ops->init(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); + pmu->event_count = 0; + pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); } +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + +/* Release perf_events for vPMCs that have been unused for a full time slice. */ +void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; + DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX); + int i; + + pmu->need_cleanup = false; + + bitmap_andnot(bitmask, pmu->all_valid_pmc_idx, + pmu->pmc_in_use, X86_PMC_IDX_MAX); + + for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { + pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, i); + + if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) + pmc_stop_counter(pmc); + } + + bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); +} + void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); @@ -416,8 +508,8 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) *filter = tmp; mutex_lock(&kvm->lock); - rcu_swap_protected(kvm->arch.pmu_event_filter, filter, - mutex_is_locked(&kvm->lock)); + filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, + mutex_is_locked(&kvm->lock)); mutex_unlock(&kvm->lock); synchronize_srcu_expedited(&kvm->srcu); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 58265f761c3b..7ebb62326c14 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -25,9 +25,10 @@ struct kvm_pmu_ops { unsigned (*find_fixed_event)(int idx); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); - struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx, - u64 *mask); - int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx); + struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask); + struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); + int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -55,12 +56,21 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } -static inline void pmc_stop_counter(struct kvm_pmc *pmc) +static inline void pmc_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { - pmc->counter = pmc_read_counter(pmc); perf_event_release_kernel(pmc->perf_event); pmc->perf_event = NULL; + pmc->current_config = 0; + pmc_to_pmu(pmc)->event_count--; + } +} + +static inline void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + pmc_release_perf_event(pmc); } } @@ -79,6 +89,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc) return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc); } +static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, + u64 data) +{ + return !(pmu->global_ctrl_mask & data); +} + /* returns general purpose PMC with the specified MSR. Note that it can be * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a * paramenter to tell them apart. @@ -110,13 +126,14 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx); +int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index c8388389a3b0..ce0b10fe5e2b 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -174,7 +174,7 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -184,7 +184,8 @@ static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) } /* idx is the ECX register of RDPMC instruction */ -static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask) +static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *counters; @@ -199,13 +200,19 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { + /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ + return false; +} + +static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) +{ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int ret = false; + struct kvm_pmc *pmc; - ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) || - get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); + pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); + pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); - return ret; + return pmc; } static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) @@ -272,6 +279,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; pmu->global_status = 0; + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } static void amd_pmu_init(struct kvm_vcpu *vcpu) @@ -285,6 +293,7 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; + pmu->gp_counters[i].current_config = 0; } } @@ -306,8 +315,9 @@ struct kvm_pmu_ops amd_pmu_ops = { .find_fixed_event = amd_find_fixed_event, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, + .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, - .is_valid_msr_idx = amd_is_valid_msr_idx, + .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx, .is_valid_msr = amd_is_valid_msr, .get_msr = amd_pmu_get_msr, .set_msr = amd_pmu_set_msr, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f8ecb6df5106..122d4ce3b1ab 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -38,6 +38,7 @@ #include <linux/file.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/rwsem.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -418,9 +419,13 @@ enum { #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL +static int sev_flush_asids(void); +static DECLARE_RWSEM(sev_deactivate_lock); +static DEFINE_MUTEX(sev_bitmap_lock); static unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; +static unsigned long *sev_reclaim_asid_bitmap; #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) struct enc_region { @@ -734,8 +739,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; - if (!npt_enabled && !(efer & EFER_LMA)) - efer &= ~EFER_LME; + + if (!npt_enabled) { + /* Shadow paging assumes NX to be available. */ + efer |= EFER_NX; + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + } to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -1229,11 +1240,15 @@ static __init int sev_hardware_setup(void) /* Minimum ASID value that should be used for SEV guest */ min_sev_asid = cpuid_edx(0x8000001F); - /* Initialize SEV ASID bitmap */ + /* Initialize SEV ASID bitmaps */ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) return 1; + sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + if (!sev_reclaim_asid_bitmap) + return 1; + status = kmalloc(sizeof(*status), GFP_KERNEL); if (!status) return 1; @@ -1412,8 +1427,12 @@ static __exit void svm_hardware_unsetup(void) { int cpu; - if (svm_sev_enabled()) + if (svm_sev_enabled()) { bitmap_free(sev_asid_bitmap); + bitmap_free(sev_reclaim_asid_bitmap); + + sev_flush_asids(); + } for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -1723,25 +1742,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } -static void __sev_asid_free(int asid) +static void sev_asid_free(int asid) { struct svm_cpu_data *sd; int cpu, pos; + mutex_lock(&sev_bitmap_lock); + pos = asid - 1; - clear_bit(pos, sev_asid_bitmap); + __set_bit(pos, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { sd = per_cpu(svm_data, cpu); sd->sev_vmcbs[pos] = NULL; } -} - -static void sev_asid_free(struct kvm *kvm) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - __sev_asid_free(sev->asid); + mutex_unlock(&sev_bitmap_lock); } static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) @@ -1758,10 +1774,12 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) /* deactivate handle */ data->handle = handle; + + /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ + down_read(&sev_deactivate_lock); sev_guest_deactivate(data, NULL); + up_read(&sev_deactivate_lock); - wbinvd_on_all_cpus(); - sev_guest_df_flush(NULL); kfree(data); decommission = kzalloc(sizeof(*decommission), GFP_KERNEL); @@ -1910,7 +1928,7 @@ static void sev_vm_destroy(struct kvm *kvm) mutex_unlock(&kvm->lock); sev_unbind_asid(kvm, sev->handle); - sev_asid_free(kvm); + sev_asid_free(sev->asid); } static void avic_vm_destroy(struct kvm *kvm) @@ -2364,7 +2382,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - BUG(); + WARN_ON_ONCE(1); } } @@ -2517,10 +2535,6 @@ static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { } -static void svm_decache_cr3(struct kvm_vcpu *vcpu) -{ -} - static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } @@ -4591,6 +4605,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + u32 id = kvm_xapic_id(vcpu->arch.apic); if (ldr == svm->ldr_reg) return 0; @@ -4598,7 +4613,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) avic_invalidate_logical_id_entry(vcpu); if (ldr) - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + ret = avic_ldr_write(vcpu, id, ldr); if (!ret) svm->ldr_reg = ldr; @@ -4610,8 +4625,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) { u64 *old, *new; struct vcpu_svm *svm = to_svm(vcpu); - u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); - u32 id = (apic_id_reg >> 24) & 0xff; + u32 id = kvm_xapic_id(vcpu->arch.apic); if (vcpu->vcpu_id == id) return 0; @@ -4991,6 +5005,18 @@ static int handle_exit(struct kvm_vcpu *vcpu) return 0; } +#ifdef CONFIG_RETPOLINE + if (exit_code == SVM_EXIT_MSR) + return msr_interception(svm); + else if (exit_code == SVM_EXIT_VINTR) + return interrupt_window_interception(svm); + else if (exit_code == SVM_EXIT_INTR) + return intr_interception(svm); + else if (exit_code == SVM_EXIT_HLT) + return halt_interception(svm); + else if (exit_code == SVM_EXIT_NPF) + return npf_interception(svm); +#endif return svm_exit_handlers[exit_code](svm); } @@ -5086,8 +5112,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu) || - kvm_vcpu_apicv_active(vcpu)) + if (svm_nested_virtualize_tpr(vcpu)) return; clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); @@ -5104,9 +5129,9 @@ static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) return; } -static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) +static bool svm_get_enable_apicv(struct kvm *kvm) { - return avic && irqchip_split(vcpu->kvm); + return avic && irqchip_split(kvm); } static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) @@ -5628,7 +5653,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; clgi(); - kvm_load_guest_xcr0(vcpu); + kvm_load_guest_xsave_state(vcpu); if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.timer_advance_ns) @@ -5778,7 +5803,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); stgi(); /* Any pending NMI will happen here */ @@ -5887,6 +5912,9 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVES); + /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); @@ -5930,13 +5958,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) if (npt_enabled) entry->edx |= F(NPT); - break; - case 0x8000001F: - /* Support memory encryption cpuid if host supports it */ - if (boot_cpu_has(X86_FEATURE_SEV)) - cpuid(0x8000001f, &entry->eax, &entry->ebx, - &entry->ecx, &entry->edx); - } } @@ -5962,7 +5983,7 @@ static bool svm_mpx_supported(void) static bool svm_xsaves_supported(void) { - return false; + return boot_cpu_has(X86_FEATURE_XSAVES); } static bool svm_umip_emulated(void) @@ -6264,18 +6285,73 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int sev_flush_asids(void) +{ + int ret, error; + + /* + * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, + * so it must be guarded. + */ + down_write(&sev_deactivate_lock); + + wbinvd_on_all_cpus(); + ret = sev_guest_df_flush(&error); + + up_write(&sev_deactivate_lock); + + if (ret) + pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + + return ret; +} + +/* Must be called with the sev_bitmap_lock held */ +static bool __sev_recycle_asids(void) +{ + int pos; + + /* Check if there are any ASIDs to reclaim before performing a flush */ + pos = find_next_bit(sev_reclaim_asid_bitmap, + max_sev_asid, min_sev_asid - 1); + if (pos >= max_sev_asid) + return false; + + if (sev_flush_asids()) + return false; + + bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, + max_sev_asid); + bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); + + return true; +} + static int sev_asid_new(void) { + bool retry = true; int pos; + mutex_lock(&sev_bitmap_lock); + /* * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. */ +again: pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) + if (pos >= max_sev_asid) { + if (retry && __sev_recycle_asids()) { + retry = false; + goto again; + } + mutex_unlock(&sev_bitmap_lock); return -EBUSY; + } + + __set_bit(pos, sev_asid_bitmap); + + mutex_unlock(&sev_bitmap_lock); - set_bit(pos, sev_asid_bitmap); return pos + 1; } @@ -6303,7 +6379,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) return 0; e_free: - __sev_asid_free(asid); + sev_asid_free(asid); return ret; } @@ -6313,12 +6389,6 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) int asid = sev_get_asid(kvm); int ret; - wbinvd_on_all_cpus(); - - ret = sev_guest_df_flush(error); - if (ret) - return ret; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -7208,7 +7278,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, - .decache_cr3 = svm_decache_cr3, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e76eb4f07f6c..4aea7d304beb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -10,6 +10,7 @@ #include "hyperv.h" #include "mmu.h" #include "nested.h" +#include "pmu.h" #include "trace.h" #include "x86.h" @@ -27,6 +28,16 @@ module_param(nested_early_check, bool, S_IRUGO); failed; \ }) +#define SET_MSR_OR_WARN(vcpu, idx, data) \ +({ \ + bool failed = kvm_set_msr(vcpu, idx, data); \ + if (failed) \ + pr_warn_ratelimited( \ + "%s cannot write MSR (0x%x, 0x%llx)\n", \ + __func__, idx, data); \ + failed; \ +}) + /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. @@ -257,7 +268,7 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_shadow_vmcs12 = NULL; /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); @@ -929,6 +940,57 @@ fail: return i + 1; } +static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, + u32 msr_index, + u64 *data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * If the L0 hypervisor stored a more accurate value for the TSC that + * does not include the time taken for emulation of the L2->L1 + * VM-exit in L0, use the more accurate value. + */ + if (msr_index == MSR_IA32_TSC) { + int index = vmx_find_msr_index(&vmx->msr_autostore.guest, + MSR_IA32_TSC); + + if (index >= 0) { + u64 val = vmx->msr_autostore.guest.val[index].value; + + *data = kvm_read_l1_tsc(vcpu, val); + return true; + } + } + + if (kvm_get_msr(vcpu, msr_index, data)) { + pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, + msr_index); + return false; + } + return true; +} + +static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, + struct vmx_msr_entry *e) +{ + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(*e), + e, 2 * sizeof(u32))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(*e)); + return false; + } + if (nested_vmx_store_msr_check(vcpu, e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e->index, e->reserved); + return false; + } + return true; +} + static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u64 data; @@ -940,26 +1002,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (unlikely(i >= max_msr_list_size)) return -EINVAL; - if (kvm_vcpu_read_guest(vcpu, - gpa + i * sizeof(e), - &e, 2 * sizeof(u32))) { - pr_debug_ratelimited( - "%s cannot read MSR entry (%u, 0x%08llx)\n", - __func__, i, gpa + i * sizeof(e)); + if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) return -EINVAL; - } - if (nested_vmx_store_msr_check(vcpu, &e)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, i, e.index, e.reserved); - return -EINVAL; - } - if (kvm_get_msr(vcpu, e.index, &data)) { - pr_debug_ratelimited( - "%s cannot read MSR (%u, 0x%x)\n", - __func__, i, e.index); + + if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) return -EINVAL; - } + if (kvm_vcpu_write_guest(vcpu, gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), @@ -973,6 +1021,60 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) return 0; } +static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 count = vmcs12->vm_exit_msr_store_count; + u64 gpa = vmcs12->vm_exit_msr_store_addr; + struct vmx_msr_entry e; + u32 i; + + for (i = 0; i < count; i++) { + if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) + return false; + + if (e.index == msr_index) + return true; + } + return false; +} + +static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, + u32 msr_index) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msrs *autostore = &vmx->msr_autostore.guest; + bool in_vmcs12_store_list; + int msr_autostore_index; + bool in_autostore_list; + int last; + + msr_autostore_index = vmx_find_msr_index(autostore, msr_index); + in_autostore_list = msr_autostore_index >= 0; + in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); + + if (in_vmcs12_store_list && !in_autostore_list) { + if (autostore->nr == NR_LOADSTORE_MSRS) { + /* + * Emulated VMEntry does not fail here. Instead a less + * accurate value will be returned by + * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() + * instead of reading the value from the vmcs02 VMExit + * MSR-store area. + */ + pr_warn_ratelimited( + "Not enough msr entries in msr_autostore. Can't add msr %x\n", + msr_index); + return; + } + last = autostore->nr++; + autostore->val[last].index = msr_index; + } else if (!in_vmcs12_store_list && in_autostore_list) { + last = --autostore->nr; + autostore->val[msr_autostore_index] = autostore->val[last]; + } +} + static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) { unsigned long invalid_mask; @@ -1012,7 +1114,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne kvm_mmu_new_cr3(vcpu, cr3, false); vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_init_mmu(vcpu, false); @@ -1024,7 +1126,9 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * populated by L2 differently than TLB entries populated * by L1. * - * If L1 uses EPT, then TLB entries are tagged with different EPTP. + * If L0 uses EPT, L1 and L2 run with different EPTP because + * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries + * are tagged with different EPTP. * * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged * with different VPID (L1 entries are tagged with vmx->vpid @@ -1034,7 +1138,7 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - return nested_cpu_has_ept(vmcs12) || + return enable_ept || (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); } @@ -2018,7 +2122,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode. */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); @@ -2073,6 +2177,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + vmx->nested.l1_tpr_threshold = -1; if (exec_control & CPU_BASED_TPR_SHADOW) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); #ifdef CONFIG_X86_64 @@ -2285,6 +2390,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); } + /* + * Make sure the msr_autostore list is up to date before we set the + * count in the vmcs02. + */ + prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); + + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); @@ -2381,9 +2493,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_ept(vmcs12)) nested_ept_init_mmu_context(vcpu); - else if (nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - vmx_flush_tlb(vcpu, true); /* * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those @@ -2418,6 +2527,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, entry_failure_code)) return -EINVAL; + /* + * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 + * on nested VM-Exit, which can occur without actually running L2 and + * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with + * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the + * transition to HLT instead of running L2. + */ + if (enable_ept) + vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); + /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { @@ -2430,6 +2549,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl)) + return -EINVAL; + kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); return 0; @@ -2664,6 +2788,11 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), + vmcs12->host_ia32_perf_global_ctrl))) + return -EINVAL; + #ifdef CONFIG_X86_64 ia32e = !!(vcpu->arch.efer & EFER_LMA); #else @@ -2779,6 +2908,11 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return -EINVAL; } + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), + vmcs12->guest_ia32_perf_global_ctrl))) + return -EINVAL; + /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: @@ -2917,7 +3051,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2933,23 +3067,22 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * to it so we can release it later. */ if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ if (!is_error_page(page)) { vmx->nested.apic_access_page = page; hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; } } @@ -2994,6 +3127,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + return true; } /* @@ -3032,13 +3166,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* * If from_vmentry is false, this is being called from state restore (either RSM * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. -+ * -+ * Returns: -+ * 0 - success, i.e. proceed with actual VMEnter -+ * 1 - consistency check VMExit -+ * -1 - consistency check VMFail + * + * Returns: + * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode + * NVMX_ENTRY_VMFAIL: Consistency check VMFail + * NVMX_ENTRY_VMEXIT: Consistency check VMExit + * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error */ -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -3081,11 +3217,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - nested_get_vmcs12_pages(vcpu); + if (unlikely(!nested_get_vmcs12_pages(vcpu))) + return NVMX_VMENTRY_KVM_INTERNAL_ERROR; if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return -1; + return NVMX_VMENTRY_VMFAIL; } if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) @@ -3149,7 +3286,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()). */ - return 0; + return NVMX_VMENTRY_SUCCESS; /* * A failed consistency check that leads to a VMExit during L1's @@ -3165,14 +3302,14 @@ vmentry_fail_vmexit: vmx_switch_vmcs(vcpu, &vmx->vmcs01); if (!from_vmentry) - return 1; + return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; - return 1; + return NVMX_VMENTRY_VMEXIT; } /* @@ -3182,9 +3319,9 @@ vmentry_fail_vmexit: static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; + enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); - int ret; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -3244,13 +3381,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ vmx->nested.nested_run_pending = 1; - ret = nested_vmx_enter_non_root_mode(vcpu, true); - vmx->nested.nested_run_pending = !ret; - if (ret > 0) - return 1; - else if (ret) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_CONTROL_FIELD); + status = nested_vmx_enter_non_root_mode(vcpu, true); + if (unlikely(status != NVMX_VMENTRY_SUCCESS)) + goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -3281,6 +3414,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return kvm_vcpu_halt(vcpu); } return 1; + +vmentry_failed: + vmx->nested.nested_run_pending = 0; + if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) + return 0; + if (status == NVMX_VMENTRY_VMEXIT) + return 1; + WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* @@ -3453,6 +3595,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; + clear_bit(KVM_APIC_INIT, &apic->pending_events); nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); return 0; } @@ -3856,8 +3999,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); + SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ @@ -3976,7 +4119,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) nested_ept_uninit_mmu_context(vcpu); vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -4104,6 +4247,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (vmx->nested.l1_tpr_threshold != -1) + vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -4111,15 +4256,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); - } else if (!nested_cpu_has_ept(vmcs12) && - nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb(vcpu, true); } /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); @@ -4319,6 +4460,27 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } +void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx; + + if (!nested_vmx_allowed(vcpu)) + return; + + vmx = to_vmx(vcpu); + if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + vmx->nested.msrs.entry_ctls_high |= + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + vmx->nested.msrs.exit_ctls_high |= + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + } else { + vmx->nested.msrs.entry_ctls_high &= + ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + vmx->nested.msrs.exit_ctls_high &= + ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + } +} + static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) { gva_t gva; @@ -5758,7 +5920,7 @@ error_guest_mode: return ret; } -void nested_vmx_vcpu_setup(void) +void nested_vmx_set_vmcs_shadowing_bitmap(void) { if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); @@ -6039,23 +6201,23 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) init_vmcs_shadow_fields(); } - exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear, - exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch, - exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld, - exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst, - exit_handlers[EXIT_REASON_VMREAD] = handle_vmread, - exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume, - exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite, - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff, - exit_handlers[EXIT_REASON_VMON] = handle_vmon, - exit_handlers[EXIT_REASON_INVEPT] = handle_invept, - exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid, - exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc, + exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; + exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; + exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; + exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; + exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; + exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; + exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmon; + exit_handlers[EXIT_REASON_INVEPT] = handle_invept; + exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; + exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; kvm_x86_ops->check_nested_events = vmx_check_nested_events; kvm_x86_ops->get_nested_state = vmx_get_nested_state; kvm_x86_ops->set_nested_state = vmx_set_nested_state; - kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages, + kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 187d39bf0bf1..fc874d4ead0f 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -6,14 +6,25 @@ #include "vmcs12.h" #include "vmx.h" +/* + * Status returned by nested_vmx_enter_non_root_mode(): + */ +enum nvmx_vmentry_status { + NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */ + NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */ + NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */ + NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */ +}; + void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, bool apicv); void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); -void nested_vmx_vcpu_setup(void); +void nested_vmx_set_vmcs_shadowing_bitmap(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); @@ -22,6 +33,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); +void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { @@ -245,7 +257,7 @@ static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) return ((val & fixed1) | fixed0) == val; } -static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; @@ -259,7 +271,7 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) return fixed_bits_valid(val, fixed0, fixed1); } -static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; @@ -267,7 +279,7 @@ static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) return fixed_bits_valid(val, fixed0, fixed1); } -static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 3e9c059099e9..7023138b1cb0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -15,6 +15,7 @@ #include "x86.h" #include "cpuid.h" #include "lapic.h" +#include "nested.h" #include "pmu.h" static struct kvm_event_hw_type_mapping intel_arch_events[] = { @@ -46,6 +47,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) if (old_ctrl == new_ctrl) continue; + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); reprogram_fixed_counter(pmc, new_ctrl, i); } @@ -111,7 +113,7 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); @@ -122,8 +124,8 @@ static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) (fixed && idx >= pmu->nr_arch_fixed_counters); } -static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, - unsigned idx, u64 *mask) +static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); @@ -162,6 +164,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) return ret; } +static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + pmc = get_fixed_pmc(pmu, msr); + pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0); + pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0); + + return pmc; +} + static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -223,7 +237,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_CTRL: if (pmu->global_ctrl == data) return 0; - if (!(data & pmu->global_ctrl_mask)) { + if (kvm_valid_perf_global_ctrl(pmu, data)) { global_ctrl_changed(pmu, data); return 0; } @@ -317,6 +331,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; + + bitmap_set(pmu->all_valid_pmc_idx, + 0, pmu->nr_arch_gp_counters); + bitmap_set(pmu->all_valid_pmc_idx, + INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); + + nested_vmx_pmu_entry_exit_ctls_update(vcpu); } static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -328,12 +349,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; + pmu->gp_counters[i].current_config = 0; } for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + pmu->fixed_counters[i].current_config = 0; } } @@ -366,8 +389,9 @@ struct kvm_pmu_ops intel_pmu_ops = { .find_fixed_event = intel_find_fixed_event, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, + .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, - .is_valid_msr_idx = intel_is_valid_msr_idx, + .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx, .is_valid_msr = intel_is_valid_msr, .get_msr = intel_pmu_get_msr, .set_msr = intel_pmu_set_msr, diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 751a384c2eb0..81ada2ce99e7 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -43,7 +43,7 @@ * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump * to vmx_vmexit. */ -ENTRY(vmx_vmenter) +SYM_FUNC_START(vmx_vmenter) /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ je 2f @@ -65,7 +65,7 @@ ENTRY(vmx_vmenter) _ASM_EXTABLE(1b, 5b) _ASM_EXTABLE(2b, 5b) -ENDPROC(vmx_vmenter) +SYM_FUNC_END(vmx_vmenter) /** * vmx_vmexit - Handle a VMX VM-Exit @@ -77,7 +77,7 @@ ENDPROC(vmx_vmenter) * here after hardware loads the host's state, i.e. this is the destination * referred to by VMCS.HOST_RIP. */ -ENTRY(vmx_vmexit) +SYM_FUNC_START(vmx_vmexit) #ifdef CONFIG_RETPOLINE ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE /* Preserve guest's RAX, it's used to stuff the RSB. */ @@ -90,7 +90,7 @@ ENTRY(vmx_vmexit) .Lvmexit_skip_rsb: #endif ret -ENDPROC(vmx_vmexit) +SYM_FUNC_END(vmx_vmexit) /** * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode @@ -101,7 +101,7 @@ ENDPROC(vmx_vmexit) * Returns: * 0 on VM-Exit, 1 on VM-Fail */ -ENTRY(__vmx_vcpu_run) +SYM_FUNC_START(__vmx_vcpu_run) push %_ASM_BP mov %_ASM_SP, %_ASM_BP #ifdef CONFIG_X86_64 @@ -233,4 +233,4 @@ ENTRY(__vmx_vcpu_run) /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ 2: mov $1, %eax jmp 1b -ENDPROC(__vmx_vcpu_run) +SYM_FUNC_END(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e7970a2e8eae..e3394c839dea 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -106,8 +106,6 @@ module_param(enable_apicv, bool, S_IRUGO); static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); -static u64 __read_mostly host_xss; - bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); @@ -450,6 +448,7 @@ const u32 vmx_msr_index[] = { MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, + MSR_IA32_TSX_CTRL, }; #if IS_ENABLED(CONFIG_HYPERV) @@ -638,6 +637,23 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) return NULL; } +static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data) +{ + int ret = 0; + + u64 old_msr_data = msr->data; + msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + preempt_disable(); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); + preempt_enable(); + if (ret) + msr->data = old_msr_data; + } + return ret; +} + void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) { vmcs_clear(loaded_vmcs->vmcs); @@ -726,8 +742,8 @@ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, bool ret; u32 mask = 1 << (seg * SEG_FIELD_NR + field); - if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { - vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); vmx->segment_cache.bitmask = 0; } ret = vmx->segment_cache.bitmask & mask; @@ -835,7 +851,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } -static int find_msr(struct vmx_msrs *m, unsigned int msr) +int vmx_find_msr_index(struct vmx_msrs *m, u32 msr) { unsigned int i; @@ -869,7 +885,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = find_msr(&m->guest, msr); + i = vmx_find_msr_index(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; @@ -877,7 +893,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: - i = find_msr(&m->host, msr); + i = vmx_find_msr_index(&m->host, msr); if (i < 0) return; @@ -936,12 +952,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - i = find_msr(&m->guest, msr); + i = vmx_find_msr_index(&m->guest, msr); if (!entry_only) - j = find_msr(&m->host, msr); + j = vmx_find_msr_index(&m->host, msr); - if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) || - (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) { + if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) || + (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; @@ -969,17 +985,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } + /* Shadow paging assumes NX to be available. */ + if (!enable_ept) + guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. @@ -1276,6 +1284,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; @@ -1291,6 +1311,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); +after_clear_sn: + /* * Clear SN before reading the bitmap. The VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 in the @@ -1299,7 +1321,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) */ smp_mb__after_atomic(); - if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + if (!pi_is_pir_empty(pi_desc)) pi_set_on(pi_desc); } @@ -1346,14 +1368,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - /* - * VM exits change the host TR limit to 0x67 after a VM - * exit. This is okay, since 0x67 covers everything except - * the IO bitmap and have have code to handle the IO bitmap - * being lost after a VM exit. - */ - BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ @@ -1412,35 +1426,44 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long rflags, save_rflags; - if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { + if (vmx->rmode.vm86_active) { rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; + save_rflags = vmx->rmode.save_rflags; rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; } - to_vmx(vcpu)->rflags = rflags; + vmx->rflags = rflags; } - return to_vmx(vcpu)->rflags; + return vmx->rflags; } void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - unsigned long old_rflags = vmx_get_rflags(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long old_rflags; - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->rflags = rflags; - if (to_vmx(vcpu)->rmode.vm86_active) { - to_vmx(vcpu)->rmode.save_rflags = rflags; + if (enable_unrestricted_guest) { + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + vmx->rflags = rflags; + vmcs_writel(GUEST_RFLAGS, rflags); + return; + } + + old_rflags = vmx_get_rflags(vcpu); + vmx->rflags = rflags; + if (vmx->rmode.vm86_active) { + vmx->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); - if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); + if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) + vmx->emulation_required = emulation_required(vcpu); } u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) @@ -1677,6 +1700,9 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_TSC_AUX); if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; vmx->guest_msrs_ready = false; @@ -1776,6 +1802,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_TSX_CTRL: + if (!msr_info->host_initiated && + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) + return 1; + goto find_shared_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; @@ -1820,14 +1851,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) - return 1; - msr_info->data = vcpu->arch.ia32_xss; - break; case MSR_IA32_RTIT_CTL: if (pt_mode != PT_MODE_HOST_GUEST) return 1; @@ -1878,8 +1901,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; - /* Else, falls through */ + goto find_shared_msr; default: + find_shared_msr: msr = find_msr_entry(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; @@ -1995,6 +2019,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; + case MSR_IA32_TSX_CTRL: + if (!msr_info->host_initiated && + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) + return 1; + if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) + return 1; + goto find_shared_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2063,25 +2094,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) - return 1; - /* - * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. - */ - if (data != 0) - return 1; - vcpu->arch.ia32_xss = data; - if (vcpu->arch.ia32_xss != host_xss) - add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_XSS); - break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || vmx_rtit_ctl_check(vcpu, data) || @@ -2146,23 +2158,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; - /* Else, falls through */ + goto find_shared_msr; + default: + find_shared_msr: msr = find_msr_entry(vmx, msr_index); - if (msr) { - u64 old_msr_data = msr->data; - msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { - preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); - preempt_enable(); - if (ret) - msr->data = old_msr_data; - } - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); + if (msr) + ret = vmx_set_guest_msr(vmx, msr, data); + else + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -2170,7 +2174,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, reg); + switch (reg) { case VCPU_REGS_RSP: vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); @@ -2182,7 +2187,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) if (enable_ept) ept_save_pdptrs(vcpu); break; + case VCPU_EXREG_CR3: + if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + break; default: + WARN_ON_ONCE(1); break; } } @@ -2853,13 +2863,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; } -static void vmx_decache_cr3(struct kvm_vcpu *vcpu) -{ - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -} - static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; @@ -2872,8 +2875,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty)) + if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) return; if (is_pae_paging(vcpu)) { @@ -2895,10 +2897,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -2907,8 +2906,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - vmx_decache_cr3(vcpu); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | @@ -2989,6 +2988,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { struct kvm *kvm = vcpu->kvm; + bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; @@ -3005,15 +3005,20 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); } - if (enable_unrestricted_guest || is_paging(vcpu) || - is_guest_mode(vcpu)) - guest_cr3 = kvm_read_cr3(vcpu); - else + /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */ + if (is_guest_mode(vcpu)) + update_guest_cr3 = false; + else if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; + else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + guest_cr3 = vcpu->arch.cr3; + else /* vmcs01.GUEST_CR3 is already up-to-date. */ + update_guest_cr3 = false; ept_load_pdptrs(vcpu); } - vmcs_writel(GUEST_CR3, guest_cr3); + if (update_guest_cr3) + vmcs_writel(GUEST_CR3, guest_cr3); } int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -3747,7 +3752,7 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) } } -static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) +static bool vmx_get_enable_apicv(struct kvm *kvm) { return enable_apicv; } @@ -4040,6 +4045,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + vcpu->arch.xsaves_enabled = xsaves_enabled; + if (!xsaves_enabled) exec_control &= ~SECONDARY_EXEC_XSAVES; @@ -4152,14 +4159,13 @@ static void ept_set_mmio_spte_mask(void) #define VMX_XSS_EXIT_BITMAP 0 /* - * Sets up the vmcs for emulated real mode. + * Noting that the initialization of Guest-state Area of VMCS is in + * vmx_vcpu_reset(). */ -static void vmx_vcpu_setup(struct vcpu_vmx *vmx) +static void init_vmcs(struct vcpu_vmx *vmx) { - int i; - if (nested) - nested_vmx_vcpu_setup(); + nested_vmx_set_vmcs_shadowing_bitmap(); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); @@ -4168,7 +4174,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - vmx->hv_deadline_tsc = -1; exec_controls_set(vmx, vmx_exec_control(vmx)); @@ -4217,21 +4222,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; - } - vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ @@ -4242,6 +4232,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) set_cr4_guest_host_mask(vmx); + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + if (vmx_xsaves_supported()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); @@ -4344,9 +4337,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx->vpid != 0) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx->vcpu.arch.cr0 = cr0; vmx_set_cr0(vcpu, cr0); /* enter rmode */ @@ -4701,7 +4691,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return 0; } -static int handle_external_interrupt(struct kvm_vcpu *vcpu) +static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; @@ -4973,21 +4963,6 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) vmcs_writel(GUEST_DR7, val); } -static int handle_cpuid(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_cpuid(vcpu); -} - -static int handle_rdmsr(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_rdmsr(vcpu); -} - -static int handle_wrmsr(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wrmsr(vcpu); -} - static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { kvm_apic_update_ppr(vcpu); @@ -5004,11 +4979,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) return 1; } -static int handle_halt(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_halt(vcpu); -} - static int handle_vmcall(struct kvm_vcpu *vcpu) { return kvm_emulate_hypercall(vcpu); @@ -5543,14 +5513,6 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } -static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x", - vmcs_read32(VM_EXIT_REASON)); - return 1; -} - /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5564,11 +5526,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, - [EXIT_REASON_CPUID] = handle_cpuid, - [EXIT_REASON_MSR_READ] = handle_rdmsr, - [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_CPUID] = kvm_emulate_cpuid, + [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, + [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, - [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = handle_rdpmc, @@ -5602,15 +5564,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_unexpected_vmexit, - [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, - [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit, - [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit, }; static const int kvm_vmx_max_exit_handlers = @@ -5945,9 +5903,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) + && kvm_vmx_exit_handlers[exit_reason]) { +#ifdef CONFIG_RETPOLINE + if (exit_reason == EXIT_REASON_MSR_WRITE) + return kvm_emulate_wrmsr(vcpu); + else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + return handle_preemption_timer(vcpu); + else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT) + return handle_interrupt_window(vcpu); + else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + return handle_external_interrupt(vcpu); + else if (exit_reason == EXIT_REASON_HLT) + return kvm_emulate_halt(vcpu); + else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + return handle_ept_misconfig(vcpu); +#endif return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { + } else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); dump_vmcs(); @@ -6033,17 +6005,17 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu) static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + int tpr_threshold; if (is_guest_mode(vcpu) && nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return; - if (irr == -1 || tpr < irr) { - vmcs_write32(TPR_THRESHOLD, 0); - return; - } - - vmcs_write32(TPR_THRESHOLD, irr); + tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; + if (is_guest_mode(vcpu)) + to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; + else + vmcs_write32(TPR_THRESHOLD, tpr_threshold); } void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) @@ -6157,7 +6129,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); @@ -6187,7 +6159,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) { - return pi_test_on(vcpu_to_pi_desc(vcpu)); + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -6517,9 +6492,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); @@ -6542,7 +6517,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - kvm_load_guest_xcr0(vcpu); + kvm_load_guest_xsave_state(vcpu); if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && @@ -6649,7 +6624,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vmx->host_pkru); } - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -6691,7 +6666,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); - kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); @@ -6703,7 +6677,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) int err; struct vcpu_vmx *vmx; unsigned long *msr_bitmap; - int cpu; + int i, cpu; BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, "struct kvm_vcpu must be at offset 0 for arch usercopy region"); @@ -6748,16 +6722,39 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit_vcpu; } - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) - > PAGE_SIZE); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS); - if (!vmx->guest_msrs) - goto free_pml; + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { + u32 index = vmx_msr_index[i]; + u32 data_low, data_high; + int j = vmx->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; + + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + switch (index) { + case MSR_IA32_TSX_CTRL: + /* + * No need to pass TSX_CTRL_CPUID_CLEAR through, so + * let's avoid changing CPUID bits under the host + * kernel's feet. + */ + vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + break; + default: + vmx->guest_msrs[j].mask = -1ull; + break; + } + ++vmx->nmsrs; + } err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) - goto free_msrs; + goto free_pml; msr_bitmap = vmx->vmcs01.msr_bitmap; vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); @@ -6779,7 +6776,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; - vmx_vcpu_setup(vmx); + init_vmcs(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { @@ -6819,8 +6816,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); -free_msrs: - kfree(vmx->guest_msrs); free_pml: vmx_destroy_pml_buffer(vmx); uninit_vcpu: @@ -6999,6 +6994,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); + cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57)); #undef cr4_fixed1_update } @@ -7093,6 +7089,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ + vcpu->arch.xsaves_enabled = false; + if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); vmcs_set_secondary_exec_control(vmx); @@ -7100,10 +7099,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= - ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + ~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX | + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX); if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); @@ -7113,6 +7114,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); + + if (boot_cpu_has(X86_FEATURE_RTM)) { + struct shared_msr_entry *msr; + msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL); + if (msr) { + bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); + vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); + } + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -7601,9 +7611,6 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7784,7 +7791,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr3 = vmx_decache_cr3, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bee16687dc0b..a4f7f737c5d4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -22,11 +22,17 @@ extern u32 get_umwait_control_msr(void); #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) -#define NR_AUTOLOAD_MSRS 8 +#ifdef CONFIG_X86_64 +#define NR_SHARED_MSRS 7 +#else +#define NR_SHARED_MSRS 4 +#endif + +#define NR_LOADSTORE_MSRS 8 struct vmx_msrs { unsigned int nr; - struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry val[NR_LOADSTORE_MSRS]; }; struct shared_msr_entry { @@ -167,6 +173,9 @@ struct nested_vmx { u64 vmcs01_debugctl; u64 vmcs01_guest_bndcfgs; + /* to migrate it to L1 if L2 writes to L1's CR8 directly */ + int l1_tpr_threshold; + u16 vpid02; u16 last_vpid; @@ -203,7 +212,7 @@ struct vcpu_vmx { u32 idt_vectoring_info; ulong rflags; - struct shared_msr_entry *guest_msrs; + struct shared_msr_entry guest_msrs[NR_SHARED_MSRS]; int nmsrs; int save_nmsrs; bool guest_msrs_ready; @@ -230,6 +239,10 @@ struct vcpu_vmx { struct vmx_msrs host; } msr_autoload; + struct msr_autostore { + struct vmx_msrs guest; + } msr_autostore; + struct { int vm86_active; ulong save_rflags; @@ -334,6 +347,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); +int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 @@ -355,6 +369,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + static inline void pi_set_sn(struct pi_desc *pi_desc) { set_bit(POSTED_INTR_SN, @@ -373,6 +392,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 661e2bf38526..cf917139de6b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,6 +68,7 @@ #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/intel_pt.h> +#include <asm/emulate_prefix.h> #include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS @@ -176,6 +177,8 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +static u64 __read_mostly host_xss; + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -213,6 +216,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages, .mode = 0444) }, + { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -259,23 +263,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn) } } -static void shared_msr_update(unsigned slot, u32 msr) -{ - u64 value; - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - - /* only read, and nobody should modify it at this time, - * so don't need lock */ - if (slot >= shared_msrs_global.nr) { - printk(KERN_ERR "kvm: invalid MSR slot!"); - return; - } - rdmsrl_safe(msr, &value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; -} - void kvm_define_shared_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_NR_SHARED_MSRS); @@ -287,10 +274,16 @@ EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { - unsigned i; + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + u64 value; + int i; - for (i = 0; i < shared_msrs_global.nr; ++i) - shared_msr_update(i, shared_msrs_global.msrs[i]); + for (i = 0; i < shared_msrs_global.nr; ++i) { + rdmsrl_safe(shared_msrs_global.msrs[i], &value); + smsr->values[i].host = value; + smsr->values[i].curr = value; + } } int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) @@ -299,13 +292,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); int err; - if (((value ^ smsr->values[slot].curr) & mask) == 0) + value = (value & mask) | (smsr->values[slot].host & ~mask); + if (value == smsr->values[slot].curr) return 0; - smsr->values[slot].curr = value; err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); if (err) return 1; + smsr->values[slot].curr = value; if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -360,8 +354,7 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base); asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - if (!kvm_rebooting) - BUG(); + BUG_ON(!kvm_rebooting); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); @@ -709,10 +702,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) ret = 1; memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + out: return ret; @@ -722,7 +713,6 @@ EXPORT_SYMBOL_GPL(load_pdptrs); bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; - bool changed = true; int offset; gfn_t gfn; int r; @@ -730,8 +720,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) if (!is_pae_paging(vcpu)) return false; - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) + if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) return true; gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; @@ -739,11 +728,9 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) - goto out; - changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; -out: + return true; - return changed; + return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; } EXPORT_SYMBOL_GPL(pdptrs_changed); @@ -812,27 +799,34 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } } -EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { - if (vcpu->guest_xcr0_loaded) { + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, host_xss); } + } -EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -984,7 +978,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); return 0; } @@ -1133,13 +1127,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * - * This list is modified at module load time to reflect the + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) + * extract the supported MSRs from the related const lists. + * msrs_to_save is selected from the msrs_to_save_all to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs * may depend on host virtualization features rather than host cpu features. */ -static u32 msrs_to_save[] = { +static const u32 msrs_to_save_all[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1180,9 +1176,10 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, }; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, @@ -1221,7 +1218,7 @@ static u32 emulated_msrs[] = { * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. * We always support the "true" VMX control MSRs, even if the host * processor does not, so I am putting these registers here rather - * than in msrs_to_save. + * than in msrs_to_save_all. */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, @@ -1240,13 +1237,14 @@ static u32 emulated_msrs[] = { MSR_KVM_POLL_CONTROL, }; +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* * List of msr numbers which are used to expose MSR-based features that * can be used by a hypervisor to validate requested CPU features. */ -static u32 msr_based_features[] = { +static const u32 msr_based_features_all[] = { MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, MSR_IA32_VMX_PINBASED_CTLS, @@ -1271,6 +1269,7 @@ static u32 msr_based_features[] = { MSR_IA32_ARCH_CAPABILITIES, }; +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; static u64 kvm_get_arch_capabilities(void) @@ -1281,6 +1280,14 @@ static u64 kvm_get_arch_capabilities(void) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); /* + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that + * the nested hypervisor runs with NX huge pages. If it is not, + * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 guests, so it need not worry about its own (L2) guests. + */ + data |= ARCH_CAP_PSCHANGE_MC_NO; + + /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us @@ -1299,6 +1306,17 @@ static u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; + /* + * On TAA affected systems: + * - nothing to do if TSX is disabled on the host. + * - we emulate TSX_CTRL if present on the host. + * This lets the guest use VERW to clear CPU buffers. + */ + if (!boot_cpu_has(X86_FEATURE_RTM)) + data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); + else if (!boot_cpu_has_bug(X86_BUG_TAA)) + data |= ARCH_CAP_TAA_NO; + return data; } @@ -1445,8 +1463,8 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) +int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; int ret; @@ -1521,20 +1539,25 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) } #ifdef CONFIG_X86_64 +struct pvclock_clock { + int vclock_mode; + u64 cycle_last; + u64 mask; + u32 mult; + u32 shift; +}; + struct pvclock_gtod_data { seqcount_t seq; - struct { /* extract of a clocksource struct */ - int vclock_mode; - u64 cycle_last; - u64 mask; - u32 mult; - u32 shift; - } clock; + struct pvclock_clock clock; /* extract of a clocksource struct */ + struct pvclock_clock raw_clock; /* extract of a clocksource struct */ + u64 boot_ns_raw; u64 boot_ns; u64 nsec_base; u64 wall_time_sec; + u64 monotonic_raw_nsec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1542,9 +1565,10 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; - u64 boot_ns; + u64 boot_ns, boot_ns_raw; boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); + boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); @@ -1555,11 +1579,20 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->clock.mult = tk->tkr_mono.mult; vdata->clock.shift = tk->tkr_mono.shift; + vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; + vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; + vdata->raw_clock.mask = tk->tkr_raw.mask; + vdata->raw_clock.mult = tk->tkr_raw.mult; + vdata->raw_clock.shift = tk->tkr_raw.shift; + vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; vdata->wall_time_sec = tk->xtime_sec; + vdata->boot_ns_raw = boot_ns_raw; + vdata->monotonic_raw_nsec = tk->tkr_raw.xtime_nsec; + write_seqcount_end(&vdata->seq); } #endif @@ -1983,21 +2016,21 @@ static u64 read_tsc(void) return last; } -static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) +static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, + int *mode) { long v; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; u64 tsc_pg_val; - switch (gtod->clock.vclock_mode) { + switch (clock->vclock_mode) { case VCLOCK_HVCLOCK: tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), tsc_timestamp); if (tsc_pg_val != U64_MAX) { /* TSC page valid */ *mode = VCLOCK_HVCLOCK; - v = (tsc_pg_val - gtod->clock.cycle_last) & - gtod->clock.mask; + v = (tsc_pg_val - clock->cycle_last) & + clock->mask; } else { /* TSC page invalid */ *mode = VCLOCK_NONE; @@ -2006,8 +2039,8 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) case VCLOCK_TSC: *mode = VCLOCK_TSC; *tsc_timestamp = read_tsc(); - v = (*tsc_timestamp - gtod->clock.cycle_last) & - gtod->clock.mask; + v = (*tsc_timestamp - clock->cycle_last) & + clock->mask; break; default: *mode = VCLOCK_NONE; @@ -2016,10 +2049,10 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) if (*mode == VCLOCK_NONE) *tsc_timestamp = v = 0; - return v * gtod->clock.mult; + return v * clock->mult; } -static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) +static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -2028,10 +2061,10 @@ static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) do { seq = read_seqcount_begin(>od->seq); - ns = gtod->nsec_base; - ns += vgettsc(tsc_timestamp, &mode); + ns = gtod->monotonic_raw_nsec; + ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; - ns += gtod->boot_ns; + ns += gtod->boot_ns_raw; } while (unlikely(read_seqcount_retry(>od->seq, seq))); *t = ns; @@ -2049,7 +2082,7 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_sec; ns = gtod->nsec_base; - ns += vgettsc(tsc_timestamp, &mode); + ns += vgettsc(>od->clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -2066,7 +2099,7 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns, + return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, tsc_timestamp)); } @@ -2537,6 +2570,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { vcpu->arch.pv_time_enabled = false; + vcpu->arch.time = 0; } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) @@ -2688,6 +2722,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + /* + * We do support PT if kvm_x86_ops->pt_supported(), but we do + * not support IA32_XSS[bit 8]. Guests will have to use + * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT + * MSRs. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; @@ -2702,8 +2750,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME: { struct kvm_arch *ka = &vcpu->kvm->arch; - kvmclock_reset(vcpu); - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { bool tmp = (msr == MSR_KVM_SYSTEM_TIME); @@ -2717,14 +2763,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ + vcpu->arch.pv_time_enabled = false; if (!(data & 1)) break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = false; - else vcpu->arch.pv_time_enabled = true; break; @@ -3018,6 +3063,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + msr_info->data = vcpu->arch.ia32_xss; + break; case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -3795,12 +3846,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (lapic_in_kernel(vcpu)) { - if (events->smi.latched_init) - set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); - else - clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); - } + } + + if (lapic_in_kernel(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } } @@ -4391,6 +4443,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_NESTED_STATE: { struct kvm_nested_state __user *user_kvm_nested_state = argp; struct kvm_nested_state kvm_state; + int idx; r = -EINVAL; if (!kvm_x86_ops->set_nested_state) @@ -4414,7 +4467,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) break; + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_GET_SUPPORTED_HV_CPUID: { @@ -4916,9 +4971,6 @@ set_identity_unlock: if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); - if (r) - goto set_irqchip_out; - r = 0; set_irqchip_out: kfree(chip); break; @@ -5093,22 +5145,26 @@ static void kvm_init_msr_list(void) { struct x86_pmu_capability x86_pmu; u32 dummy[2]; - unsigned i, j; + unsigned i; BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, - "Please update the fixed PMCs in msrs_to_save[]"); + "Please update the fixed PMCs in msrs_to_saved_all[]"); perf_get_x86_pmu_capability(&x86_pmu); - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + num_msrs_to_save = 0; + num_emulated_msrs = 0; + num_msr_based_features = 0; + + for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { + if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) continue; /* * Even MSRs that are valid in the host may not be exposed * to the guests in some cases. */ - switch (msrs_to_save[i]) { + switch (msrs_to_save_all[i]) { case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) continue; @@ -5136,17 +5192,17 @@ static void kvm_init_msr_list(void) break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { if (!kvm_x86_ops->pt_supported() || - msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >= + msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: - if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: - if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; } @@ -5154,34 +5210,25 @@ static void kvm_init_msr_list(void) break; } - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; + msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; } - num_msrs_to_save = j; - for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { + if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) continue; - if (j < i) - emulated_msrs[j] = emulated_msrs[i]; - j++; + emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - num_emulated_msrs = j; - for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) { + for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { struct kvm_msr_entry msr; - msr.index = msr_based_features[i]; + msr.index = msr_based_features_all[i]; if (kvm_get_msr_feature(&msr)) continue; - if (j < i) - msr_based_features[j] = msr_based_features[i]; - j++; + msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; } - num_msr_based_features = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -5446,6 +5493,7 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); int handle_ud(struct kvm_vcpu *vcpu) { + static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; int emul_type = EMULTYPE_TRAP_UD; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; @@ -5453,7 +5501,7 @@ int handle_ud(struct kvm_vcpu *vcpu) if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && - memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) { + memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) { kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); emul_type = EMULTYPE_TRAP_UD_FORCED; } @@ -6111,7 +6159,7 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); + return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -7841,6 +7889,19 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, + unsigned long *vcpu_bitmap) +{ + cpumask_var_t cpus; + + zalloc_cpumask_var(&cpus, GFP_ATOMIC); + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, + vcpu_bitmap, cpus); + + free_cpumask_var(cpus); +} + void kvm_make_scan_ioapic_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); @@ -7918,7 +7979,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) */ put_page(page); } -EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) { @@ -7941,8 +8001,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) - kvm_x86_ops->get_vmcs12_pages(vcpu); + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) { + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -8673,8 +8737,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) goto out; - /* INITs are latched while in SMM */ - if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + /* + * KVM_MP_STATE_INIT_RECEIVED means the processor is in + * INIT state; latched init should be reported using + * KVM_SET_VCPU_EVENTS, so reject it here. + */ + if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) goto out; @@ -8766,7 +8834,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_set_cr8(vcpu, sregs->cr8); @@ -9293,6 +9361,9 @@ int kvm_arch_hardware_setup(void) kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_msr_list(); return 0; } @@ -9346,7 +9417,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_pio_data; if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; @@ -9415,7 +9486,13 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + vcpu->arch.l1tf_flush_l1d = true; + if (pmu->version && unlikely(pmu->event_count)) { + pmu->need_cleanup = true; + kvm_make_request(KVM_REQ_PMU, vcpu); + } kvm_x86_ops->sched_in(vcpu, cpu); } @@ -9427,6 +9504,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9455,6 +9533,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return kvm_x86_ops->vm_init(kvm); } +int kvm_arch_post_init_vm(struct kvm *kvm) +{ + return kvm_mmu_post_init_vm(kvm); +} + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); @@ -9556,6 +9639,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(x86_set_memory_region); +void kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ + kvm_mmu_pre_destroy_vm(kvm); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index dbf7442a822b..29391af8871d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -238,8 +238,7 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } -static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, - enum kvm_reg reg) +static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, int reg) { unsigned long val = kvm_register_read(vcpu, reg); @@ -247,8 +246,7 @@ static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, } static inline void kvm_register_writel(struct kvm_vcpu *vcpu, - enum kvm_reg reg, - unsigned long val) + int reg, unsigned long val) { if (!is_64_bit_mode(vcpu)) val = (u32)val; @@ -260,6 +258,11 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) return !(kvm->arch.disabled_quirks & quirk); } +static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) +{ + return is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu); +} + void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); @@ -366,7 +369,7 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); #endif |