From 6356ee0c9602004e0a3b4b2dad68ee2ee9385b17 Mon Sep 17 00:00:00 2001 From: Marian Rotariu Date: Mon, 30 Apr 2018 12:23:01 +0300 Subject: x86: Delay skip of emulated hypercall instruction The IP increment should be done after the hypercall emulation, after calling the various handlers. In this way, these handlers can accurately identify the the IP of the VMCALL if they need it. This patch keeps the same functionality for the Hyper-V handler which does not use the return code of the standard kvm_skip_emulated_instruction() call. Signed-off-by: Marian Rotariu [Hyper-V hypercalls also need kvm_skip_emulated_instruction() - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/x86.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 98618e397342..14dd5e5010a2 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1265,7 +1265,7 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51ecd381793b..44bd4a23b59c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6671,12 +6671,13 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r; + int op_64_bit; - r = kvm_skip_emulated_instruction(vcpu); - - if (kvm_hv_hypercall_enabled(vcpu->kvm)) - return kvm_hv_hypercall(vcpu); + if (kvm_hv_hypercall_enabled(vcpu->kvm)) { + if (!kvm_hv_hypercall(vcpu)) + return 0; + goto out; + } nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); @@ -6697,7 +6698,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (kvm_x86_ops->get_cpl(vcpu) != 0) { ret = -KVM_EPERM; - goto out; + goto out_error; } switch (nr) { @@ -6717,12 +6718,14 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } -out: +out_error: if (!op_64_bit) ret = (u32)ret; kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + +out: ++vcpu->stat.hypercalls; - return r; + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -- cgit v1.2.3 From 452a68d0ef341c4d544757e02154788227b2a08b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 7 May 2018 19:24:34 +0200 Subject: KVM: hyperv: idr_find needs RCU protection Even though the eventfd is released after the KVM SRCU grace period elapses, the conn_to_evt data structure itself is not; it uses RCU internally, instead. Fix the read-side critical section to happen under rcu_read_lock/unlock; the result is still protected by vcpu->kvm->srcu. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 14dd5e5010a2..5708e951a5c6 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1296,8 +1296,10 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) if (param & ~KVM_HYPERV_CONN_ID_MASK) return HV_STATUS_INVALID_HYPERCALL_INPUT; - /* conn_to_evt is protected by vcpu->kvm->srcu */ + /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ + rcu_read_lock(); eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param); + rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; -- cgit v1.2.3 From c19986fea873f3c745122bf79013a872a190f212 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Fri, 4 May 2018 11:37:13 -0700 Subject: kvm: x86: Suppress CR3_PCID_INVD bit only when PCIDs are enabled If the PCIDE bit is not set in CR4, then the MSb of CR3 is a reserved bit. If the guest tries to set it, that should cause a #GP fault. So mask out the bit only when the PCIDE bit is set. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 44bd4a23b59c..37dd9a9d050a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -843,7 +843,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { #ifdef CONFIG_X86_64 - cr3 &= ~CR3_PCID_INVD; + bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + if (pcid_enabled) + cr3 &= ~CR3_PCID_INVD; #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { -- cgit v1.2.3 From 64f7a11586ab9262f00b8b6eceef6d8154921bd8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 30 Apr 2018 10:01:06 -0700 Subject: KVM: vmx: update sec exec controls for UMIP iff emulating UMIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update SECONDARY_EXEC_DESC for UMIP emulation if and only UMIP is actually being emulated. Skipping the VMCS update eliminates unnecessary VMREAD/VMWRITE when UMIP is supported in hardware, and on platforms that don't have SECONDARY_VM_EXEC_CONTROL. The latter case resolves a bug where KVM would fill the kernel log with warnings due to failed VMWRITEs on older platforms. Fixes: 0367f205a3b7 ("KVM: vmx: add support for emulating UMIP") Cc: stable@vger.kernel.org #4.16 Reported-by: Paolo Zeppegno Suggested-by: Paolo Bonzini Suggested-by: Radim KrÄmář Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7668806163f..3f1696570b41 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1494,6 +1494,12 @@ static inline bool cpu_has_vmx_vmfunc(void) SECONDARY_EXEC_ENABLE_VMFUNC; } +static bool vmx_umip_emulated(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -4761,14 +4767,16 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) else hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; - if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); - hw_cr4 &= ~X86_CR4_UMIP; - } else if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { + if (cr4 & X86_CR4_UMIP) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_DESC); + hw_cr4 &= ~X86_CR4_UMIP; + } else if (!is_guest_mode(vcpu) || + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + } if (cr4 & X86_CR4_VMXE) { /* @@ -9497,12 +9505,6 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } -static bool vmx_umip_emulated(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_DESC; -} - static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; -- cgit v1.2.3 From 4c27625b7a67eb9006963ed2bcf8e53b259b43af Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sat, 5 May 2018 04:02:32 -0700 Subject: KVM: X86: Lower the default timer frequency limit to 200us MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Anthoine reported: The period used by Windows change over time but it can be 1 milliseconds or less. I saw the limit_periodic_timer_frequency print so 500 microseconds is sometimes reached. As suggested by Paolo, lower the default timer frequency limit to a smaller interval of 200 us (5000 Hz) to leave some headroom. This is required due to Windows 10 changing the scheduler tick limit from 1024 Hz to 2048 Hz. Reported-by: Anthoine Bourgeois Suggested-by: Paolo Bonzini Reviewed-by: Darren Kenny Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Anthoine Bourgeois Cc: Darren Kenny Cc: Jan Kiszka Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 37dd9a9d050a..59371de5d722 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -114,7 +114,7 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); static bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); -unsigned int min_timer_period_us = 500; +unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; -- cgit v1.2.3 From bf308242ab98b5d1648c3663e753556bef9bec01 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 11 May 2018 15:20:14 +0100 Subject: KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() calls with SRCU lock kvm_read_guest() will eventually look up in kvm_memslots(), which requires either to hold the kvm->slots_lock or to be inside a kvm->srcu critical section. In contrast to x86 and s390 we don't take the SRCU lock on every guest exit, so we have to do it individually for each kvm_read_guest() call. Provide a wrapper which does that and use that everywhere. Note that ending the SRCU critical section before returning from the kvm_read_guest() wrapper is safe, because the data has been *copied*, so we don't need to rely on valid references to the memslot anymore. Cc: Stable # 4.8+ Reported-by: Jan Glauber Signed-off-by: Andre Przywara Acked-by: Christoffer Dall Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_mmu.h | 16 ++++++++++++++++ arch/arm64/include/asm/kvm_mmu.h | 16 ++++++++++++++++ virt/kvm/arm/vgic/vgic-its.c | 15 ++++++++------- 3 files changed, 40 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 707a1f06dc5d..f675162663f0 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -309,6 +309,22 @@ static inline unsigned int kvm_get_vmid_bits(void) return 8; } +/* + * We are not in the kvm->srcu critical section most of the time, so we take + * the SRCU read lock here. Since we copy the data from the user page, we + * can immediately drop the lock again. + */ +static inline int kvm_read_guest_lock(struct kvm *kvm, + gpa_t gpa, void *data, unsigned long len) +{ + int srcu_idx = srcu_read_lock(&kvm->srcu); + int ret = kvm_read_guest(kvm, gpa, data, len); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return ret; +} + static inline void *kvm_get_hyp_vector(void) { return kvm_ksym_ref(__kvm_hyp_vector); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 082110993647..6128992c2ded 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -360,6 +360,22 @@ static inline unsigned int kvm_get_vmid_bits(void) return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; } +/* + * We are not in the kvm->srcu critical section most of the time, so we take + * the SRCU read lock here. Since we copy the data from the user page, we + * can immediately drop the lock again. + */ +static inline int kvm_read_guest_lock(struct kvm *kvm, + gpa_t gpa, void *data, unsigned long len) +{ + int srcu_idx = srcu_read_lock(&kvm->srcu); + int ret = kvm_read_guest(kvm, gpa, data, len); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return ret; +} + #ifdef CONFIG_KVM_INDIRECT_VECTORS /* * EL2 vectors can be mapped and rerouted in a number of ways, diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 51a80b600632..7cb060e01a76 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -281,8 +281,8 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, int ret; unsigned long flags; - ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET, - &prop, 1); + ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET, + &prop, 1); if (ret) return ret; @@ -444,8 +444,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) * this very same byte in the last iteration. Reuse that. */ if (byte_offset != last_byte_offset) { - ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset, - &pendmask, 1); + ret = kvm_read_guest_lock(vcpu->kvm, + pendbase + byte_offset, + &pendmask, 1); if (ret) { kfree(intids); return ret; @@ -789,7 +790,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, return false; /* Each 1st level entry is represented by a 64-bit value. */ - if (kvm_read_guest(its->dev->kvm, + if (kvm_read_guest_lock(its->dev->kvm, BASER_ADDRESS(baser) + index * sizeof(indirect_ptr), &indirect_ptr, sizeof(indirect_ptr))) return false; @@ -1370,8 +1371,8 @@ static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) cbaser = CBASER_ADDRESS(its->cbaser); while (its->cwriter != its->creadr) { - int ret = kvm_read_guest(kvm, cbaser + its->creadr, - cmd_buf, ITS_CMD_SIZE); + int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, + cmd_buf, ITS_CMD_SIZE); /* * If kvm_read_guest() fails, this could be due to the guest * programming a bogus value in CBASER or something else going -- cgit v1.2.3 From 633711e82878dc29083fc5d2605166755e25b57a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 17 May 2018 17:54:24 +0300 Subject: kvm: rename KVM_HINTS_DEDICATED to KVM_HINTS_REALTIME KVM_HINTS_DEDICATED seems to be somewhat confusing: Guest doesn't really care whether it's the only task running on a host CPU as long as it's not preempted. And there are more reasons for Guest to be preempted than host CPU sharing, for example, with memory overcommit it can get preempted on a memory access, post copy migration can cause preemption, etc. Let's call it KVM_HINTS_REALTIME which seems to better match what guests expect. Also, the flag most be set on all vCPUs - current guests assume this. Note so in the documentation. Signed-off-by: Michael S. Tsirkin Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/cpuid.txt | 6 +++--- arch/x86/include/uapi/asm/kvm_para.h | 2 +- arch/x86/kernel/kvm.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index d4f33eb805dd..ab022dcd0911 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -72,8 +72,8 @@ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side flag || value || meaning ================================================================================== -KVM_HINTS_DEDICATED || 0 || guest checks this feature bit to - || || determine if there is vCPU pinning - || || and there is no vCPU over-commitment, +KVM_HINTS_REALTIME || 0 || guest checks this feature bit to + || || determine that vCPUs are never + || || preempted for an unlimited time, || || allowing optimizations ---------------------------------------------------------------------------------- diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 4c851ebb3ceb..0ede697c3961 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -29,7 +29,7 @@ #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 -#define KVM_HINTS_DEDICATED 0 +#define KVM_HINTS_REALTIME 0 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7867417cfaff..5b2300b818af 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -457,7 +457,7 @@ static void __init sev_map_percpu_data(void) static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); - if (kvm_para_has_hint(KVM_HINTS_DEDICATED)) + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) static_branch_disable(&virt_spin_lock_key); } @@ -553,7 +553,7 @@ static void __init kvm_guest_init(void) } if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_DEDICATED) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; @@ -649,7 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void) int cpu; if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_DEDICATED) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), @@ -745,7 +745,7 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; - if (kvm_para_has_hint(KVM_HINTS_DEDICATED)) + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) return; __pv_init_lock_hash(); -- cgit v1.2.3