diff options
38 files changed, 2883 insertions, 705 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 4a900cdbc62e..0c1b9f139e4a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -982,12 +982,22 @@ memory. __u8 pad2[30]; }; -If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the -KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl. -This requests KVM to generate the contents of the hypercall page -automatically; hypercalls will be intercepted and passed to userspace -through KVM_EXIT_XEN. In this case, all of the blob size and address -fields must be zero. +If certain flags are returned from the KVM_CAP_XEN_HVM check, they may +be set in the flags field of this ioctl: + +The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate +the contents of the hypercall page automatically; hypercalls will be +intercepted and passed to userspace through KVM_EXIT_XEN. In this +ase, all of the blob size and address fields must be zero. + +The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace +will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event +channel interrupts rather than manipulating the guest's shared_info +structures directly. This, in turn, may allow KVM to enable features +such as intercepting the SCHEDOP_poll hypercall to accelerate PV +spinlock operation for the guest. Userspace may still use the ioctl +to deliver events if it was advertised, even if userspace does not +send this indication that it will always do so No other flags are currently valid in the struct kvm_xen_hvm_config. @@ -1887,22 +1897,25 @@ the future. 4.55 KVM_SET_TSC_KHZ -------------------- -:Capability: KVM_CAP_TSC_CONTROL +:Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL :Architectures: x86 -:Type: vcpu ioctl +:Type: vcpu ioctl / vm ioctl :Parameters: virtual tsc_khz :Returns: 0 on success, -1 on error Specifies the tsc frequency for the virtual machine. The unit of the frequency is KHz. +If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also +be used as a vm ioctl to set the initial tsc frequency of subsequently +created vCPUs. 4.56 KVM_GET_TSC_KHZ -------------------- -:Capability: KVM_CAP_GET_TSC_KHZ +:Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL :Architectures: x86 -:Type: vcpu ioctl +:Type: vcpu ioctl / vm ioctl :Parameters: none :Returns: virtual tsc-khz on success, negative value on error @@ -5216,7 +5229,25 @@ have deterministic behavior. struct { __u64 gfn; } shared_info; - __u64 pad[4]; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; } u; }; @@ -5247,6 +5278,30 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO KVM_XEN_ATTR_TYPE_UPCALL_VECTOR Sets the exception vector used to deliver Xen event channel upcalls. + This is the HVM-wide vector injected directly by the hypervisor + (not through the local APIC), typically configured by a guest via + HVM_PARAM_CALLBACK_IRQ. + +KVM_XEN_ATTR_TYPE_EVTCHN + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures + an outbound port number for interception of EVTCHNOP_send requests + from the guest. A given sending port number may be directed back + to a specified vCPU (by APIC ID) / port / priority on the guest, + or to trigger events on an eventfd. The vCPU and priority can be + changed by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call, + but other fields cannot change for a given sending port. A port + mapping is removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags + field. + +KVM_XEN_ATTR_TYPE_XEN_VERSION + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures + the 32-bit version code returned to the guest when it invokes the + XENVER_version call; typically (XEN_MAJOR << 16 | XEN_MINOR). PV + Xen guests will often use this to as a dummy hypercall to trigger + event channel delivery, so responding within the kernel without + exiting to userspace is beneficial. 4.127 KVM_XEN_HVM_GET_ATTR -------------------------- @@ -5258,7 +5313,8 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR :Returns: 0 on success, < 0 on error Allows Xen VM attributes to be read. For the structure and types, -see KVM_XEN_HVM_SET_ATTR above. +see KVM_XEN_HVM_SET_ATTR above. The KVM_XEN_ATTR_TYPE_EVTCHN +attribute cannot be read. 4.128 KVM_XEN_VCPU_SET_ATTR --------------------------- @@ -5285,6 +5341,13 @@ see KVM_XEN_HVM_SET_ATTR above. __u64 time_blocked; __u64 time_offline; } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; } u; }; @@ -5326,6 +5389,27 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST or RUNSTATE_offline) to set the current accounted state as of the adjusted state_entry_time. +KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the Xen + vCPU ID of the given vCPU, to allow timer-related VCPU operations to + be intercepted by KVM. + +KVM_XEN_VCPU_ATTR_TYPE_TIMER + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the + event channel port/priority for the VIRQ_TIMER of the vCPU, as well + as allowing a pending timer to be saved/restored. + +KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the + per-vCPU local APIC upcall vector, configured by a Xen guest with + the HVMOP_set_evtchn_upcall_vector hypercall. This is typically + used by Windows guests, and is distinct from the HVM-wide upcall + vector configured with HVM_PARAM_CALLBACK_IRQ. + + 4.129 KVM_XEN_VCPU_GET_ATTR --------------------------- @@ -5645,6 +5729,25 @@ enabled with ``arch_prctl()``, but this may change in the future. The offsets of the state save areas in struct kvm_xsave follow the contents of CPUID leaf 0xD on the host. +4.135 KVM_XEN_HVM_EVTCHN_SEND +----------------------------- + +:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_irq_routing_xen_evtchn +:Returns: 0 on success, < 0 on error + + +:: + + struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; + }; + +This ioctl injects an event channel interrupt directly to the guest vCPU. 5. The kvm_run structure ======================== @@ -5985,6 +6088,7 @@ should put the acknowledged interrupt vector into the 'epr' field. #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 + #define KVM_SYSTEM_EVENT_SEV_TERM 4 __u32 type; __u32 ndata; __u64 data[16]; @@ -6009,6 +6113,8 @@ Valid values for 'type' are: has requested a crash condition maintenance. Userspace can choose to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + - KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination. + The guest physical address of the guest's GHCB is stored in `data[0]`. If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain architecture specific information for the system-level event. Only @@ -7145,6 +7251,15 @@ The valid bits in cap.args[0] are: Additionally, when this quirk is disabled, KVM clears CPUID.01H:ECX[bit 3] if IA32_MISC_ENABLE[bit 18] is cleared. + + KVM_X86_QUIRK_FIX_HYPERCALL_INSN By default, KVM rewrites guest + VMMCALL/VMCALL instructions to match the + vendor's hypercall instruction for the + system. When this quirk is disabled, KVM + will no longer rewrite invalid guest + hypercall instructions. Executing the + incorrect hypercall instruction will + generate a #UD within the guest. =================================== ============================================ 8. Other capabilities. @@ -7622,8 +7737,9 @@ PVHVM guests. Valid flags are:: #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) - #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 2) - #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 3) + #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) + #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) + #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG ioctl is available, for the guest to set its hypercall page. @@ -7647,6 +7763,14 @@ The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority field set to indicate 2 level event channel delivery. +The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates that KVM supports +injecting event channel events directly into the guest with the +KVM_XEN_HVM_EVTCHN_SEND ioctl. It also indicates support for the +KVM_XEN_ATTR_TYPE_EVTCHN/XEN_VERSION HVM attributes and the +KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes. +related to event channel delivery, timers, and the XENVER_version +interception. + 8.31 KVM_CAP_PPC_MULTITCE ------------------------- diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 3c368b639c04..96e4e9842dfc 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -126,6 +126,7 @@ KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) +KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h new file mode 100644 index 000000000000..fdfd8e06fee6 --- /dev/null +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL) +BUILD_BUG_ON(1) +#endif + +/* + * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate + * both DECLARE/DEFINE_STATIC_CALL() invocations and + * "static_call_update()" calls. + * + * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have + * a NULL definition, for example if "static_call_cond()" will be used + * at the call sites. + */ +KVM_X86_PMU_OP(pmc_perf_hw_id) +KVM_X86_PMU_OP(pmc_is_enabled) +KVM_X86_PMU_OP(pmc_idx_to_pmc) +KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) +KVM_X86_PMU_OP(msr_idx_to_pmc) +KVM_X86_PMU_OP(is_valid_rdpmc_ecx) +KVM_X86_PMU_OP(is_valid_msr) +KVM_X86_PMU_OP(get_msr) +KVM_X86_PMU_OP(set_msr) +KVM_X86_PMU_OP(refresh) +KVM_X86_PMU_OP(init) +KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) +KVM_X86_PMU_OP_OPTIONAL(cleanup) + +#undef KVM_X86_PMU_OP +#undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e0c0f0e1f754..e1c695f72b8b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -443,14 +443,6 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; - struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; - - /* - * Bitmap; bit set = permission fault - * Byte index: page fault error code [4:1] - * Bit index: pte permissions in ACC_* format - */ - u8 permissions[16]; /* * The pkru_mask indicates if protection key checks are needed. It @@ -460,6 +452,15 @@ struct kvm_mmu { */ u32 pkru_mask; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; + + /* + * Bitmap; bit set = permission fault + * Byte index: page fault error code [4:1] + * Bit index: pte permissions in ACC_* format + */ + u8 permissions[16]; + u64 *pae_root; u64 *pml4_root; u64 *pml5_root; @@ -607,16 +608,21 @@ struct kvm_vcpu_hv { struct kvm_vcpu_xen { u64 hypercall_rip; u32 current_runstate; - bool vcpu_info_set; - bool vcpu_time_info_set; - bool runstate_set; - struct gfn_to_hva_cache vcpu_info_cache; - struct gfn_to_hva_cache vcpu_time_info_cache; - struct gfn_to_hva_cache runstate_cache; + u8 upcall_vector; + struct gfn_to_pfn_cache vcpu_info_cache; + struct gfn_to_pfn_cache vcpu_time_info_cache; + struct gfn_to_pfn_cache runstate_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; unsigned long evtchn_pending_sel; + u32 vcpu_id; /* The Xen / ACPI vCPU ID */ + u32 timer_virq; + u64 timer_expires; /* In guest epoch */ + atomic_t timer_pending; + struct hrtimer timer; + int poll_evtchn; + struct timer_list poll_timer; }; struct kvm_vcpu_arch { @@ -753,8 +759,7 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; - struct gfn_to_hva_cache pv_time; - bool pv_time_enabled; + struct gfn_to_pfn_cache pv_time; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request; @@ -1024,9 +1029,12 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + u32 xen_version; bool long_mode; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; + struct idr evtchn_ports; + unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; enum kvm_irqchip_mode { @@ -1119,6 +1127,8 @@ struct kvm_arch { u64 cur_tsc_generation; int nr_vcpus_matched_tsc; + u32 default_tsc_khz; + seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; u64 master_kernel_ns; @@ -1455,8 +1465,6 @@ struct kvm_x86_ops { int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); - /* pmu operations of sub-arch */ - const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; void (*vcpu_blocking)(struct kvm_vcpu *vcpu); @@ -1498,6 +1506,11 @@ struct kvm_x86_ops { int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); + + /* + * Returns vCPU specific APICv inhibit reasons + */ + unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { @@ -1527,6 +1540,7 @@ struct kvm_x86_init_ops { unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; + struct kvm_pmu_ops *pmu_ops; }; struct kvm_arch_async_pf { @@ -1548,20 +1562,6 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> -static inline void kvm_ops_static_call_update(void) -{ -#define __KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP(func) \ - WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP -#define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ - (void *)__static_call_return0); -#include <asm/kvm-x86-ops.h> -#undef __KVM_X86_OP -} - #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -1799,6 +1799,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); bool kvm_apicv_activated(struct kvm *kvm); +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); @@ -1988,6 +1989,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_CD_NW_CLEARED | \ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ - KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) + KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ + KVM_X86_QUIRK_FIX_HYPERCALL_INSN) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index f78e2b3501a1..35f222aa66bf 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -382,6 +382,103 @@ do { \ #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT +#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \ + bool success; \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + asm_volatile_goto("\n" \ + "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ + _ASM_EXTABLE_UA(1b, %l[label]) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*_ptr), \ + [old] "+a" (__old) \ + : [new] ltype (__new) \ + : "memory" \ + : label); \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); }) + +#ifdef CONFIG_X86_32 +#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \ + bool success; \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + asm_volatile_goto("\n" \ + "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ + _ASM_EXTABLE_UA(1b, %l[label]) \ + : CC_OUT(z) (success), \ + "+A" (__old), \ + [ptr] "+m" (*_ptr) \ + : "b" ((u32)__new), \ + "c" ((u32)((u64)__new >> 32)) \ + : "memory" \ + : label); \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); }) +#endif // CONFIG_X86_32 +#else // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT +#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \ + int __err = 0; \ + bool success; \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + asm volatile("\n" \ + "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ + CC_SET(z) \ + "2:\n" \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \ + %[errout]) \ + : CC_OUT(z) (success), \ + [errout] "+r" (__err), \ + [ptr] "+m" (*_ptr), \ + [old] "+a" (__old) \ + : [new] ltype (__new) \ + : "memory", "cc"); \ + if (unlikely(__err)) \ + goto label; \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); }) + +#ifdef CONFIG_X86_32 +/* + * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error. + * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are + * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses + * both ESI and EDI for the memory operand, compilation will fail if the error + * is an input+output as there will be no register available for input. + */ +#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \ + int __result; \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + asm volatile("\n" \ + "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ + "mov $0, %%ecx\n\t" \ + "setz %%cl\n" \ + "2:\n" \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \ + : [result]"=c" (__result), \ + "+A" (__old), \ + [ptr] "+m" (*_ptr) \ + : "b" ((u32)__new), \ + "c" ((u32)((u64)__new >> 32)) \ + : "memory", "cc"); \ + if (unlikely(__result < 0)) \ + goto label; \ + if (unlikely(!__result)) \ + *_old = __old; \ + likely(__result); }) +#endif // CONFIG_X86_32 +#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT + /* FIXME: this hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; #define __m(x) (*(struct __large_struct __user *)(x)) @@ -474,6 +571,51 @@ do { \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT +extern void __try_cmpxchg_user_wrong_size(void); + +#ifndef CONFIG_X86_32 +#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label) \ + __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label) +#endif + +/* + * Force the pointer to u<size> to match the size expected by the asm helper. + * clang/LLVM compiles all cases and only discards the unused paths after + * processing errors, which breaks i386 if the pointer is an 8-byte value. + */ +#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \ + bool __ret; \ + __chk_user_ptr(_ptr); \ + switch (sizeof(*(_ptr))) { \ + case 1: __ret = __try_cmpxchg_user_asm("b", "q", \ + (__force u8 *)(_ptr), (_oldp), \ + (_nval), _label); \ + break; \ + case 2: __ret = __try_cmpxchg_user_asm("w", "r", \ + (__force u16 *)(_ptr), (_oldp), \ + (_nval), _label); \ + break; \ + case 4: __ret = __try_cmpxchg_user_asm("l", "r", \ + (__force u32 *)(_ptr), (_oldp), \ + (_nval), _label); \ + break; \ + case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\ + (_nval), _label); \ + break; \ + default: __try_cmpxchg_user_wrong_size(); \ + } \ + __ret; }) + +/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */ +#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \ + int __ret = -EFAULT; \ + __uaccess_begin_nospec(); \ + __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label); \ +_label: \ + __uaccess_end(); \ + __ret; \ + }) + /* * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0ffaa3156a4e..6c343c6a1855 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -543,16 +543,14 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_ACC_READ_BIT 0 #define EPT_VIOLATION_ACC_WRITE_BIT 1 #define EPT_VIOLATION_ACC_INSTR_BIT 2 -#define EPT_VIOLATION_READABLE_BIT 3 -#define EPT_VIOLATION_WRITABLE_BIT 4 -#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_RWX_SHIFT 3 +#define EPT_VIOLATION_GVA_IS_VALID_BIT 7 #define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 #define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) #define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) #define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) -#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) -#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) -#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) +#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT) +#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT) #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index bf6e96011dfe..21614807a2cb 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -428,11 +428,12 @@ struct kvm_sync_regs { struct kvm_vcpu_events events; }; -#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) -#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) -#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) -#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) -#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) +#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) +#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index b14533af7676..9b698215d261 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,7 +5,7 @@ #include <asm/ia32.h> -#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#if defined(CONFIG_KVM_GUEST) #include <asm/kvm_para.h> #endif @@ -20,7 +20,7 @@ int main(void) BLANK(); #endif -#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#if defined(CONFIG_KVM_GUEST) OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); BLANK(); #endif diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a22deb58f86d..d0bb2b3fb305 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -752,6 +752,42 @@ static void kvm_crash_shutdown(struct pt_regs *regs) } #endif +#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP) +bool __kvm_vcpu_is_preempted(long cpu); + +__visible bool __kvm_vcpu_is_preempted(long cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!(src->preempted & KVM_VCPU_PREEMPTED); +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + +#else + +#include <asm/asm-offsets.h> + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +ASM_ENDBR +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +ASM_RET +".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" +".popsection"); + +#endif + static void __init kvm_guest_init(void) { int i; @@ -764,6 +800,9 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; static_call_update(pv_steal_clock, kvm_steal_clock); + + pv_ops.lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -1005,40 +1044,6 @@ static void kvm_wait(u8 *ptr, u8 val) } } -#ifdef CONFIG_X86_32 -__visible bool __kvm_vcpu_is_preempted(long cpu) -{ - struct kvm_steal_time *src = &per_cpu(steal_time, cpu); - - return !!(src->preempted & KVM_VCPU_PREEMPTED); -} -PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); - -#else - -#include <asm/asm-offsets.h> - -extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); - -/* - * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and - * restoring to/from the stack. - */ -asm( -".pushsection .text;" -".global __raw_callee_save___kvm_vcpu_is_preempted;" -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" -"__raw_callee_save___kvm_vcpu_is_preempted:" -ASM_ENDBR -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" -"setne %al;" -ASM_RET -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" -".popsection"); - -#endif - /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -1082,10 +1087,6 @@ void __init kvm_spinlock_init(void) pv_ops.lock.wait = kvm_wait; pv_ops.lock.kick = kvm_kick_cpu; - if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - pv_ops.lock.vcpu_is_preempted = - PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); - } /* * When PV spinlock is enabled which is preferred over * virt_spin_lock(), virt_spin_lock_key's value is meaningless. diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index c5caa7311bd8..16333ba1904b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -239,7 +239,7 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { - if (!kvm_para_available() || !kvmclock) + if (!kvm_para_available() || !kvmclock || nopv) return 0; kvmclock_init_mem(); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index be99dc86293d..e1bb6218bb96 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -252,7 +252,6 @@ int kvm_pic_read_irq(struct kvm *kvm) */ irq2 = 7; intno = s->pics[1].irq_base + irq2; - irq = irq2 + 8; } else intno = s->pics[0].irq_base + irq; } else { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 172b05343cfd..f371f1292ca3 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -22,10 +22,14 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { + int r = 0; + if (lapic_in_kernel(vcpu)) - return apic_has_pending_timer(vcpu); + r = apic_has_pending_timer(vcpu); + if (kvm_xen_timer_enabled(vcpu)) + r += kvm_xen_has_pending_timer(vcpu); - return 0; + return r; } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); @@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) kvm_inject_apic_timer_irqs(vcpu); + if (kvm_xen_timer_enabled(vcpu)) + kvm_xen_inject_timer_irqs(vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6e0dab04320e..0687162c4f22 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -181,7 +181,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (!level) return -1; - return kvm_xen_set_evtchn_fast(e, kvm); + return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm); #endif default: break; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 64a2a7e2be90..77785587332e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1866,17 +1866,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else -static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int ret = vcpu->arch.mmu->sync_page(vcpu, sp); - if (ret < 0) { + if (ret < 0) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return false; - } - - return !!ret; + return ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, @@ -1998,7 +1995,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, for_each_sp(pages, sp, parents, i) { kvm_unlink_unsync_page(vcpu->kvm, sp); - flush |= kvm_sync_page(vcpu, sp, &invalid_list); + flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0; mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { @@ -2039,6 +2036,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; + int ret; int collisions = 0; LIST_HEAD(invalid_list); @@ -2091,11 +2089,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * If the sync fails, the page is zapped. If so, break * in order to rebuild it. */ - if (!kvm_sync_page(vcpu, sp, &invalid_list)) + ret = kvm_sync_page(vcpu, sp, &invalid_list); + if (ret < 0) break; WARN_ON(!list_empty(&invalid_list)); - kvm_flush_remote_tlbs(vcpu->kvm); + if (ret > 0) + kvm_flush_remote_tlbs(vcpu->kvm); } __clear_sp_write_flooding_count(sp); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 01fee5f67ac3..7d4377f1ef2a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -144,42 +144,6 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); } -static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t __user *ptep_user, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) -{ - signed char r; - - if (!user_access_begin(ptep_user, sizeof(pt_element_t))) - return -EFAULT; - -#ifdef CMPXCHG - asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n" - "setnz %b[r]\n" - "2:" - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r]) - : [ptr] "+m" (*ptep_user), - [old] "+a" (orig_pte), - [r] "=q" (r) - : [new] "r" (new_pte) - : "memory"); -#else - asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n" - "setnz %b[r]\n" - "2:" - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r]) - : [ptr] "+m" (*ptep_user), - [old] "+A" (orig_pte), - [r] "=q" (r) - : [new_lo] "b" ((u32)new_pte), - [new_hi] "c" ((u32)(new_pte >> 32)) - : "memory"); -#endif - - user_access_end(); - return r; -} - static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) @@ -278,7 +242,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (unlikely(!walker->pte_writable[level - 1])) continue; - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); + ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault); if (ret) return ret; @@ -515,14 +479,21 @@ error: * The other bits are set to 0. */ if (!(errcode & PFERR_RSVD_MASK)) { - vcpu->arch.exit_qualification &= 0x180; + vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); if (write_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE; if (user_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ; if (fetch_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR; - vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3; + + /* + * Note, pte_access holds the raw RWX bits from the EPTE, not + * ACC_*_MASK flags! + */ + vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << + EPT_VIOLATION_RWX_SHIFT; } #endif walker->fault.address = addr; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eca39f56c231..618f529f1c4d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -49,6 +49,32 @@ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters */ +static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; + +#define KVM_X86_PMU_OP(func) \ + DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \ + *(((struct kvm_pmu_ops *)0)->func)); +#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP +#include <asm/kvm-x86-pmu-ops.h> + +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) +{ + memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops)); + +#define __KVM_X86_PMU_OP(func) \ + static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func); +#define KVM_X86_PMU_OP(func) \ + WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func) +#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP +#include <asm/kvm-x86-pmu-ops.h> +#undef __KVM_X86_PMU_OP +} + +static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +{ + return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); +} + static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); @@ -213,7 +239,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } @@ -263,7 +289,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc), + static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi); @@ -272,7 +298,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); if (!pmc) return; @@ -294,7 +320,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) int bit; for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { clear_bit(bit, pmu->reprogram_pmi); @@ -316,7 +342,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) /* check if idx is a valid index to access PMU */ bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); + return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -366,7 +392,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); + pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask); if (!pmc) return 1; @@ -382,22 +408,21 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { - if (kvm_x86_ops.pmu_ops->deliver_pmi) - kvm_x86_ops.pmu_ops->deliver_pmi(vcpu); + static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } } bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) || - kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr); + return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || + static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -405,13 +430,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info); + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info); + return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); } /* refresh PMU settings. This function generally is called when underlying @@ -420,7 +445,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { - kvm_x86_ops.pmu_ops->refresh(vcpu); + static_call(kvm_x86_pmu_refresh)(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) @@ -428,7 +453,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); irq_work_sync(&pmu->irq_work); - kvm_x86_ops.pmu_ops->reset(vcpu); + static_call(kvm_x86_pmu_reset)(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -436,7 +461,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - kvm_x86_ops.pmu_ops->init(vcpu); + static_call(kvm_x86_pmu_init)(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; @@ -468,14 +493,13 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { - pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) pmc_stop_counter(pmc); } - if (kvm_x86_ops.pmu_ops->cleanup) - kvm_x86_ops.pmu_ops->cleanup(vcpu); + static_call_cond(kvm_x86_pmu_cleanup)(vcpu); bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } @@ -505,7 +529,7 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int config; pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); pmc->eventsel = old_eventsel; return config == perf_hw_id; } @@ -533,7 +557,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) int i; for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { - pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) continue; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9e66fba1d6a3..2a53b6c9495c 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -39,6 +39,8 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); }; +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -86,11 +88,6 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) return pmc->type == KVM_PMC_FIXED; } -static inline bool pmc_is_enabled(struct kvm_pmc *pmc) -{ - return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc); -} - static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, u64 data) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 421619540ff9..9b859218af59 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -165,9 +165,8 @@ free_avic: return err; } -void avic_init_vmcb(struct vcpu_svm *svm) +void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) { - struct vmcb *vmcb = svm->vmcb; struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); @@ -357,6 +356,13 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) return 1; } +unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return APICV_INHIBIT_REASON_NESTED; + return 0; +} + static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 96bab464967f..caa691229b71 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -36,40 +36,43 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { + if (vmcb->control.exit_code != SVM_EXIT_NPF) { /* * TODO: track the cause of the nested page fault, and * correctly fill in the high bits of exit_info_1. */ - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = (1ULL << 32); - svm->vmcb->control.exit_info_2 = fault->address; + vmcb->control.exit_code = SVM_EXIT_NPF; + vmcb->control.exit_code_hi = 0; + vmcb->control.exit_info_1 = (1ULL << 32); + vmcb->control.exit_info_2 = fault->address; } - svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; - svm->vmcb->control.exit_info_1 |= fault->error_code; + vmcb->control.exit_info_1 &= ~0xffffffffULL; + vmcb->control.exit_info_1 |= fault->error_code; nested_svm_vmexit(svm); } static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) { - struct vcpu_svm *svm = to_svm(vcpu); - WARN_ON(!is_guest_mode(vcpu)); + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + + WARN_ON(!is_guest_mode(vcpu)); if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && - !svm->nested.nested_run_pending) { - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = fault->error_code; - svm->vmcb->control.exit_info_2 = fault->address; - nested_svm_vmexit(svm); - } else { - kvm_inject_page_fault(vcpu, fault); - } + !svm->nested.nested_run_pending) { + vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; + vmcb->control.exit_code_hi = 0; + vmcb->control.exit_info_1 = fault->error_code; + vmcb->control.exit_info_2 = fault->address; + nested_svm_vmexit(svm); + } else { + kvm_inject_page_fault(vcpu, fault); + } } static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) @@ -121,6 +124,20 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } +static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) +{ + if (!svm->v_vmload_vmsave_enabled) + return true; + + if (!nested_npt_enabled(svm)) + return true; + + if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)) + return true; + + return false; +} + void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; @@ -162,8 +179,17 @@ void recalc_intercepts(struct vcpu_svm *svm) if (!intercept_smi) vmcb_clr_intercept(c, INTERCEPT_SMI); - vmcb_set_intercept(c, INTERCEPT_VMLOAD); - vmcb_set_intercept(c, INTERCEPT_VMSAVE); + if (nested_vmcb_needs_vls_intercept(svm)) { + /* + * If the virtual VMLOAD/VMSAVE is not enabled for the L2, + * we must intercept these instructions to correctly + * emulate them in case L1 doesn't intercept them. + */ + vmcb_set_intercept(c, INTERCEPT_VMLOAD); + vmcb_set_intercept(c, INTERCEPT_VMSAVE); + } else { + WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)); + } } /* @@ -413,6 +439,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) */ mask &= ~V_IRQ_MASK; } + + if (nested_vgif_enabled(svm)) + mask |= V_GIF_MASK; + svm->nested.ctl.int_ctl &= ~mask; svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; } @@ -454,11 +484,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, vmcb12->control.exit_int_info = exit_int_info; } -static inline bool nested_npt_enabled(struct vcpu_svm *svm) -{ - return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; -} - static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) { /* @@ -515,6 +540,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { bool new_vmcb12 = false; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; nested_vmcb02_compute_g_pat(svm); @@ -526,18 +553,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { - svm->vmcb->save.es = vmcb12->save.es; - svm->vmcb->save.cs = vmcb12->save.cs; - svm->vmcb->save.ss = vmcb12->save.ss; - svm->vmcb->save.ds = vmcb12->save.ds; - svm->vmcb->save.cpl = vmcb12->save.cpl; - vmcb_mark_dirty(svm->vmcb, VMCB_SEG); + vmcb02->save.es = vmcb12->save.es; + vmcb02->save.cs = vmcb12->save.cs; + vmcb02->save.ss = vmcb12->save.ss; + vmcb02->save.ds = vmcb12->save.ds; + vmcb02->save.cpl = vmcb12->save.cpl; + vmcb_mark_dirty(vmcb02, VMCB_SEG); } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { - svm->vmcb->save.gdtr = vmcb12->save.gdtr; - svm->vmcb->save.idtr = vmcb12->save.idtr; - vmcb_mark_dirty(svm->vmcb, VMCB_DT); + vmcb02->save.gdtr = vmcb12->save.gdtr; + vmcb02->save.idtr = vmcb12->save.idtr; + vmcb_mark_dirty(vmcb02, VMCB_DT); } kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); @@ -554,47 +581,59 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 kvm_rip_write(&svm->vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = vmcb12->save.rax; - svm->vmcb->save.rsp = vmcb12->save.rsp; - svm->vmcb->save.rip = vmcb12->save.rip; + vmcb02->save.rax = vmcb12->save.rax; + vmcb02->save.rsp = vmcb12->save.rsp; + vmcb02->save.rip = vmcb12->save.rip; /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { - svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; + vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; - vmcb_mark_dirty(svm->vmcb, VMCB_DR); + vmcb_mark_dirty(vmcb02, VMCB_DR); + } + + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + /* + * Reserved bits of DEBUGCTL are ignored. Be consistent with + * svm_set_msr's definition of reserved bits. + */ + svm_copy_lbrs(vmcb02, vmcb12); + vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; + svm_update_lbrv(&svm->vcpu); + + } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + svm_copy_lbrs(vmcb02, vmcb01); } } static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { - const u32 int_ctl_vmcb01_bits = - V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; - - const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; + u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; + u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - /* - * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, - * avic_physical_id. - */ - WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) + int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); + else + int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); /* Copied from vmcb01. msrpm_base can be overwritten later. */ - svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; - svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa; - svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa; + vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; + vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; + vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ - svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* nested_cr3. */ if (nested_npt_enabled(svm)) @@ -605,21 +644,53 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->nested.ctl.tsc_offset, svm->tsc_ratio_msr); - svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; + vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } - svm->vmcb->control.int_ctl = + vmcb02->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | - (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); - - svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; - svm->vmcb->control.int_state = svm->nested.ctl.int_state; - svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; - svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; + (vmcb01->control.int_ctl & int_ctl_vmcb01_bits); + + vmcb02->control.int_vector = svm->nested.ctl.int_vector; + vmcb02->control.int_state = svm->nested.ctl.int_state; + vmcb02->control.event_inj = svm->nested.ctl.event_inj; + vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; + + vmcb02->control.virt_ext = vmcb01->control.virt_ext & + LBR_CTL_ENABLE_MASK; + if (svm->lbrv_enabled) + vmcb02->control.virt_ext |= + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); + + if (!nested_vmcb_needs_vls_intercept(svm)) + vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + if (kvm_pause_in_guest(svm->vcpu.kvm)) { + /* use guest values since host doesn't use them */ + vmcb02->control.pause_filter_count = + svm->pause_filter_enabled ? + svm->nested.ctl.pause_filter_count : 0; + + vmcb02->control.pause_filter_thresh = + svm->pause_threshold_enabled ? + svm->nested.ctl.pause_filter_thresh : 0; + + } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) { + /* use host values when guest doesn't use them */ + vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count; + vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; + } else { + /* + * Intercept every PAUSE otherwise and + * ignore both host and guest values + */ + vmcb02->control.pause_filter_count = 0; + vmcb02->control.pause_filter_thresh = 0; + } nested_svm_transition_tlb_flush(vcpu); @@ -688,6 +759,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, svm_set_gif(svm, true); + if (kvm_vcpu_apicv_active(vcpu)) + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + return 0; } @@ -698,6 +772,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) struct vmcb *vmcb12; struct kvm_host_map map; u64 vmcb12_gpa; + struct vmcb *vmcb01 = svm->vmcb01.ptr; if (!svm->nested.hsave_msr) { kvm_inject_gp(vcpu, 0); @@ -741,14 +816,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) * Since vmcb01 is not in use, we can use it to store some of the L1 * state. */ - svm->vmcb01.ptr->save.efer = vcpu->arch.efer; - svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); - svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; - svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); - svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); + vmcb01->save.efer = vcpu->arch.efer; + vmcb01->save.cr0 = kvm_read_cr0(vcpu); + vmcb01->save.cr4 = vcpu->arch.cr4; + vmcb01->save.rflags = kvm_get_rflags(vcpu); + vmcb01->save.rip = kvm_rip_read(vcpu); if (!npt_enabled) - svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); + vmcb01->save.cr3 = kvm_read_cr3(vcpu); svm->nested.nested_run_pending = 1; @@ -814,14 +889,12 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct vmcb *vmcb12; - struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; int rc; - /* Triple faults in L2 should never escape. */ - WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); - rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) @@ -843,57 +916,68 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Give the current vmcb to the guest */ - vmcb12->save.es = vmcb->save.es; - vmcb12->save.cs = vmcb->save.cs; - vmcb12->save.ss = vmcb->save.ss; - vmcb12->save.ds = vmcb->save.ds; - vmcb12->save.gdtr = vmcb->save.gdtr; - vmcb12->save.idtr = vmcb->save.idtr; + vmcb12->save.es = vmcb02->save.es; + vmcb12->save.cs = vmcb02->save.cs; + vmcb12->save.ss = vmcb02->save.ss; + vmcb12->save.ds = vmcb02->save.ds; + vmcb12->save.gdtr = vmcb02->save.gdtr; + vmcb12->save.idtr = vmcb02->save.idtr; vmcb12->save.efer = svm->vcpu.arch.efer; vmcb12->save.cr0 = kvm_read_cr0(vcpu); vmcb12->save.cr3 = kvm_read_cr3(vcpu); - vmcb12->save.cr2 = vmcb->save.cr2; + vmcb12->save.cr2 = vmcb02->save.cr2; vmcb12->save.cr4 = svm->vcpu.arch.cr4; vmcb12->save.rflags = kvm_get_rflags(vcpu); vmcb12->save.rip = kvm_rip_read(vcpu); vmcb12->save.rsp = kvm_rsp_read(vcpu); vmcb12->save.rax = kvm_rax_read(vcpu); - vmcb12->save.dr7 = vmcb->save.dr7; + vmcb12->save.dr7 = vmcb02->save.dr7; vmcb12->save.dr6 = svm->vcpu.arch.dr6; - vmcb12->save.cpl = vmcb->save.cpl; + vmcb12->save.cpl = vmcb02->save.cpl; - vmcb12->control.int_state = vmcb->control.int_state; - vmcb12->control.exit_code = vmcb->control.exit_code; - vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; - vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; - vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; + vmcb12->control.int_state = vmcb02->control.int_state; + vmcb12->control.exit_code = vmcb02->control.exit_code; + vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; + vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; + vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); if (svm->nrips_enabled) - vmcb12->control.next_rip = vmcb->control.next_rip; + vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; + if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count) + vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + svm_copy_lbrs(vmcb12, vmcb02); + svm_update_lbrv(vcpu); + } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + svm_copy_lbrs(vmcb01, vmcb02); + svm_update_lbrv(vcpu); + } + /* * On vmexit the GIF is set to false and * no event can be injected in L1. */ svm_set_gif(svm, false); - svm->vmcb->control.exit_int_info = 0; + vmcb01->control.exit_int_info = 0; svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; - if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) { - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; - vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) { + vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset; + vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { @@ -907,13 +991,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* * Restore processor state that had been saved in vmcb01 */ - kvm_set_rflags(vcpu, svm->vmcb->save.rflags); - svm_set_efer(vcpu, svm->vmcb->save.efer); - svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); - svm_set_cr4(vcpu, svm->vmcb->save.cr4); - kvm_rax_write(vcpu, svm->vmcb->save.rax); - kvm_rsp_write(vcpu, svm->vmcb->save.rsp); - kvm_rip_write(vcpu, svm->vmcb->save.rip); + kvm_set_rflags(vcpu, vmcb01->save.rflags); + svm_set_efer(vcpu, vmcb01->save.efer); + svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE); + svm_set_cr4(vcpu, vmcb01->save.cr4); + kvm_rax_write(vcpu, vmcb01->save.rax); + kvm_rsp_write(vcpu, vmcb01->save.rsp); + kvm_rip_write(vcpu, vmcb01->save.rip); svm->vcpu.arch.dr7 = DR7_FIXED_1; kvm_update_dr7(&svm->vcpu); @@ -931,7 +1015,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true); + rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true); if (rc) return 1; @@ -949,9 +1033,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm) * right now so that it an be accounted for before we execute * L1's next instruction. */ - if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF)) + if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF)) kvm_queue_exception(&(svm->vcpu), DB_VECTOR); + /* + * Un-inhibit the AVIC right away, so that other vCPUs can start + * to benefit from it right away. + */ + if (kvm_apicv_activated(vcpu->kvm)) + kvm_vcpu_update_apicv(vcpu); + return 0; } @@ -1162,12 +1253,13 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm) static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) { unsigned int nr = svm->vcpu.arch.exception.nr; + struct vmcb *vmcb = svm->vmcb; - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; + vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + vmcb->control.exit_code_hi = 0; if (svm->vcpu.arch.exception.has_error_code) - svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; + vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; /* * EXITINFO2 is undefined for all exception intercepts other @@ -1175,11 +1267,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) */ if (nr == PF_VECTOR) { if (svm->vcpu.arch.exception.nested_apf) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; else if (svm->vcpu.arch.exception.has_payload) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; else - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; } else if (nr == DB_VECTOR) { /* See inject_pending_event. */ kvm_deliver_exception_payload(&svm->vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 24eb935b6f85..57ab4739eb19 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -319,7 +319,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } } -struct kvm_pmu_ops amd_pmu_ops = { +struct kvm_pmu_ops amd_pmu_ops __initdata = { .pmc_perf_hw_id = amd_pmc_perf_hw_id, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 537aaddc852f..9ce162119b23 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2738,8 +2738,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - ret = -EINVAL; - break; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } default: /* Error, keep GHCB MSR value as-is */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bd4c64b362d2..22bbd69495ad 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -62,8 +62,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) - static bool erratum_383_found __read_mostly; u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; @@ -172,7 +170,7 @@ static int vls = true; module_param(vls, int, 0444); /* enable/disable Virtual GIF */ -static int vgif = true; +int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ @@ -189,6 +187,9 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +static bool force_avic; +module_param_unsafe(force_avic, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -790,6 +791,17 @@ static void init_msrpm_offsets(void) } } +void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) +{ + to_vmcb->save.dbgctl = from_vmcb->save.dbgctl; + to_vmcb->save.br_from = from_vmcb->save.br_from; + to_vmcb->save.br_to = from_vmcb->save.br_to; + to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from; + to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to; + + vmcb_mark_dirty(to_vmcb, VMCB_LBR); +} + static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -799,6 +811,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ + if (is_guest_mode(vcpu)) + svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); } static void svm_disable_lbrv(struct kvm_vcpu *vcpu) @@ -810,6 +826,67 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + + /* + * Move the LBR msrs back to the vmcb01 to avoid copying them + * on nested guest entries. + */ + if (is_guest_mode(vcpu)) + svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); +} + +static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index) +{ + /* + * If the LBR virtualization is disabled, the LBR msrs are always + * kept in the vmcb01 to avoid copying them on nested guest entries. + * + * If nested, and the LBR virtualization is enabled/disabled, the msrs + * are moved between the vmcb01 and vmcb02 as needed. + */ + struct vmcb *vmcb = + (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ? + svm->vmcb : svm->vmcb01.ptr; + + switch (index) { + case MSR_IA32_DEBUGCTLMSR: + return vmcb->save.dbgctl; + case MSR_IA32_LASTBRANCHFROMIP: + return vmcb->save.br_from; + case MSR_IA32_LASTBRANCHTOIP: + return vmcb->save.br_to; + case MSR_IA32_LASTINTFROMIP: + return vmcb->save.last_excp_from; + case MSR_IA32_LASTINTTOIP: + return vmcb->save.last_excp_to; + default: + KVM_BUG(false, svm->vcpu.kvm, + "%s: Unknown MSR 0x%x", __func__, index); + return 0; + } +} + +void svm_update_lbrv(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) & + DEBUGCTLMSR_LBR; + + bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & + LBR_CTL_ENABLE_MASK); + + if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled)) + if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)) + enable_lbrv = true; + + if (enable_lbrv == current_enable_lbrv) + return; + + if (enable_lbrv) + svm_enable_lbrv(vcpu); + else + svm_disable_lbrv(vcpu); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -831,6 +908,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; + if (kvm_pause_in_guest(vcpu->kvm) || !old) + return; + control->pause_filter_count = __grow_ple_window(old, pause_filter_count, pause_filter_count_grow, @@ -849,6 +929,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; + if (kvm_pause_in_guest(vcpu->kvm) || !old) + return; + control->pause_filter_count = __shrink_ple_window(old, pause_filter_count, @@ -960,6 +1043,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); + + svm->v_vmload_vmsave_enabled = false; } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -979,8 +1064,9 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - struct vmcb_save_area *save = &svm->vmcb->save; + struct vmcb *vmcb = svm->vmcb01.ptr; + struct vmcb_control_area *control = &vmcb->control; + struct vmcb_save_area *save = &vmcb->save; svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); @@ -1104,7 +1190,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); if (kvm_vcpu_apicv_active(vcpu)) - avic_init_vmcb(svm); + avic_init_vmcb(svm, vmcb); if (vgif) { svm_clr_intercept(svm, INTERCEPT_STGI); @@ -1122,10 +1208,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } } - svm_hv_init_vmcb(svm->vmcb); + svm_hv_init_vmcb(vmcb); init_vmcb_after_set_cpuid(vcpu); - vmcb_mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(vmcb); enable_gif(svm); } @@ -1380,7 +1466,7 @@ static void svm_set_vintr(struct vcpu_svm *svm) /* * The following fields are ignored when AVIC is enabled */ - WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu)); svm_set_intercept(svm, INTERCEPT_VINTR); @@ -2142,7 +2228,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * Likewise, clear the VINTR intercept, we will set it * again while processing KVM_REQ_EVENT if needed. */ - if (vgif_enabled(svm)) + if (vgif) svm_clr_intercept(svm, INTERCEPT_STGI); if (svm_is_intercept(svm, INTERCEPT_VINTR)) svm_clear_vintr(svm); @@ -2160,7 +2246,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * in use, we still rely on the VINTR intercept (rather than * STGI) to detect an open interrupt window. */ - if (!vgif_enabled(svm)) + if (!vgif) svm_clear_vintr(svm); } } @@ -2575,25 +2661,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_TSC_AUX: msr_info->data = svm->tsc_aux; break; - /* - * Nobody will change the following 5 values in the VMCB so we can - * safely return them on rdmsr. They will always be 0 until LBRV is - * implemented. - */ case MSR_IA32_DEBUGCTLMSR: - msr_info->data = svm->vmcb->save.dbgctl; - break; case MSR_IA32_LASTBRANCHFROMIP: - msr_info->data = svm->vmcb->save.br_from; - break; case MSR_IA32_LASTBRANCHTOIP: - msr_info->data = svm->vmcb->save.br_to; - break; case MSR_IA32_LASTINTFROMIP: - msr_info->data = svm->vmcb->save.last_excp_from; - break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm->vmcb->save.last_excp_to; + msr_info->data = svm_get_lbr_msr(svm, msr_info->index); break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -2839,12 +2912,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - svm->vmcb->save.dbgctl = data; - vmcb_mark_dirty(svm->vmcb, VMCB_LBR); - if (data & (1ULL<<0)) - svm_enable_lbrv(vcpu); + if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) + svm->vmcb->save.dbgctl = data; else - svm_disable_lbrv(vcpu); + svm->vmcb01.ptr->save.dbgctl = data; + + svm_update_lbrv(vcpu); + break; case MSR_VM_HSAVE_PA: /* @@ -2901,9 +2975,16 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) svm_clear_vintr(to_svm(vcpu)); /* - * For AVIC, the only reason to end up here is ExtINTs. + * If not running nested, for AVIC, the only reason to end up here is ExtINTs. * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. + * + * If running nested, still remove the VM wide AVIC inhibit to + * support case in which the interrupt window was requested when the + * vCPU was not running nested. + + * All vCPUs which run still run nested, will remain to have their + * AVIC still inhibited due to per-cpu AVIC inhibition. */ kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); @@ -2914,7 +2995,6 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) static int pause_interception(struct kvm_vcpu *vcpu) { bool in_kernel; - /* * CPL is not made available for an SEV-ES guest, therefore * vcpu->arch.preempted_in_kernel can never be true. Just @@ -2922,8 +3002,7 @@ static int pause_interception(struct kvm_vcpu *vcpu) */ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; - if (!kvm_pause_in_guest(vcpu->kvm)) - grow_ple_window(vcpu); + grow_ple_window(vcpu); kvm_vcpu_on_spin(vcpu, in_kernel); return kvm_skip_emulated_instruction(vcpu); @@ -3496,14 +3575,20 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * enabled, the STGI interception will not occur. Enable the irq * window under the assumption that the hardware will set the GIF. */ - if (vgif_enabled(svm) || gif_set(svm)) { + if (vgif || gif_set(svm)) { /* * IRQ window is not needed when AVIC is enabled, * unless we have pending ExtINT since it cannot be injected - * via AVIC. In such case, we need to temporarily disable AVIC, + * via AVIC. In such case, KVM needs to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. + * + * If running nested, AVIC is already locally inhibited + * on this vCPU, therefore there is no need to request + * the VM wide AVIC inhibition. */ - kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); + if (!is_guest_mode(vcpu)) + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); + svm_set_vintr(svm); } } @@ -3516,7 +3601,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { - if (vgif_enabled(svm)) + if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); return; /* STGI will cause a vm exit */ } @@ -3946,6 +4031,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); + svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); + + svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); + + svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && + guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); + + svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) && + guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD); + + svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); svm_recalc_instruction_intercepts(vcpu, svm); @@ -3963,13 +4059,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) */ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); - - /* - * Currently, AVIC does not work with nested virtualization. - * So, we disable AVIC when cpuid for SVM is set in the L1 guest. - */ - if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED); } init_vmcb_after_set_cpuid(vcpu); } @@ -4224,7 +4313,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - ret = nested_svm_vmexit(svm); + ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW); if (ret) return ret; @@ -4321,7 +4410,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!gif_set(svm)) { - if (vgif_enabled(svm)) + if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ } else { @@ -4605,7 +4694,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .sched_in = svm_sched_in, - .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, @@ -4632,6 +4720,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, + .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, }; /* @@ -4695,6 +4784,20 @@ static __init void svm_set_cpu_caps(void) if (tsc_scaling) kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + if (vls) + kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD); + if (lbrv) + kvm_cpu_cap_set(X86_FEATURE_LBRV); + + if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) + kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER); + + if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) + kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD); + + if (vgif) + kvm_cpu_cap_set(X86_FEATURE_VGIF); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } @@ -4806,15 +4909,20 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); + enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic); if (enable_apicv) { - pr_info("AVIC enabled\n"); + if (!boot_cpu_has(X86_FEATURE_AVIC)) { + pr_warn("AVIC is not supported in CPUID but force enabled"); + pr_warn("Your system might crash and burn"); + } else + pr_info("AVIC enabled\n"); amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } else { svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; + svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; } if (vls) { @@ -4879,6 +4987,7 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, + .pmu_ops = &amd_pmu_ops, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f77a7d2d39dd..e246793cbeae 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -33,6 +33,7 @@ #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +extern int vgif; extern bool intercept_smi; /* @@ -231,9 +232,14 @@ struct vcpu_svm { unsigned int3_injected; unsigned long int3_rip; - /* cached guest cpuid flags for faster access */ + /* optional nested SVM features that are enabled for this guest */ bool nrips_enabled : 1; bool tsc_scaling_enabled : 1; + bool v_vmload_vmsave_enabled : 1; + bool lbrv_enabled : 1; + bool pause_filter_enabled : 1; + bool pause_threshold_enabled : 1; + bool vgif_enabled : 1; u32 ldr_reg; u32 dfr_reg; @@ -452,44 +458,70 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) return vmcb_is_intercept(&svm->vmcb->control, bit); } -static inline bool vgif_enabled(struct vcpu_svm *svm) +static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); + return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); +} + +static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm) +{ + if (!vgif) + return NULL; + + if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm)) + return svm->nested.vmcb02.ptr; + else + return svm->vmcb01.ptr; } static inline void enable_gif(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - svm->vmcb->control.int_ctl |= V_GIF_MASK; + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + vmcb->control.int_ctl |= V_GIF_MASK; else svm->vcpu.arch.hflags |= HF_GIF_MASK; } static inline void disable_gif(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - svm->vmcb->control.int_ctl &= ~V_GIF_MASK; + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + vmcb->control.int_ctl &= ~V_GIF_MASK; else svm->vcpu.arch.hflags &= ~HF_GIF_MASK; } static inline bool gif_set(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - return !!(svm->vmcb->control.int_ctl & V_GIF_MASK); + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + return !!(vmcb->control.int_ctl & V_GIF_MASK); else return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); } +static inline bool nested_npt_enabled(struct vcpu_svm *svm) +{ + return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; +} + /* svm.c */ #define MSR_INVALID 0xffffffffU +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + extern bool dump_invalid_vmcb; u32 svm_msrpm_offset(u32 msr); u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm); +void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); +void svm_update_lbrv(struct kvm_vcpu *vcpu); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); @@ -574,7 +606,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); -void avic_init_vmcb(struct vcpu_svm *svm); +void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); @@ -592,6 +624,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); +unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); /* sev.c */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f18744f7ff82..838ac7ab5950 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3695,12 +3695,34 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) + struct vmcs12 *vmcs12, + u32 vm_exit_reason, u32 exit_intr_info) { u32 idt_vectoring; unsigned int nr; - if (vcpu->arch.exception.injected) { + /* + * Per the SDM, VM-Exits due to double and triple faults are never + * considered to occur during event delivery, even if the double/triple + * fault is the result of an escalating vectoring issue. + * + * Note, the SDM qualifies the double fault behavior with "The original + * event results in a double-fault exception". It's unclear why the + * qualification exists since exits due to double fault can occur only + * while vectoring a different exception (injected events are never + * subject to interception), i.e. there's _always_ an original event. + * + * The SDM also uses NMI as a confusing example for the "original event + * causes the VM exit directly" clause. NMI isn't special in any way, + * the same rule applies to all events that cause an exit directly. + * NMI is an odd choice for the example because NMIs can only occur on + * instruction boundaries, i.e. they _can't_ occur during vectoring. + */ + if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || + ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && + is_double_fault(exit_intr_info))) { + vmcs12->idt_vectoring_info_field = 0; + } else if (vcpu->arch.exception.injected) { nr = vcpu->arch.exception.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -3733,6 +3755,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, idt_vectoring |= INTR_TYPE_EXT_INTR; vmcs12->idt_vectoring_info_field = idt_vectoring; + } else { + vmcs12->idt_vectoring_info_field = 0; } } @@ -4202,12 +4226,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (to_vmx(vcpu)->exit_reason.enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = exit_intr_info; - - vmcs12->idt_vectoring_info_field = 0; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + /* + * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched + * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other + * exit info fields are unmodified. + */ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { vmcs12->launch_state = 1; @@ -4219,7 +4243,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Transfer the event that L0 or L1 may wanted to inject into * L2 to IDT_VECTORING_INFO_FIELD. */ - vmcs12_save_pending_event(vcpu, vmcs12); + vmcs12_save_pending_event(vcpu, vmcs12, + vm_exit_reason, exit_intr_info); + + vmcs12->vm_exit_intr_info = exit_intr_info; + vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* * According to spec, there's no need to store the guest's @@ -4518,9 +4547,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); - /* Similarly, triple faults in L2 should never escape. */ - WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); - if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { /* * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index bc3f8512bb64..9db662399487 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -723,7 +723,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } -struct kvm_pmu_ops intel_pmu_ops = { +struct kvm_pmu_ops intel_pmu_ops __initdata = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index e325c290a816..2b9d7a7e83f7 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -104,6 +104,11 @@ static inline bool is_breakpoint(u32 intr_info) return is_exception_n(intr_info, BP_VECTOR); } +static inline bool is_double_fault(u32 intr_info) +{ + return is_exception_n(intr_info, DF_VECTOR); +} + static inline bool is_page_fault(u32 intr_info) { return is_exception_n(intr_info, PF_VECTOR); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 04d170c4b61e..df0b70ccd289 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2444,7 +2444,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_cpu_based_exec_control) < 0) return -EIO; #ifdef CONFIG_X86_64 - if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & ~CPU_BASED_CR8_STORE_EXITING; #endif @@ -4380,7 +4380,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); - if (kvm_vcpu_apicv_active(&vmx->vcpu)) { + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -5405,9 +5405,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ - error_code |= (exit_qualification & - (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | - EPT_VIOLATION_EXECUTABLE)) + error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) ? PFERR_PRESENT_MASK : 0; error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? @@ -7818,7 +7816,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, .pi_update_irte = vmx_pi_update_irte, @@ -8072,6 +8069,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, + .pmu_ops = &intel_pmu_ops, }; static void vmx_cleanup_l1d_flush(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 07d789b1d366..1bc67995cc8a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -961,11 +961,13 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (static_cpu_has(X86_FEATURE_PKU) && - (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || - (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && - vcpu->arch.pkru != vcpu->arch.host_pkru) + vcpu->arch.pkru != vcpu->arch.host_pkru && + ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || + kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) write_pkru(vcpu->arch.pkru); +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -974,13 +976,15 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return; +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (static_cpu_has(X86_FEATURE_PKU) && - (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || - (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { + ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || + kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { @@ -2249,14 +2253,13 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ - vcpu->arch.pv_time_enabled = false; - if (!(system_time & 1)) - return; - - if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, system_time & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = true; + if (system_time & 1) { + kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu, + KVM_HOST_USES_PFN, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); + } else { + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + } return; } @@ -2961,63 +2964,55 @@ u64 get_kvmclock_ns(struct kvm *kvm) return data.clock; } -static void kvm_setup_pvclock_page(struct kvm_vcpu *v, - struct gfn_to_hva_cache *cache, - unsigned int offset) +static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) { struct kvm_vcpu_arch *vcpu = &v->arch; - struct pvclock_vcpu_time_info guest_hv_clock; + struct pvclock_vcpu_time_info *guest_hv_clock; + unsigned long flags; - if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, - &guest_hv_clock, offset, sizeof(guest_hv_clock)))) - return; + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + offset + sizeof(*guest_hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); + + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, + offset + sizeof(*guest_hv_clock))) + return; + + read_lock_irqsave(&gpc->lock, flags); + } - /* This VCPU is paused, but it's legal for a guest to read another + guest_hv_clock = (void *)(gpc->khva + offset); + + /* + * This VCPU is paused, but it's legal for a guest to read another * VCPU's kvmclock, so we really have to follow the specification where * it says that version is odd if data is being modified, and even after * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - if (guest_hv_clock.version & 1) - ++guest_hv_clock.version; /* first time write, random junk */ - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock.version)); + guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); if (vcpu->pvclock_set_guest_stopped_request) { vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->pvclock_set_guest_stopped_request = false; } - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); + smp_wmb(); - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock)); + guest_hv_clock->version = ++vcpu->hv_clock.version; - smp_wmb(); + mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + read_unlock_irqrestore(&gpc->lock, flags); - vcpu->hv_clock.version++; - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock.version)); + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -3106,13 +3101,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; - if (vcpu->pv_time_enabled) - kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); - if (vcpu->xen.vcpu_info_set) - kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time)); - if (vcpu->xen.vcpu_time_info_set) - kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); + if (vcpu->pv_time.active) + kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0); + if (vcpu->xen.vcpu_info_cache.active) + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (vcpu->xen.vcpu_time_info_cache.active) + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0); kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -3300,7 +3295,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - vcpu->arch.pv_time_enabled = false; + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -4284,7 +4279,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO | - KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL; + KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | + KVM_XEN_HVM_CONFIG_EVTCHN_SEND; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE; break; @@ -4331,6 +4327,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: + case KVM_CAP_VM_TSC_CONTROL: r = kvm_has_tsc_control; break; case KVM_CAP_X2APIC_API: @@ -5102,7 +5099,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.pv_time_enabled) + if (!vcpu->arch.pv_time.active) return -EINVAL; vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -6186,7 +6183,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { - if (!vcpu->arch.pv_time_enabled) + if (!vcpu->arch.pv_time.active) continue; ret = kvm_set_guest_paused(vcpu); @@ -6513,6 +6510,15 @@ set_pit2_out: r = kvm_xen_hvm_set_attr(kvm, &xha); break; } + case KVM_XEN_HVM_EVTCHN_SEND: { + struct kvm_irq_routing_xen_evtchn uxe; + + r = -EFAULT; + if (copy_from_user(&uxe, argp, sizeof(uxe))) + goto out; + r = kvm_xen_hvm_evtchn_send(kvm, &uxe); + break; + } #endif case KVM_SET_CLOCK: r = kvm_vm_ioctl_set_clock(kvm, argp); @@ -6520,6 +6526,28 @@ set_pit2_out: case KVM_GET_CLOCK: r = kvm_vm_ioctl_get_clock(kvm, argp); break; + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; + + r = -EINVAL; + user_tsc_khz = (u32)arg; + + if (kvm_has_tsc_control && + user_tsc_khz >= kvm_max_guest_tsc_khz) + goto out; + + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); + r = 0; + + goto out; + } + case KVM_GET_TSC_KHZ: { + r = READ_ONCE(kvm->arch.default_tsc_khz); + goto out; + } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (!kvm_x86_ops.mem_enc_ioctl) @@ -7229,15 +7257,8 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, exception, &write_emultor); } -#define CMPXCHG_TYPE(t, ptr, old, new) \ - (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) - -#ifdef CONFIG_X86_64 -# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) -#else -# define CMPXCHG64(ptr, old, new) \ - (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) -#endif +#define emulator_try_cmpxchg_user(t, ptr, old, new) \ + (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t)) static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, @@ -7246,12 +7267,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned int bytes, struct x86_exception *exception) { - struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u64 page_line_mask; + unsigned long hva; gpa_t gpa; - char *kaddr; - bool exchanged; + int r; /* guests cmpxchg8b have to be emulated atomically */ if (bytes > 8 || (bytes & (bytes - 1))) @@ -7275,31 +7295,32 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) goto emul_write; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) + hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(addr)) goto emul_write; - kaddr = map.hva + offset_in_page(gpa); + hva += offset_in_page(gpa); switch (bytes) { case 1: - exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); + r = emulator_try_cmpxchg_user(u8, hva, old, new); break; case 2: - exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); + r = emulator_try_cmpxchg_user(u16, hva, old, new); break; case 4: - exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); + r = emulator_try_cmpxchg_user(u32, hva, old, new); break; case 8: - exchanged = CMPXCHG64(kaddr, old, new); + r = emulator_try_cmpxchg_user(u64, hva, old, new); break; default: BUG(); } - kvm_vcpu_unmap(vcpu, &map, true); - - if (!exchanged) + if (r < 0) + return X86EMUL_UNHANDLEABLE; + if (r) return X86EMUL_CMPXCHG_FAILED; kvm_page_track_write(vcpu, gpa, new, bytes); @@ -8789,22 +8810,22 @@ static int kvmclock_cpu_online(unsigned int cpu) static void kvm_timer_init(void) { - max_tsc_khz = tsc_khz; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy *policy; - int cpu; - - cpu = get_cpu(); - policy = cpufreq_cpu_get(cpu); - if (policy) { - if (policy->cpuinfo.max_freq) - max_tsc_khz = policy->cpuinfo.max_freq; - cpufreq_cpu_put(policy); + max_tsc_khz = tsc_khz; + + if (IS_ENABLED(CONFIG_CPU_FREQ)) { + struct cpufreq_policy *policy; + int cpu; + + cpu = get_cpu(); + policy = cpufreq_cpu_get(cpu); + if (policy) { + if (policy->cpuinfo.max_freq) + max_tsc_khz = policy->cpuinfo.max_freq; + cpufreq_cpu_put(policy); + } + put_cpu(); } - put_cpu(); -#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } @@ -9089,6 +9110,14 @@ bool kvm_apicv_activated(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_apicv_activated); +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) +{ + ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons); + ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu); + + return (vm_reasons | vcpu_reasons) == 0; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated); static void set_or_clear_apicv_inhibit(unsigned long *inhibits, enum kvm_apicv_inhibit reason, bool set) @@ -9266,6 +9295,17 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); + /* + * If the quirk is disabled, synthesize a #UD and let the guest pick up + * the pieces. + */ + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) { + ctxt->exception.error_code_valid = false; + ctxt->exception.vector = UD_VECTOR; + ctxt->have_exception = true; + return X86EMUL_PROPAGATE_FAULT; + } + static_call(kvm_x86_patch_hypercall)(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, @@ -9763,7 +9803,8 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) down_read(&vcpu->kvm->arch.apicv_update_lock); - activate = kvm_apicv_activated(vcpu->kvm); + activate = kvm_vcpu_apicv_activated(vcpu); + if (vcpu->arch.apicv_active == activate) goto out; @@ -10166,7 +10207,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * per-VM state, and responsing vCPUs must wait for the update * to complete before servicing KVM_REQ_APICV_UPDATE. */ - WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)); exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) @@ -10364,6 +10405,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu) break; kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); + if (kvm_xen_has_pending_events(vcpu)) + kvm_xen_inject_pending_events(vcpu); + if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -11249,9 +11293,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + kvm_xen_init_vcpu(vcpu); kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); - kvm_set_tsc_khz(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu); vcpu_put(vcpu); @@ -11306,6 +11351,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); + kvm_xen_destroy_vcpu(vcpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); @@ -11567,6 +11613,24 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include <asm/kvm-x86-ops.h> +#undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->pmu_ops); +} + int kvm_arch_hardware_setup(void *opaque) { struct kvm_x86_init_ops *ops = opaque; @@ -11581,8 +11645,7 @@ int kvm_arch_hardware_setup(void *opaque) if (r != 0) return r; - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - kvm_ops_static_call_update(); + kvm_ops_update(ops); kvm_register_perf_callbacks(ops->handle_intel_pt_intr); @@ -11698,6 +11761,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz; kvm->arch.guest_can_read_msr_platform_info = true; kvm->arch.enable_pmu = enable_pmu; @@ -12179,6 +12243,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) return true; + if (kvm_xen_has_pending_events(vcpu)) + return true; + return false; } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index bf6cc25eee76..610beba35907 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -9,17 +9,25 @@ #include "x86.h" #include "xen.h" #include "hyperv.h" +#include "lapic.h" +#include <linux/eventfd.h> #include <linux/kvm_host.h> #include <linux/sched/stat.h> #include <trace/events/kvm.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> +#include <xen/interface/version.h> #include <xen/interface/event_channel.h> +#include <xen/interface/sched.h> #include "trace.h" +static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm); +static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); + DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) @@ -102,6 +110,66 @@ out: return ret; } +void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) +{ + if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) { + struct kvm_xen_evtchn e; + + e.vcpu_id = vcpu->vcpu_id; + e.vcpu_idx = vcpu->vcpu_idx; + e.port = vcpu->arch.xen.timer_virq; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + kvm_xen_set_evtchn(&e, vcpu->kvm); + + vcpu->arch.xen.timer_expires = 0; + atomic_set(&vcpu->arch.xen.timer_pending, 0); + } +} + +static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, + arch.xen.timer); + if (atomic_read(&vcpu->arch.xen.timer_pending)) + return HRTIMER_NORESTART; + + atomic_inc(&vcpu->arch.xen.timer_pending); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); + + return HRTIMER_NORESTART; +} + +static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) +{ + atomic_set(&vcpu->arch.xen.timer_pending, 0); + vcpu->arch.xen.timer_expires = guest_abs; + + if (delta_ns <= 0) { + xen_timer_callback(&vcpu->arch.xen.timer); + } else { + ktime_t ktime_now = ktime_get(); + hrtimer_start(&vcpu->arch.xen.timer, + ktime_add_ns(ktime_now, delta_ns), + HRTIMER_MODE_ABS_HARD); + } +} + +static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) +{ + hrtimer_cancel(&vcpu->arch.xen.timer); + vcpu->arch.xen.timer_expires = 0; + atomic_set(&vcpu->arch.xen.timer_pending, 0); +} + +static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) +{ + hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); + vcpu->arch.xen.timer.function = xen_timer_callback; +} + static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; @@ -133,27 +201,36 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; - struct gfn_to_hva_cache *ghc = &vx->runstate_cache; - struct kvm_memslots *slots = kvm_memslots(v->kvm); - bool atomic = (state == RUNSTATE_runnable); - uint64_t state_entry_time; - int __user *user_state; - uint64_t __user *user_times; + struct gfn_to_pfn_cache *gpc = &vx->runstate_cache; + uint64_t *user_times; + unsigned long flags; + size_t user_len; + int *user_state; kvm_xen_update_runstate(v, state); - if (!vx->runstate_set) + if (!vx->runstate_cache.active) return; - if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) && - kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len)) - return; + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) + user_len = sizeof(struct vcpu_runstate_info); + else + user_len = sizeof(struct compat_vcpu_runstate_info); - /* We made sure it fits in a single page */ - BUG_ON(!ghc->memslot); + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + user_len)) { + read_unlock_irqrestore(&gpc->lock, flags); - if (atomic) - pagefault_disable(); + /* When invoked from kvm_sched_out() we cannot sleep */ + if (state == RUNSTATE_runnable) + return; + + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len)) + return; + + read_lock_irqsave(&gpc->lock, flags); + } /* * The only difference between 32-bit and 64-bit versions of the @@ -167,38 +244,33 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); - user_state = (int __user *)ghc->hva; - BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); - - user_times = (uint64_t __user *)(ghc->hva + - offsetof(struct compat_vcpu_runstate_info, - state_entry_time)); #ifdef CONFIG_X86_64 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); - - if (v->kvm->arch.xen.long_mode) - user_times = (uint64_t __user *)(ghc->hva + - offsetof(struct vcpu_runstate_info, - state_entry_time)); #endif + + user_state = gpc->khva; + + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) + user_times = gpc->khva + offsetof(struct vcpu_runstate_info, + state_entry_time); + else + user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info, + state_entry_time); + /* * First write the updated state_entry_time at the appropriate * location determined by 'offset'. */ - state_entry_time = vx->runstate_entry_time; - state_entry_time |= XEN_RUNSTATE_UPDATE; - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != - sizeof(state_entry_time)); + sizeof(user_times[0])); BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != - sizeof(state_entry_time)); + sizeof(user_times[0])); - if (__put_user(state_entry_time, user_times)) - goto out; + user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; smp_wmb(); /* @@ -212,8 +284,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - if (__put_user(vx->current_runstate, user_state)) - goto out; + *user_state = vx->current_runstate; /* * Write the actual runstate times immediately after the @@ -228,42 +299,114 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times))) - goto out; + memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); smp_wmb(); /* * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's * runstate_entry_time field. */ - state_entry_time &= ~XEN_RUNSTATE_UPDATE; - __put_user(state_entry_time, user_times); + user_times[0] &= ~XEN_RUNSTATE_UPDATE; smp_wmb(); - out: - mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + read_unlock_irqrestore(&gpc->lock, flags); - if (atomic) - pagefault_enable(); + mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); } -int __kvm_xen_has_interrupt(struct kvm_vcpu *v) +static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) +{ + struct kvm_lapic_irq irq = { }; + int r; + + irq.dest_id = v->vcpu_id; + irq.vector = v->arch.xen.upcall_vector; + irq.dest_mode = APIC_DEST_PHYSICAL; + irq.shorthand = APIC_DEST_NOSHORT; + irq.delivery_mode = APIC_DM_FIXED; + irq.level = 1; + + /* The fast version will always work for physical unicast */ + WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL)); +} + +/* + * On event channel delivery, the vcpu_info may not have been accessible. + * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which + * need to be marked into the vcpu_info (and evtchn_upcall_pending set). + * Do so now that we can sleep in the context of the vCPU to bring the + * page in, and refresh the pfn cache for it. + */ +void kvm_xen_inject_pending_events(struct kvm_vcpu *v) { unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); - bool atomic = in_atomic() || !task_is_running(current); - int err; + struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; + unsigned long flags; + + if (!evtchn_pending_sel) + return; + + /* + * Yes, this is an open-coded loop. But that's just what put_user() + * does anyway. Page it in and retry the instruction. We're just a + * little more honest about it. + */ + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { + read_unlock_irqrestore(&gpc->lock, flags); + + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) + return; + + read_lock_irqsave(&gpc->lock, flags); + } + + /* Now gpc->khva is a valid kernel address for the vcpu_info */ + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { + struct vcpu_info *vi = gpc->khva; + + asm volatile(LOCK_PREFIX "orq %0, %1\n" + "notq %0\n" + LOCK_PREFIX "andq %0, %2\n" + : "=r" (evtchn_pending_sel), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel)); + WRITE_ONCE(vi->evtchn_upcall_pending, 1); + } else { + u32 evtchn_pending_sel32 = evtchn_pending_sel; + struct compat_vcpu_info *vi = gpc->khva; + + asm volatile(LOCK_PREFIX "orl %0, %1\n" + "notl %0\n" + LOCK_PREFIX "andl %0, %2\n" + : "=r" (evtchn_pending_sel32), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel32)); + WRITE_ONCE(vi->evtchn_upcall_pending, 1); + } + read_unlock_irqrestore(&gpc->lock, flags); + + /* For the per-vCPU lapic vector, deliver it as MSI. */ + if (v->arch.xen.upcall_vector) + kvm_xen_inject_vcpu_vector(v); + + mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); +} + +int __kvm_xen_has_interrupt(struct kvm_vcpu *v) +{ + struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; + unsigned long flags; u8 rc = 0; /* * If the global upcall vector (HVMIRQ_callback_vector) is set and * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending. */ - struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; - struct kvm_memslots *slots = kvm_memslots(v->kvm); - bool ghc_valid = slots->generation == ghc->generation && - !kvm_is_error_hva(ghc->hva) && ghc->memslot; - - unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); /* No need for compat handling here */ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != @@ -273,101 +416,35 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) BUILD_BUG_ON(sizeof(rc) != sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); - /* - * For efficiency, this mirrors the checks for using the valid - * cache in kvm_read_guest_offset_cached(), but just uses - * __get_user() instead. And falls back to the slow path. - */ - if (!evtchn_pending_sel && ghc_valid) { - /* Fast path */ - pagefault_disable(); - err = __get_user(rc, (u8 __user *)ghc->hva + offset); - pagefault_enable(); - if (!err) - return rc; - } - - /* Slow path */ + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { + read_unlock_irqrestore(&gpc->lock, flags); - /* - * This function gets called from kvm_vcpu_block() after setting the - * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately - * from a HLT. So we really mustn't sleep. If the page ended up absent - * at that point, just return 1 in order to trigger an immediate wake, - * and we'll end up getting called again from a context where we *can* - * fault in the page and wait for it. - */ - if (atomic) - return 1; + /* + * This function gets called from kvm_vcpu_block() after setting the + * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately + * from a HLT. So we really mustn't sleep. If the page ended up absent + * at that point, just return 1 in order to trigger an immediate wake, + * and we'll end up getting called again from a context where we *can* + * fault in the page and wait for it. + */ + if (in_atomic() || !task_is_running(current)) + return 1; - if (!ghc_valid) { - err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len); - if (err || !ghc->memslot) { + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. */ return 0; } + read_lock_irqsave(&gpc->lock, flags); } - /* - * Now we have a valid (protected by srcu) userspace HVA in - * ghc->hva which points to the struct vcpu_info. If there - * are any bits in the in-kernel evtchn_pending_sel then - * we need to write those to the guest vcpu_info and set - * its evtchn_upcall_pending flag. If there aren't any bits - * to add, we only want to *check* evtchn_upcall_pending. - */ - if (evtchn_pending_sel) { - bool long_mode = v->kvm->arch.xen.long_mode; - - if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info))) - return 0; - - if (IS_ENABLED(CONFIG_64BIT) && long_mode) { - struct vcpu_info __user *vi = (void __user *)ghc->hva; - - /* Attempt to set the evtchn_pending_sel bits in the - * guest, and if that succeeds then clear the same - * bits in the in-kernel version. */ - asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n" - "\tnotq %0\n" - "\t" LOCK_PREFIX "andq %0, %2\n" - "2:\n" - _ASM_EXTABLE_UA(1b, 2b) - : "=r" (evtchn_pending_sel), - "+m" (vi->evtchn_pending_sel), - "+m" (v->arch.xen.evtchn_pending_sel) - : "0" (evtchn_pending_sel)); - } else { - struct compat_vcpu_info __user *vi = (void __user *)ghc->hva; - u32 evtchn_pending_sel32 = evtchn_pending_sel; - - /* Attempt to set the evtchn_pending_sel bits in the - * guest, and if that succeeds then clear the same - * bits in the in-kernel version. */ - asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n" - "\tnotl %0\n" - "\t" LOCK_PREFIX "andl %0, %2\n" - "2:\n" - _ASM_EXTABLE_UA(1b, 2b) - : "=r" (evtchn_pending_sel32), - "+m" (vi->evtchn_pending_sel), - "+m" (v->arch.xen.evtchn_pending_sel) - : "0" (evtchn_pending_sel32)); - } - rc = 1; - unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err); - - err: - user_access_end(); - - mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); - } else { - __get_user(rc, (u8 __user *)ghc->hva + offset); - } - + rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending; + read_unlock_irqrestore(&gpc->lock, flags); return rc; } @@ -375,36 +452,51 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - mutex_lock(&kvm->lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { + mutex_lock(&kvm->lock); kvm->arch.xen.long_mode = !!data->u.long_mode; + mutex_unlock(&kvm->lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: + mutex_lock(&kvm->lock); r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); + mutex_unlock(&kvm->lock); break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { + mutex_lock(&kvm->lock); kvm->arch.xen.upcall_vector = data->u.vector; + mutex_unlock(&kvm->lock); r = 0; } break; + case KVM_XEN_ATTR_TYPE_EVTCHN: + r = kvm_xen_setattr_evtchn(kvm, data); + break; + + case KVM_XEN_ATTR_TYPE_XEN_VERSION: + mutex_lock(&kvm->lock); + kvm->arch.xen.xen_version = data->u.xen_version; + mutex_unlock(&kvm->lock); + r = 0; + break; + default: break; } - mutex_unlock(&kvm->lock); return r; } @@ -433,6 +525,11 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_XEN_VERSION: + data->u.xen_version = kvm->arch.xen.xen_version; + r = 0; + break; + default: break; } @@ -457,48 +554,34 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - vcpu->arch.xen.vcpu_info_set = false; + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - /* It must fit within a single page */ - if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) { - r = -EINVAL; - break; - } - - r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache, - data->u.gpa, + NULL, KVM_HOST_USES_PFN, data->u.gpa, sizeof(struct vcpu_info)); - if (!r) { - vcpu->arch.xen.vcpu_info_set = true; + if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - } + break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - vcpu->arch.xen.vcpu_time_info_set = false; + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - /* It must fit within a single page */ - if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) { - r = -EINVAL; - break; - } - - r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache, - data->u.gpa, + NULL, KVM_HOST_USES_PFN, data->u.gpa, sizeof(struct pvclock_vcpu_time_info)); - if (!r) { - vcpu->arch.xen.vcpu_time_info_set = true; + if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - } break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: @@ -507,24 +590,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } if (data->u.gpa == GPA_INVALID) { - vcpu->arch.xen.runstate_set = false; + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.runstate_cache); r = 0; break; } - /* It must fit within a single page */ - if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) { - r = -EINVAL; - break; - } - - r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.xen.runstate_cache, - data->u.gpa, + NULL, KVM_HOST_USES_PFN, data->u.gpa, sizeof(struct vcpu_runstate_info)); - if (!r) { - vcpu->arch.xen.runstate_set = true; - } break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: @@ -622,6 +697,46 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: + if (data->u.vcpu_id >= KVM_MAX_VCPUS) + r = -EINVAL; + else { + vcpu->arch.xen.vcpu_id = data->u.vcpu_id; + r = 0; + } + break; + + case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + if (data->u.timer.port) { + if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) { + r = -EINVAL; + break; + } + vcpu->arch.xen.timer_virq = data->u.timer.port; + kvm_xen_init_timer(vcpu); + + /* Restart the timer if it's set */ + if (data->u.timer.expires_ns) + kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, + data->u.timer.expires_ns - + get_kvmclock_ns(vcpu->kvm)); + } else if (kvm_xen_timer_enabled(vcpu)) { + kvm_xen_stop_timer(vcpu); + vcpu->arch.xen.timer_virq = 0; + } + + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: + if (data->u.vector && data->u.vector < 0x10) + r = -EINVAL; + else { + vcpu->arch.xen.upcall_vector = data->u.vector; + r = 0; + } + break; + default: break; } @@ -639,7 +754,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: - if (vcpu->arch.xen.vcpu_info_set) + if (vcpu->arch.xen.vcpu_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; else data->u.gpa = GPA_INVALID; @@ -647,7 +762,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: - if (vcpu->arch.xen.vcpu_time_info_set) + if (vcpu->arch.xen.vcpu_time_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; else data->u.gpa = GPA_INVALID; @@ -659,7 +774,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = -EOPNOTSUPP; break; } - if (vcpu->arch.xen.runstate_set) { + if (vcpu->arch.xen.runstate_cache.active) { data->u.gpa = vcpu->arch.xen.runstate_cache.gpa; r = 0; } @@ -697,6 +812,23 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = -EINVAL; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: + data->u.vcpu_id = vcpu->arch.xen.vcpu_id; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + data->u.timer.port = vcpu->arch.xen.timer_virq; + data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + data->u.timer.expires_ns = vcpu->arch.xen.timer_expires; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: + data->u.vector = vcpu->arch.xen.upcall_vector; + r = 0; + break; + default: break; } @@ -777,7 +909,11 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) { - if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) + /* Only some feature flags need to be *enabled* by userspace */ + u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | + KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + + if (xhc->flags & ~permitted_flags) return -EINVAL; /* @@ -802,18 +938,6 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } -void kvm_xen_init_vm(struct kvm *kvm) -{ -} - -void kvm_xen_destroy_vm(struct kvm *kvm) -{ - kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); - - if (kvm->arch.xen_hvm_config.msr) - static_branch_slow_dec_deferred(&kvm_xen_enabled); -} - static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { kvm_rax_write(vcpu, result); @@ -830,10 +954,268 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); } +static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, + evtchn_port_t *ports) +{ + struct kvm *kvm = vcpu->kvm; + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + unsigned long *pending_bits; + unsigned long flags; + bool ret = true; + int idx, i; + + read_lock_irqsave(&gpc->lock, flags); + idx = srcu_read_lock(&kvm->srcu); + if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + goto out_rcu; + + ret = false; + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + } else { + struct compat_shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + } + + for (i = 0; i < nr_ports; i++) { + if (test_bit(ports[i], pending_bits)) { + ret = true; + break; + } + } + + out_rcu: + srcu_read_unlock(&kvm->srcu, idx); + read_unlock_irqrestore(&gpc->lock, flags); + + return ret; +} + +static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, + u64 param, u64 *r) +{ + int idx, i; + struct sched_poll sched_poll; + evtchn_port_t port, *ports; + gpa_t gpa; + + if (!longmode || !lapic_in_kernel(vcpu) || + !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) + return false; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, + sizeof(sched_poll))) { + *r = -EFAULT; + return true; + } + + if (unlikely(sched_poll.nr_ports > 1)) { + /* Xen (unofficially) limits number of pollers to 128 */ + if (sched_poll.nr_ports > 128) { + *r = -EINVAL; + return true; + } + + ports = kmalloc_array(sched_poll.nr_ports, + sizeof(*ports), GFP_KERNEL); + if (!ports) { + *r = -ENOMEM; + return true; + } + } else + ports = &port; + + for (i = 0; i < sched_poll.nr_ports; i++) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, + (gva_t)(sched_poll.ports + i), + NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, + &ports[i], sizeof(port))) { + *r = -EFAULT; + goto out; + } + } + + if (sched_poll.nr_ports == 1) + vcpu->arch.xen.poll_evtchn = port; + else + vcpu->arch.xen.poll_evtchn = -1; + + set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + + if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + + if (sched_poll.timeout) + mod_timer(&vcpu->arch.xen.poll_timer, + jiffies + nsecs_to_jiffies(sched_poll.timeout)); + + kvm_vcpu_halt(vcpu); + + if (sched_poll.timeout) + del_timer(&vcpu->arch.xen.poll_timer); + + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_clear_request(KVM_REQ_UNHALT, vcpu); + } + + vcpu->arch.xen.poll_evtchn = 0; + *r = 0; +out: + /* Really, this is only needed in case of timeout */ + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + + if (unlikely(sched_poll.nr_ports > 1)) + kfree(ports); + return true; +} + +static void cancel_evtchn_poll(struct timer_list *t) +{ + struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); + + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); +} + +static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode, + int cmd, u64 param, u64 *r) +{ + switch (cmd) { + case SCHEDOP_poll: + if (kvm_xen_schedop_poll(vcpu, longmode, param, r)) + return true; + fallthrough; + case SCHEDOP_yield: + kvm_vcpu_on_spin(vcpu, true); + *r = 0; + return true; + default: + break; + } + + return false; +} + +struct compat_vcpu_set_singleshot_timer { + uint64_t timeout_abs_ns; + uint32_t flags; +} __attribute__((packed)); + +static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, + int vcpu_id, u64 param, u64 *r) +{ + struct vcpu_set_singleshot_timer oneshot; + s64 delta; + gpa_t gpa; + int idx; + + if (!kvm_xen_timer_enabled(vcpu)) + return false; + + switch (cmd) { + case VCPUOP_set_singleshot_timer: + if (vcpu->arch.xen.vcpu_id != vcpu_id) { + *r = -EINVAL; + return true; + } + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + /* + * The only difference for 32-bit compat is the 4 bytes of + * padding after the interesting part of the structure. So + * for a faithful emulation of Xen we have to *try* to copy + * the padding and return -EFAULT if we can't. Otherwise we + * might as well just have copied the 12-byte 32-bit struct. + */ + BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != + offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != + sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns)); + BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) != + offsetof(struct vcpu_set_singleshot_timer, flags)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) != + sizeof_field(struct vcpu_set_singleshot_timer, flags)); + + if (!gpa || + kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) : + sizeof(struct compat_vcpu_set_singleshot_timer))) { + *r = -EFAULT; + return true; + } + + delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); + if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) { + *r = -ETIME; + return true; + } + + kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); + *r = 0; + return true; + + case VCPUOP_stop_singleshot_timer: + if (vcpu->arch.xen.vcpu_id != vcpu_id) { + *r = -EINVAL; + return true; + } + kvm_xen_stop_timer(vcpu); + *r = 0; + return true; + } + + return false; +} + +static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, + u64 *r) +{ + if (!kvm_xen_timer_enabled(vcpu)) + return false; + + if (timeout) { + uint64_t guest_now = get_kvmclock_ns(vcpu->kvm); + int64_t delta = timeout - guest_now; + + /* Xen has a 'Linux workaround' in do_set_timer_op() which + * checks for negative absolute timeout values (caused by + * integer overflow), and for values about 13 days in the + * future (2^50ns) which would be caused by jiffies + * overflow. For those cases, it sets the timeout 100ms in + * the future (not *too* soon, since if a guest really did + * set a long timeout on purpose we don't want to keep + * churning CPU time by waking it up). + */ + if (unlikely((int64_t)timeout < 0 || + (delta > 0 && (uint32_t) (delta >> 50) != 0))) { + delta = 100 * NSEC_PER_MSEC; + timeout = guest_now + delta; + } + + kvm_xen_start_timer(vcpu, timeout, delta); + } else { + kvm_xen_stop_timer(vcpu); + } + + *r = 0; + return true; +} + int kvm_xen_hypercall(struct kvm_vcpu *vcpu) { bool longmode; - u64 input, params[6]; + u64 input, params[6], r = -ENOSYS; + bool handled = false; input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -864,6 +1246,40 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) trace_kvm_xen_hypercall(input, params[0], params[1], params[2], params[3], params[4], params[5]); + switch (input) { + case __HYPERVISOR_xen_version: + if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { + r = vcpu->kvm->arch.xen.xen_version; + handled = true; + } + break; + case __HYPERVISOR_event_channel_op: + if (params[0] == EVTCHNOP_send) + handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r); + break; + case __HYPERVISOR_sched_op: + handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0], + params[1], &r); + break; + case __HYPERVISOR_vcpu_op: + handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1], + params[2], &r); + break; + case __HYPERVISOR_set_timer_op: { + u64 timeout = params[0]; + /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */ + if (!longmode) + timeout |= params[1] << 32; + handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r); + break; + } + default: + break; + } + + if (handled) + return kvm_xen_hypercall_set_result(vcpu, r); + vcpu->run->exit_reason = KVM_EXIT_XEN; vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; vcpu->run->xen.u.hcall.longmode = longmode; @@ -890,14 +1306,28 @@ static inline int max_evtchn_port(struct kvm *kvm) return COMPAT_EVTCHN_2L_NR_CHANNELS; } +static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) +{ + int poll_evtchn = vcpu->arch.xen.poll_evtchn; + + if ((poll_evtchn == port || poll_evtchn == -1) && + test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) { + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); + } +} + /* - * This follows the kvm_set_irq() API, so it returns: + * The return value from this function is propagated to kvm_set_irq() API, + * so it returns: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) * > 0 Number of CPUs interrupt was delivered to + * + * It is also called directly from kvm_arch_set_irq_inatomic(), where the + * only check on its return value is a comparison with -EWOULDBLOCK'. */ -int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm) +int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct kvm_vcpu *vcpu; @@ -905,23 +1335,29 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, unsigned long flags; int port_word_bit; bool kick_vcpu = false; - int idx; - int rc; + int vcpu_idx, idx, rc; - vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu); - if (!vcpu) - return -1; + vcpu_idx = READ_ONCE(xe->vcpu_idx); + if (vcpu_idx >= 0) + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + else { + vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id); + if (!vcpu) + return -EINVAL; + WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu)); + } - if (!vcpu->arch.xen.vcpu_info_set) - return -1; + if (!vcpu->arch.xen.vcpu_info_cache.active) + return -EINVAL; - if (e->xen_evtchn.port >= max_evtchn_port(kvm)) - return -1; + if (xe->port >= max_evtchn_port(kvm)) + return -EINVAL; rc = -EWOULDBLOCK; - read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); + + read_lock_irqsave(&gpc->lock, flags); if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) goto out_rcu; @@ -929,12 +1365,12 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, struct shared_info *shinfo = gpc->khva; pending_bits = (unsigned long *)&shinfo->evtchn_pending; mask_bits = (unsigned long *)&shinfo->evtchn_mask; - port_word_bit = e->xen_evtchn.port / 64; + port_word_bit = xe->port / 64; } else { struct compat_shared_info *shinfo = gpc->khva; pending_bits = (unsigned long *)&shinfo->evtchn_pending; mask_bits = (unsigned long *)&shinfo->evtchn_mask; - port_word_bit = e->xen_evtchn.port / 32; + port_word_bit = xe->port / 32; } /* @@ -944,39 +1380,68 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, * already set, then we kick the vCPU in question to write to the * *real* evtchn_pending_sel in its own guest vcpu_info struct. */ - if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) { + if (test_and_set_bit(xe->port, pending_bits)) { rc = 0; /* It was already raised */ - } else if (test_bit(e->xen_evtchn.port, mask_bits)) { - rc = -1; /* Masked */ + } else if (test_bit(xe->port, mask_bits)) { + rc = -ENOTCONN; /* Masked */ + kvm_xen_check_poller(vcpu, xe->port); } else { - rc = 1; /* Delivered. But was the vCPU waking already? */ - if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) - kick_vcpu = true; + rc = 1; /* Delivered to the bitmap in shared_info. */ + /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ + read_unlock_irqrestore(&gpc->lock, flags); + gpc = &vcpu->arch.xen.vcpu_info_cache; + + read_lock_irqsave(&gpc->lock, flags); + if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { + /* + * Could not access the vcpu_info. Set the bit in-kernel + * and prod the vCPU to deliver it for itself. + */ + if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + kick_vcpu = true; + goto out_rcu; + } + + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct vcpu_info *vcpu_info = gpc->khva; + if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) { + WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); + kick_vcpu = true; + } + } else { + struct compat_vcpu_info *vcpu_info = gpc->khva; + if (!test_and_set_bit(port_word_bit, + (unsigned long *)&vcpu_info->evtchn_pending_sel)) { + WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); + kick_vcpu = true; + } + } + + /* For the per-vCPU lapic vector, deliver it as MSI. */ + if (kick_vcpu && vcpu->arch.xen.upcall_vector) { + kvm_xen_inject_vcpu_vector(vcpu); + kick_vcpu = false; + } } out_rcu: - srcu_read_unlock(&kvm->srcu, idx); read_unlock_irqrestore(&gpc->lock, flags); + srcu_read_unlock(&kvm->srcu, idx); if (kick_vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } return rc; } -/* This is the version called from kvm_set_irq() as the .set function */ -static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, - int irq_source_id, int level, bool line_status) +static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) { bool mm_borrowed = false; int rc; - if (!level) - return -1; - - rc = kvm_xen_set_evtchn_fast(e, kvm); + rc = kvm_xen_set_evtchn_fast(xe, kvm); if (rc != -EWOULDBLOCK) return rc; @@ -1020,7 +1485,7 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; int idx; - rc = kvm_xen_set_evtchn_fast(e, kvm); + rc = kvm_xen_set_evtchn_fast(xe, kvm); if (rc != -EWOULDBLOCK) break; @@ -1037,11 +1502,27 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm return rc; } +/* This is the version called from kvm_set_irq() as the .set function */ +static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + if (!level) + return -EINVAL; + + return kvm_xen_set_evtchn(&e->xen_evtchn, kvm); +} + +/* + * Set up an event channel interrupt from the KVM IRQ routing table. + * Used for e.g. PIRQ from passed through physical devices. + */ int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { + struct kvm_vcpu *vcpu; + if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) return -EINVAL; @@ -1049,10 +1530,328 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) return -EINVAL; + /* + * Xen gives us interesting mappings from vCPU index to APIC ID, + * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs + * to find it. Do that once at setup time, instead of every time. + * But beware that on live update / live migration, the routing + * table might be reinstated before the vCPU threads have finished + * recreating their vCPUs. + */ + vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); + if (vcpu) + e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu); + else + e->xen_evtchn.vcpu_idx = -1; + e->xen_evtchn.port = ue->u.xen_evtchn.port; - e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu; + e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu; e->xen_evtchn.priority = ue->u.xen_evtchn.priority; e->set = evtchn_set_fn; return 0; } + +/* + * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl. + */ +int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe) +{ + struct kvm_xen_evtchn e; + int ret; + + if (!uxe->port || uxe->port >= max_evtchn_port(kvm)) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + e.port = uxe->port; + e.vcpu_id = uxe->vcpu; + e.vcpu_idx = -1; + e.priority = uxe->priority; + + ret = kvm_xen_set_evtchn(&e, kvm); + + /* + * None of that 'return 1 if it actually got delivered' nonsense. + * We don't care if it was masked (-ENOTCONN) either. + */ + if (ret > 0 || ret == -ENOTCONN) + ret = 0; + + return ret; +} + +/* + * Support for *outbound* event channel events via the EVTCHNOP_send hypercall. + */ +struct evtchnfd { + u32 send_port; + u32 type; + union { + struct kvm_xen_evtchn port; + struct { + u32 port; /* zero */ + struct eventfd_ctx *ctx; + } eventfd; + } deliver; +}; + +/* + * Update target vCPU or priority for a registered sending channel. + */ +static int kvm_xen_eventfd_update(struct kvm *kvm, + struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + struct evtchnfd *evtchnfd; + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + mutex_lock(&kvm->lock); + evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); + mutex_unlock(&kvm->lock); + + if (!evtchnfd) + return -ENOENT; + + /* For an UPDATE, nothing may change except the priority/vcpu */ + if (evtchnfd->type != data->u.evtchn.type) + return -EINVAL; + + /* + * Port cannot change, and if it's zero that was an eventfd + * which can't be changed either. + */ + if (!evtchnfd->deliver.port.port || + evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + mutex_lock(&kvm->lock); + evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; + if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) { + evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; + evtchnfd->deliver.port.vcpu_idx = -1; + } + mutex_unlock(&kvm->lock); + return 0; +} + +/* + * Configure the target (eventfd or local port delivery) for sending on + * a given event channel. + */ +static int kvm_xen_eventfd_assign(struct kvm *kvm, + struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + struct eventfd_ctx *eventfd = NULL; + struct evtchnfd *evtchnfd = NULL; + int ret = -EINVAL; + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL); + if (!evtchnfd) + return -ENOMEM; + + switch(data->u.evtchn.type) { + case EVTCHNSTAT_ipi: + /* IPI must map back to the same port# */ + if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) + goto out; /* -EINVAL */ + break; + + case EVTCHNSTAT_interdomain: + if (data->u.evtchn.deliver.port.port) { + if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) + goto out; /* -EINVAL */ + } else { + eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto out; + } + } + break; + + case EVTCHNSTAT_virq: + case EVTCHNSTAT_closed: + case EVTCHNSTAT_unbound: + case EVTCHNSTAT_pirq: + default: /* Unknown event channel type */ + goto out; /* -EINVAL */ + } + + evtchnfd->send_port = data->u.evtchn.send_port; + evtchnfd->type = data->u.evtchn.type; + if (eventfd) { + evtchnfd->deliver.eventfd.ctx = eventfd; + } else { + /* We only support 2 level event channels for now */ + if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + goto out; /* -EINVAL; */ + + evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port; + evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; + evtchnfd->deliver.port.vcpu_idx = -1; + evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; + } + + mutex_lock(&kvm->lock); + ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, + GFP_KERNEL); + mutex_unlock(&kvm->lock); + if (ret >= 0) + return 0; + + if (ret == -ENOSPC) + ret = -EEXIST; +out: + if (eventfd) + eventfd_ctx_put(eventfd); + kfree(evtchnfd); + return ret; +} + +static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) +{ + struct evtchnfd *evtchnfd; + + mutex_lock(&kvm->lock); + evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); + mutex_unlock(&kvm->lock); + + if (!evtchnfd) + return -ENOENT; + + if (kvm) + synchronize_srcu(&kvm->srcu); + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + return 0; +} + +static int kvm_xen_eventfd_reset(struct kvm *kvm) +{ + struct evtchnfd *evtchnfd; + int i; + + mutex_lock(&kvm->lock); + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); + synchronize_srcu(&kvm->srcu); + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + } + mutex_unlock(&kvm->lock); + + return 0; +} + +static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET) + return kvm_xen_eventfd_reset(kvm); + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN) + return kvm_xen_eventfd_deassign(kvm, port); + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE) + return kvm_xen_eventfd_update(kvm, data); + if (data->u.evtchn.flags) + return -EINVAL; + + return kvm_xen_eventfd_assign(kvm, data); +} + +static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) +{ + struct evtchnfd *evtchnfd; + struct evtchn_send send; + gpa_t gpa; + int idx; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) { + *r = -EFAULT; + return true; + } + + /* The evtchn_ports idr is protected by vcpu->kvm->srcu */ + evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port); + if (!evtchnfd) + return false; + + if (evtchnfd->deliver.port.port) { + int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm); + if (ret < 0 && ret != -ENOTCONN) + return false; + } else { + eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1); + } + + *r = 0; + return true; +} + +void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; + vcpu->arch.xen.poll_evtchn = 0; + timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); +} + +void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) +{ + if (kvm_xen_timer_enabled(vcpu)) + kvm_xen_stop_timer(vcpu); + + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.runstate_cache); + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.vcpu_info_cache); + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache); + del_timer_sync(&vcpu->arch.xen.poll_timer); +} + +void kvm_xen_init_vm(struct kvm *kvm) +{ + idr_init(&kvm->arch.xen.evtchn_ports); +} + +void kvm_xen_destroy_vm(struct kvm *kvm) +{ + struct evtchnfd *evtchnfd; + int i; + + kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + } + idr_destroy(&kvm->arch.xen.evtchn_ports); + + if (kvm->arch.xen_hvm_config.msr) + static_branch_slow_dec_deferred(&kvm_xen_enabled); +} diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index adbcc9ed59db..ee5c4ae0755c 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -15,16 +15,19 @@ extern struct static_key_false_deferred kvm_xen_enabled; int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu); +void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu); int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); - -int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, +void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu); +void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu); +int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm); int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, @@ -46,11 +49,33 @@ static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_xen_enabled.key) && - vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector) + vcpu->arch.xen.vcpu_info_cache.active && + vcpu->kvm->arch.xen.upcall_vector) return __kvm_xen_has_interrupt(vcpu); return 0; } + +static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu) +{ + return static_branch_unlikely(&kvm_xen_enabled.key) && + vcpu->arch.xen.evtchn_pending_sel; +} + +static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) +{ + return !!vcpu->arch.xen.timer_virq; +} + +static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu) +{ + if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu)) + return atomic_read(&vcpu->arch.xen.timer_pending); + + return 0; +} + +void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu); #else static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) { @@ -65,6 +90,14 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } +static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) +{ +} + +static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) +{ +} + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return false; @@ -79,6 +112,29 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu) { return 0; } + +static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu) +{ +} + +static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu) +{ + return false; +} + +static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) +{ +} + +static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} #endif int kvm_xen_hypercall(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3f9b22c4983a..252ee4a61b58 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -611,7 +611,8 @@ struct kvm_hv_sint { struct kvm_xen_evtchn { u32 port; - u32 vcpu; + u32 vcpu_id; + int vcpu_idx; u32 priority; }; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6a184d260c7f..e10d131edd80 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -444,6 +444,7 @@ struct kvm_run { #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 +#define KVM_SYSTEM_EVENT_SEV_TERM 4 __u32 type; __u32 ndata; union { @@ -1150,7 +1151,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_MEM_OP_EXTENSION 211 #define KVM_CAP_PMU_CAPABILITY 212 #define KVM_CAP_DISABLE_QUIRKS2 213 -/* #define KVM_CAP_VM_TSC_CONTROL 214 */ +#define KVM_CAP_VM_TSC_CONTROL 214 #define KVM_CAP_SYSTEM_EVENT_DATA 215 #ifdef KVM_CAP_IRQ_ROUTING @@ -1240,6 +1241,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) struct kvm_xen_hvm_config { __u32 flags; @@ -1478,7 +1480,8 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) /* Available with KVM_CAP_PPC_GET_PVINFO */ #define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) -/* Available with KVM_CAP_TSC_CONTROL */ +/* Available with KVM_CAP_TSC_CONTROL for a vCPU, or with +* KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */ #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) /* Available with KVM_CAP_PCI_2_3 */ @@ -1694,6 +1697,32 @@ struct kvm_xen_hvm_attr { struct { __u64 gfn; } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. + */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; __u64 pad[8]; } u; }; @@ -1702,11 +1731,17 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 #define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 #define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_HVM_EVTCHN_SEND _IOW(KVMIO, 0xd0, struct kvm_irq_routing_xen_evtchn) + #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) @@ -1724,6 +1759,13 @@ struct kvm_xen_vcpu_attr { __u64 time_blocked; __u64 time_offline; } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; } u; }; @@ -1734,6 +1776,10 @@ struct kvm_xen_vcpu_attr { #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { diff --git a/init/Kconfig b/init/Kconfig index ddcbefe535e9..b19e2eeaae80 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -77,6 +77,11 @@ config CC_HAS_ASM_GOTO_OUTPUT depends on CC_HAS_ASM_GOTO def_bool $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) +config CC_HAS_ASM_GOTO_TIED_OUTPUT + depends on CC_HAS_ASM_GOTO_OUTPUT + # Detect buggy gcc and clang, fixed in gcc-11 clang-14. + def_bool $(success,echo 'int foo(int *x) { asm goto (".long (%l[bar]) - .\n": "+m"(*x) ::: bar); return *x; bar: return 0; }' | $CC -x c - -c -o /dev/null) + config TOOLS_SUPPORT_RELR def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 0b0e4402bba6..56140068b763 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -16,6 +16,7 @@ /x86_64/debug_regs /x86_64/evmcs_test /x86_64/emulator_error_test +/x86_64/fix_hypercall_test /x86_64/get_msr_index_features /x86_64/kvm_clock_test /x86_64/kvm_pv_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 681b173aa87c..af582d168621 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -48,6 +48,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test +TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features @@ -65,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test +TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c new file mode 100644 index 000000000000..1f5c32146f3d --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM paravirtual feature disablement + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <linux/stringify.h> +#include <stdint.h> + +#include "apic.h" +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 0 + +static bool ud_expected; + +static void guest_ud_handler(struct ex_regs *regs) +{ + GUEST_ASSERT(ud_expected); + GUEST_DONE(); +} + +extern unsigned char svm_hypercall_insn; +static uint64_t svm_do_sched_yield(uint8_t apic_id) +{ + uint64_t ret; + + asm volatile("mov %1, %%rax\n\t" + "mov %2, %%rbx\n\t" + "svm_hypercall_insn:\n\t" + "vmmcall\n\t" + "mov %%rax, %0\n\t" + : "=r"(ret) + : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id) + : "rax", "rbx", "memory"); + + return ret; +} + +extern unsigned char vmx_hypercall_insn; +static uint64_t vmx_do_sched_yield(uint8_t apic_id) +{ + uint64_t ret; + + asm volatile("mov %1, %%rax\n\t" + "mov %2, %%rbx\n\t" + "vmx_hypercall_insn:\n\t" + "vmcall\n\t" + "mov %%rax, %0\n\t" + : "=r"(ret) + : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id) + : "rax", "rbx", "memory"); + + return ret; +} + +static void assert_hypercall_insn(unsigned char *exp_insn, unsigned char *obs_insn) +{ + uint32_t exp = 0, obs = 0; + + memcpy(&exp, exp_insn, sizeof(exp)); + memcpy(&obs, obs_insn, sizeof(obs)); + + GUEST_ASSERT_EQ(exp, obs); +} + +static void guest_main(void) +{ + unsigned char *native_hypercall_insn, *hypercall_insn; + uint8_t apic_id; + + apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); + + if (is_intel_cpu()) { + native_hypercall_insn = &vmx_hypercall_insn; + hypercall_insn = &svm_hypercall_insn; + svm_do_sched_yield(apic_id); + } else if (is_amd_cpu()) { + native_hypercall_insn = &svm_hypercall_insn; + hypercall_insn = &vmx_hypercall_insn; + vmx_do_sched_yield(apic_id); + } else { + GUEST_ASSERT(0); + /* unreachable */ + return; + } + + GUEST_ASSERT(!ud_expected); + assert_hypercall_insn(native_hypercall_insn, hypercall_insn); + GUEST_DONE(); +} + +static void setup_ud_vector(struct kvm_vm *vm) +{ + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); +} + +static void enter_guest(struct kvm_vm *vm) +{ + struct kvm_run *run; + struct ucall uc; + + run = vcpu_state(vm, VCPU_ID); + + vcpu_run(vm, VCPU_ID); + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]); + break; + case UCALL_DONE: + return; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); + default: + TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)", + uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason)); + } +} + +static void test_fix_hypercall(void) +{ + struct kvm_vm *vm; + + vm = vm_create_default(VCPU_ID, 0, guest_main); + setup_ud_vector(vm); + + ud_expected = false; + sync_global_to_guest(vm, ud_expected); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + enter_guest(vm); +} + +static void test_fix_hypercall_disabled(void) +{ + struct kvm_enable_cap cap = {0}; + struct kvm_vm *vm; + + vm = vm_create_default(VCPU_ID, 0, guest_main); + setup_ud_vector(vm); + + cap.cap = KVM_CAP_DISABLE_QUIRKS2; + cap.args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN; + vm_enable_cap(vm, &cap); + + ud_expected = true; + sync_global_to_guest(vm, ud_expected); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + enter_guest(vm); +} + +int main(void) +{ + if (!(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) { + print_skip("KVM_X86_QUIRK_HYPERCALL_INSN not supported"); + exit(KSFT_SKIP); + } + + test_fix_hypercall(); + test_fix_hypercall_disabled(); +} diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c new file mode 100644 index 000000000000..f0083d8cfe98 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_vmcall_test + * + * Copyright © 2021 Amazon.com, Inc. or its affiliates. + * + * Xen shared_info / pvclock testing + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include <stdint.h> +#include <time.h> +#include <sched.h> +#include <signal.h> +#include <pthread.h> + +#define NR_TEST_VCPUS 20 + +static struct kvm_vm *vm; +pthread_spinlock_t create_lock; + +#define TEST_TSC_KHZ 2345678UL +#define TEST_TSC_OFFSET 200000000 + +uint64_t tsc_sync; +static void guest_code(void) +{ + uint64_t start_tsc, local_tsc, tmp; + + start_tsc = rdtsc(); + do { + tmp = READ_ONCE(tsc_sync); + local_tsc = rdtsc(); + WRITE_ONCE(tsc_sync, local_tsc); + if (unlikely(local_tsc < tmp)) + GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0); + + } while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ); + + GUEST_DONE(); +} + + +static void *run_vcpu(void *_cpu_nr) +{ + unsigned long cpu = (unsigned long)_cpu_nr; + unsigned long failures = 0; + static bool first_cpu_done; + + /* The kernel is fine, but vm_vcpu_add_default() needs locking */ + pthread_spin_lock(&create_lock); + + vm_vcpu_add_default(vm, cpu, guest_code); + + if (!first_cpu_done) { + first_cpu_done = true; + vcpu_set_msr(vm, cpu, MSR_IA32_TSC, TEST_TSC_OFFSET); + } + + pthread_spin_unlock(&create_lock); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, cpu); + struct ucall uc; + + vcpu_run(vm, cpu); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, cpu, &uc)) { + case UCALL_DONE: + goto out; + + case UCALL_SYNC: + printf("Guest %ld sync %lx %lx %ld\n", cpu, uc.args[2], uc.args[3], uc.args[2] - uc.args[3]); + failures++; + break; + + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + out: + return (void *)failures; +} + +int main(int argc, char *argv[]) +{ + if (!kvm_check_cap(KVM_CAP_VM_TSC_CONTROL)) { + print_skip("KVM_CAP_VM_TSC_CONTROL not available"); + exit(KSFT_SKIP); + } + + vm = vm_create_default_with_vcpus(0, DEFAULT_STACK_PGS * NR_TEST_VCPUS, 0, guest_code, NULL); + vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ); + + pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE); + pthread_t cpu_threads[NR_TEST_VCPUS]; + unsigned long cpu; + for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) + pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu); + + unsigned long failures = 0; + for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) { + void *this_cpu_failures; + pthread_join(cpu_threads[cpu], &this_cpu_failures); + failures += (unsigned long)this_cpu_failures; + } + + TEST_ASSERT(!failures, "TSC sync failed"); + pthread_spin_destroy(&create_lock); + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 865e17146815..5e6f40633ea6 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -39,12 +39,36 @@ #define EVTCHN_VECTOR 0x10 +#define EVTCHN_TEST1 15 +#define EVTCHN_TEST2 66 +#define EVTCHN_TIMER 13 + static struct kvm_vm *vm; #define XEN_HYPERCALL_MSR 0x40000000 #define MIN_STEAL_TIME 50000 +#define __HYPERVISOR_set_timer_op 15 +#define __HYPERVISOR_sched_op 29 +#define __HYPERVISOR_event_channel_op 32 + +#define SCHEDOP_poll 3 + +#define EVTCHNOP_send 4 + +#define EVTCHNSTAT_interdomain 2 + +struct evtchn_send { + u32 port; +}; + +struct sched_poll { + u32 *ports; + unsigned int nr_ports; + u64 timeout; +}; + struct pvclock_vcpu_time_info { u32 version; u32 pad0; @@ -107,15 +131,25 @@ struct { struct kvm_irq_routing_entry entries[2]; } irq_routes; +bool guest_saw_irq; + static void evtchn_handler(struct ex_regs *regs) { struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; vi->evtchn_upcall_pending = 0; vi->evtchn_pending_sel = 0; + guest_saw_irq = true; GUEST_SYNC(0x20); } +static void guest_wait_for_irq(void) +{ + while (!guest_saw_irq) + __asm__ __volatile__ ("rep nop" : : : "memory"); + guest_saw_irq = false; +} + static void guest_code(void) { struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; @@ -128,6 +162,8 @@ static void guest_code(void) /* Trigger an interrupt injection */ GUEST_SYNC(0); + guest_wait_for_irq(); + /* Test having the host set runstates manually */ GUEST_SYNC(RUNSTATE_runnable); GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0); @@ -168,14 +204,132 @@ static void guest_code(void) /* Now deliver an *unmasked* interrupt */ GUEST_SYNC(8); - while (!si->evtchn_pending[1]) - __asm__ __volatile__ ("rep nop" : : : "memory"); + guest_wait_for_irq(); /* Change memslots and deliver an interrupt */ GUEST_SYNC(9); - for (;;) - __asm__ __volatile__ ("rep nop" : : : "memory"); + guest_wait_for_irq(); + + /* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */ + GUEST_SYNC(10); + + guest_wait_for_irq(); + + GUEST_SYNC(11); + + /* Our turn. Deliver event channel (to ourselves) with + * EVTCHNOP_send hypercall. */ + unsigned long rax; + struct evtchn_send s = { .port = 127 }; + __asm__ __volatile__ ("vmcall" : + "=a" (rax) : + "a" (__HYPERVISOR_event_channel_op), + "D" (EVTCHNOP_send), + "S" (&s)); + + GUEST_ASSERT(rax == 0); + + guest_wait_for_irq(); + + GUEST_SYNC(12); + + /* Deliver "outbound" event channel to an eventfd which + * happens to be one of our own irqfds. */ + s.port = 197; + __asm__ __volatile__ ("vmcall" : + "=a" (rax) : + "a" (__HYPERVISOR_event_channel_op), + "D" (EVTCHNOP_send), + "S" (&s)); + + GUEST_ASSERT(rax == 0); + + guest_wait_for_irq(); + + GUEST_SYNC(13); + + /* Set a timer 100ms in the future. */ + __asm__ __volatile__ ("vmcall" : + "=a" (rax) : + "a" (__HYPERVISOR_set_timer_op), + "D" (rs->state_entry_time + 100000000)); + GUEST_ASSERT(rax == 0); + + GUEST_SYNC(14); + + /* Now wait for the timer */ + guest_wait_for_irq(); + + GUEST_SYNC(15); + + /* The host has 'restored' the timer. Just wait for it. */ + guest_wait_for_irq(); + + GUEST_SYNC(16); + + /* Poll for an event channel port which is already set */ + u32 ports[1] = { EVTCHN_TIMER }; + struct sched_poll p = { + .ports = ports, + .nr_ports = 1, + .timeout = 0, + }; + + __asm__ __volatile__ ("vmcall" : + "=a" (rax) : + "a" (__HYPERVISOR_sched_op), + "D" (SCHEDOP_poll), + "S" (&p)); + + GUEST_ASSERT(rax == 0); + + GUEST_SYNC(17); + + /* Poll for an unset port and wait for the timeout. */ + p.timeout = 100000000; + __asm__ __volatile__ ("vmcall" : + "=a" (rax) : + "a" (__HYPERVISOR_sched_op), + "D" (SCHEDOP_poll), + "S" (&p)); + + GUEST_ASSERT(rax == 0); + + GUEST_SYNC(18); + + /* A timer will wake the masked port we're waiting on, while we poll */ + p.timeout = 0; + __asm__ __volatile__ ("vmcall" : + "=a" (rax) : + "a" (__HYPERVISOR_sched_op), + "D" (SCHEDOP_poll), + "S" (&p)); + + GUEST_ASSERT(rax == 0); + + GUEST_SYNC(19); + + /* A timer wake an *unmasked* port which should wake us with an + * actual interrupt, while we're polling on a different port. */ + ports[0]++; + p.timeout = 0; + __asm__ __volatile__ ("vmcall" : + "=a" (rax) : + "a" (__HYPERVISOR_sched_op), + "D" (SCHEDOP_poll), + "S" (&p)); + + GUEST_ASSERT(rax == 0); + + guest_wait_for_irq(); + + GUEST_SYNC(20); + + /* Timer should have fired already */ + guest_wait_for_irq(); + + GUEST_SYNC(21); } static int cmp_timespec(struct timespec *a, struct timespec *b) @@ -191,9 +345,13 @@ static int cmp_timespec(struct timespec *a, struct timespec *b) else return 0; } +struct vcpu_info *vinfo; static void handle_alrm(int sig) { + if (vinfo) + printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending); + vcpu_dump(stdout, vm, VCPU_ID, 0); TEST_FAIL("IRQ delivery timed out"); } @@ -213,6 +371,7 @@ int main(int argc, char *argv[]) bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); + bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); clock_gettime(CLOCK_REALTIME, &min_ts); @@ -233,6 +392,12 @@ int main(int argc, char *argv[]) .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, .msr = XEN_HYPERCALL_MSR, }; + + /* Let the kernel know that we *will* use it for sending all + * event channels, which lets it intercept SCHEDOP_poll */ + if (do_evtchn_tests) + hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); struct kvm_xen_hvm_attr lm = { @@ -295,7 +460,7 @@ int main(int argc, char *argv[]) /* Unexpected, but not a KVM failure */ if (irq_fd[0] == -1 || irq_fd[1] == -1) - do_eventfd_tests = false; + do_evtchn_tests = do_eventfd_tests = false; } if (do_eventfd_tests) { @@ -303,13 +468,13 @@ int main(int argc, char *argv[]) irq_routes.entries[0].gsi = 32; irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN; - irq_routes.entries[0].u.xen_evtchn.port = 15; + irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1; irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID; irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; irq_routes.entries[1].gsi = 33; irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN; - irq_routes.entries[1].u.xen_evtchn.port = 66; + irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2; irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID; irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; @@ -330,7 +495,39 @@ int main(int argc, char *argv[]) sigaction(SIGALRM, &sa, NULL); } - struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); + struct kvm_xen_vcpu_attr tmr = { + .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, + .u.timer.port = EVTCHN_TIMER, + .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + .u.timer.expires_ns = 0 + }; + + if (do_evtchn_tests) { + struct kvm_xen_hvm_attr inj = { + .type = KVM_XEN_ATTR_TYPE_EVTCHN, + .u.evtchn.send_port = 127, + .u.evtchn.type = EVTCHNSTAT_interdomain, + .u.evtchn.flags = 0, + .u.evtchn.deliver.port.port = EVTCHN_TEST1, + .u.evtchn.deliver.port.vcpu = VCPU_ID + 1, + .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + /* Test migration to a different vCPU */ + inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE; + inj.u.evtchn.deliver.port.vcpu = VCPU_ID; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + inj.u.evtchn.send_port = 197; + inj.u.evtchn.deliver.eventfd.port = 0; + inj.u.evtchn.deliver.eventfd.fd = irq_fd[1]; + inj.u.evtchn.flags = 0; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + } + vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); vinfo->evtchn_upcall_pending = 0; struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR); @@ -423,7 +620,7 @@ int main(int argc, char *argv[]) goto done; if (verbose) printf("Testing masked event channel\n"); - shinfo->evtchn_mask[0] = 0x8000; + shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1; eventfd_write(irq_fd[0], 1UL); alarm(1); break; @@ -440,6 +637,9 @@ int main(int argc, char *argv[]) break; case 9: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; if (verbose) printf("Testing event channel after memslot change\n"); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, @@ -449,12 +649,153 @@ int main(int argc, char *argv[]) alarm(1); break; + case 10: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + if (!do_evtchn_tests) + goto done; + + shinfo->evtchn_pending[0] = 0; + if (verbose) + printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n"); + + struct kvm_irq_routing_xen_evtchn e; + e.port = EVTCHN_TEST2; + e.vcpu = VCPU_ID; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e); + evtchn_irq_expected = true; + alarm(1); + break; + + case 11: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; + + if (verbose) + printf("Testing guest EVTCHNOP_send direct to evtchn\n"); + evtchn_irq_expected = true; + alarm(1); + break; + + case 12: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[0] = 0; + + if (verbose) + printf("Testing guest EVTCHNOP_send to eventfd\n"); + evtchn_irq_expected = true; + alarm(1); + break; + + case 13: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; + + if (verbose) + printf("Testing guest oneshot timer\n"); + break; + + case 14: + memset(&tmr, 0, sizeof(tmr)); + tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER, + "Timer port not returned"); + TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + "Timer priority not returned"); + TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time, + "Timer expiry not returned"); + evtchn_irq_expected = true; + alarm(1); + break; + + case 15: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[0] = 0; + + if (verbose) + printf("Testing restored oneshot timer\n"); + + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000, + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + evtchn_irq_expected = true; + alarm(1); + break; + + case 16: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + if (verbose) + printf("Testing SCHEDOP_poll with already pending event\n"); + shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER; + alarm(1); + break; + + case 17: + if (verbose) + printf("Testing SCHEDOP_poll timeout\n"); + shinfo->evtchn_pending[0] = 0; + alarm(1); + break; + + case 18: + if (verbose) + printf("Testing SCHEDOP_poll wake on masked event\n"); + + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000, + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + alarm(1); + break; + + case 19: + shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0; + if (verbose) + printf("Testing SCHEDOP_poll wake on unmasked event\n"); + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + + /* Read it back and check the pending time is reported correctly */ + tmr.u.timer.expires_ns = 0; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000, + "Timer not reported pending"); + alarm(1); + break; + + case 20: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + /* Read timer and check it is no longer pending */ + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending"); + + shinfo->evtchn_pending[0] = 0; + if (verbose) + printf("Testing timer in the past\n"); + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + alarm(1); + break; + + case 21: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + goto done; + case 0x20: TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ"); evtchn_irq_expected = false; - if (shinfo->evtchn_pending[1] && - shinfo->evtchn_pending[0]) - goto done; break; } break; @@ -467,6 +808,7 @@ int main(int argc, char *argv[]) } done: + alarm(0); clock_gettime(CLOCK_REALTIME, &max_ts); /* |