-rw-r--r--  Documentation/virt/kvm/api.rst | 152
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 1
-rw-r--r--  arch/x86/include/asm/kvm-x86-pmu-ops.h | 31
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 68
-rw-r--r--  arch/x86/include/asm/uaccess.h | 142
-rw-r--r--  arch/x86/include/asm/vmx.h | 10
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 11
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 4
-rw-r--r--  arch/x86/kernel/kvm.c | 77
-rw-r--r--  arch/x86/kernel/kvmclock.c | 2
-rw-r--r--  arch/x86/kvm/i8259.c | 1
-rw-r--r--  arch/x86/kvm/irq.c | 10
-rw-r--r--  arch/x86/kvm/irq_comm.c | 2
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 18
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 49
-rw-r--r--  arch/x86/kvm/pmu.c | 66
-rw-r--r--  arch/x86/kvm/pmu.h | 7
-rw-r--r--  arch/x86/kvm/svm/avic.c | 10
-rw-r--r--  arch/x86/kvm/svm/nested.c | 300
-rw-r--r--  arch/x86/kvm/svm/pmu.c | 2
-rw-r--r--  arch/x86/kvm/svm/sev.c | 8
-rw-r--r--  arch/x86/kvm/svm/svm.c | 209
-rw-r--r--  arch/x86/kvm/svm/svm.h | 53
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 48
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 2
-rw-r--r--  arch/x86/kvm/vmx/vmcs.h | 5
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 10
-rw-r--r--  arch/x86/kvm/x86.c | 267
-rw-r--r--  arch/x86/kvm/xen.c | 1245
-rw-r--r--  arch/x86/kvm/xen.h | 62
-rw-r--r--  include/linux/kvm_host.h | 3
-rw-r--r--  include/uapi/linux/kvm.h | 50
-rw-r--r--  init/Kconfig | 5
-rw-r--r--  tools/testing/selftests/kvm/.gitignore | 1
-rw-r--r--  tools/testing/selftests/kvm/Makefile | 2
-rw-r--r--  tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c | 170
-rw-r--r--  tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c | 119
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 366
38 files changed, 2883 insertions(+), 705 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 4a900cdbc62e..0c1b9f139e4a 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -982,12 +982,22 @@ memory.
__u8 pad2[30];
};
-If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the
-KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl.
-This requests KVM to generate the contents of the hypercall page
-automatically; hypercalls will be intercepted and passed to userspace
-through KVM_EXIT_XEN. In this case, all of the blob size and address
-fields must be zero.
+If certain flags are returned from the KVM_CAP_XEN_HVM check, they may
+be set in the flags field of this ioctl:
+
+The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate
+the contents of the hypercall page automatically; hypercalls will be
+intercepted and passed to userspace through KVM_EXIT_XEN. In this
+case, all of the blob size and address fields must be zero.
+
+The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace
+will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event
+channel interrupts rather than manipulating the guest's shared_info
+structures directly. This, in turn, may allow KVM to enable features
+such as intercepting the SCHEDOP_poll hypercall to accelerate PV
+spinlock operation for the guest. Userspace may still use the ioctl
+to deliver events if it was advertised, even if userspace does not
+send this indication that it will always do so.
No other flags are currently valid in the struct kvm_xen_hvm_config.
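
As a rough userspace sketch (not part of this patch, and assuming an
already-created VM file descriptor), the two flags above could be probed
and enabled like this::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hypothetical helper: vm_fd is an open KVM VM file descriptor. */
  static int enable_xen_hcall_intercept(int vm_fd)
  {
          struct kvm_xen_hvm_config cfg = { 0 };
          int xen_caps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

          if (xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
                  cfg.flags |= KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;
          if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)
                  cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;

          /* With INTERCEPT_HCALL set, all blob size/address fields stay zero. */
          return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
  }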
@@ -1887,22 +1897,25 @@ the future.
4.55 KVM_SET_TSC_KHZ
--------------------
-:Capability: KVM_CAP_TSC_CONTROL
+:Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL
:Architectures: x86
-:Type: vcpu ioctl
+:Type: vcpu ioctl / vm ioctl
:Parameters: virtual tsc_khz
:Returns: 0 on success, -1 on error
Specifies the tsc frequency for the virtual machine. The unit of the
frequency is KHz.
+If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
+be used as a vm ioctl to set the initial tsc frequency of subsequently
+created vCPUs.
4.56 KVM_GET_TSC_KHZ
--------------------
-:Capability: KVM_CAP_GET_TSC_KHZ
+:Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL
:Architectures: x86
-:Type: vcpu ioctl
+:Type: vcpu ioctl / vm ioctl
:Parameters: none
:Returns: virtual tsc-khz on success, negative value on error
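
A minimal sketch (not from this series) of the vm-ioctl form, assuming the
KVM_CAP_VM_TSC_CONTROL define is available in the headers in use::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hypothetical: pin a 2.5 GHz guest TSC for all subsequently created vCPUs. */
  static int set_vm_tsc_khz(int vm_fd)
  {
          if (!ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL))
                  return -1;

          return ioctl(vm_fd, KVM_SET_TSC_KHZ, 2500000UL);
  }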
@@ -5216,7 +5229,25 @@ have deterministic behavior.
struct {
__u64 gfn;
} shared_info;
- __u64 pad[4];
+ struct {
+ __u32 send_port;
+ __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+ __u32 flags;
+ union {
+ struct {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+ } port;
+ struct {
+ __u32 port; /* Zero for eventfd */
+ __s32 fd;
+ } eventfd;
+ __u32 padding[4];
+ } deliver;
+ } evtchn;
+ __u32 xen_version;
+ __u64 pad[8];
} u;
};
@@ -5247,6 +5278,30 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
Sets the exception vector used to deliver Xen event channel upcalls.
+ This is the HVM-wide vector injected directly by the hypervisor
+ (not through the local APIC), typically configured by a guest via
+ HVM_PARAM_CALLBACK_IRQ.
+
+KVM_XEN_ATTR_TYPE_EVTCHN
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
+ an outbound port number for interception of EVTCHNOP_send requests
+ from the guest. A given sending port number may be directed back
+ to a specified vCPU (by APIC ID) / port / priority on the guest,
+ or to trigger events on an eventfd. The vCPU and priority can be
+ changed by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call,
+ but other fields cannot change for a given sending port. A port
+ mapping is removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags
+ field.
+
+KVM_XEN_ATTR_TYPE_XEN_VERSION
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
+ the 32-bit version code returned to the guest when it invokes the
+ XENVER_version call; typically (XEN_MAJOR << 16 | XEN_MINOR). PV
+ Xen guests will often use this as a dummy hypercall to trigger
+ event channel delivery, so responding within the kernel without
+ exiting to userspace is beneficial.
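
For illustration only, a hedged sketch of programming the reported Xen
version and one outbound port; the EVTCHNSTAT_* type value comes from the
Xen public headers, and the port, vCPU, and priority values are arbitrary::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hypothetical: bounce guest EVTCHNOP_send on port 3 straight back to it. */
  static int setup_xen_hvm_attrs(int vm_fd)
  {
          struct kvm_xen_hvm_attr ver = {
                  .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
                  .u.xen_version = (4 << 16) | 16,        /* Xen 4.16 */
          };
          struct kvm_xen_hvm_attr evt = {
                  .type = KVM_XEN_ATTR_TYPE_EVTCHN,
                  .u.evtchn.send_port = 3,
                  .u.evtchn.type = EVTCHNSTAT_interdomain, /* Xen public header */
                  .u.evtchn.deliver.port.port = 3,
                  .u.evtchn.deliver.port.vcpu = 0,
                  .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
          };

          if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ver))
                  return -1;
          return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &evt);
  }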
4.127 KVM_XEN_HVM_GET_ATTR
--------------------------
@@ -5258,7 +5313,8 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
:Returns: 0 on success, < 0 on error
Allows Xen VM attributes to be read. For the structure and types,
-see KVM_XEN_HVM_SET_ATTR above.
+see KVM_XEN_HVM_SET_ATTR above. The KVM_XEN_ATTR_TYPE_EVTCHN
+attribute cannot be read.
4.128 KVM_XEN_VCPU_SET_ATTR
---------------------------
@@ -5285,6 +5341,13 @@ see KVM_XEN_HVM_SET_ATTR above.
__u64 time_blocked;
__u64 time_offline;
} runstate;
+ __u32 vcpu_id;
+ struct {
+ __u32 port;
+ __u32 priority;
+ __u64 expires_ns;
+ } timer;
+ __u8 vector;
} u;
};
@@ -5326,6 +5389,27 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
or RUNSTATE_offline) to set the current accounted state as of the
adjusted state_entry_time.
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the Xen
+ vCPU ID of the given vCPU, to allow timer-related VCPU operations to
+ be intercepted by KVM.
+
+KVM_XEN_VCPU_ATTR_TYPE_TIMER
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
+ event channel port/priority for the VIRQ_TIMER of the vCPU, as well
+ as allowing a pending timer to be saved/restored.
+
+KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
+ per-vCPU local APIC upcall vector, configured by a Xen guest with
+ the HVMOP_set_evtchn_upcall_vector hypercall. This is typically
+ used by Windows guests, and is distinct from the HVM-wide upcall
+ vector configured with HVM_PARAM_CALLBACK_IRQ.
+
+
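A hedged sketch of wiring up the per-vCPU pieces (the Xen vCPU ID, then the
VIRQ_TIMER port); the port number and priority constant are illustrative::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hypothetical: xen_vcpu_id/timer_port come from the VMM's own bookkeeping. */
  static int setup_xen_vcpu_timer(int vcpu_fd, __u32 xen_vcpu_id, __u32 timer_port)
  {
          struct kvm_xen_vcpu_attr va = {
                  .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
                  .u.vcpu_id = xen_vcpu_id,
          };

          if (ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va))
                  return -1;

          va.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
          va.u.timer.port = timer_port;
          va.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
          va.u.timer.expires_ns = 0;      /* no pending timer to restore */

          return ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);
  }
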
4.129 KVM_XEN_VCPU_GET_ATTR
---------------------------
@@ -5645,6 +5729,25 @@ enabled with ``arch_prctl()``, but this may change in the future.
The offsets of the state save areas in struct kvm_xsave follow the contents
of CPUID leaf 0xD on the host.
+4.135 KVM_XEN_HVM_EVTCHN_SEND
+-----------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_irq_routing_xen_evtchn
+:Returns: 0 on success, < 0 on error
+
+
+::
+
+ struct kvm_irq_routing_xen_evtchn {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+ };
+
+This ioctl injects an event channel interrupt directly to the guest vCPU.
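
A minimal usage sketch (not part of this patch); the port, vCPU, and
priority values are placeholders::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hypothetical: raise event channel port 3, 2-level ABI, on Xen vCPU 0. */
  static int send_evtchn(int vm_fd)
  {
          struct kvm_irq_routing_xen_evtchn e = {
                  .port = 3,
                  .vcpu = 0,
                  .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
          };

          return ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &e);
  }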
5. The kvm_run structure
========================
@@ -5985,6 +6088,7 @@ should put the acknowledged interrupt vector into the 'epr' field.
#define KVM_SYSTEM_EVENT_SHUTDOWN 1
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
+ #define KVM_SYSTEM_EVENT_SEV_TERM 4
__u32 type;
__u32 ndata;
__u64 data[16];
@@ -6009,6 +6113,8 @@ Valid values for 'type' are:
has requested a crash condition maintenance. Userspace can choose
to ignore the request, or to gather VM memory core dump and/or
reset/shutdown of the VM.
+ - KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination.
+ The guest physical address of the guest's GHCB is stored in `data[0]`.
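
A hedged userspace fragment showing how a VMM might surface this event from
its run loop; the field names follow the kvm_run layout shown above::

  #include <stdio.h>
  #include <linux/kvm.h>

  /* Hypothetical: called after KVM_RUN returns with KVM_EXIT_SYSTEM_EVENT. */
  static void report_sev_term(struct kvm_run *run)
  {
          if (run->system_event.type == KVM_SYSTEM_EVENT_SEV_TERM &&
              run->system_event.ndata >= 1)
                  fprintf(stderr, "SEV guest requested termination, GHCB GPA 0x%llx\n",
                          (unsigned long long)run->system_event.data[0]);
  }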
If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
architecture specific information for the system-level event. Only
@@ -7145,6 +7251,15 @@ The valid bits in cap.args[0] are:
Additionally, when this quirk is disabled,
KVM clears CPUID.01H:ECX[bit 3] if
IA32_MISC_ENABLE[bit 18] is cleared.
+
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN By default, KVM rewrites guest
+ VMMCALL/VMCALL instructions to match the
+ vendor's hypercall instruction for the
+ system. When this quirk is disabled, KVM
+ will no longer rewrite invalid guest
+ hypercall instructions. Executing the
+ incorrect hypercall instruction will
+ generate a #UD within the guest.
=================================== ============================================
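
As a hedged example of how a VMM might opt out of the new quirk, assuming
the capability being described here is KVM_CAP_DISABLE_QUIRKS2 (its heading
lies outside this hunk)::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hypothetical: let mismatched VMCALL/VMMCALL #UD instead of being patched. */
  static int disable_hypercall_patching(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_DISABLE_QUIRKS2,
                  .args = { KVM_X86_QUIRK_FIX_HYPERCALL_INSN },
          };

          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }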
8. Other capabilities.
@@ -7622,8 +7737,9 @@ PVHVM guests. Valid flags are::
#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
- #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 2)
- #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 3)
+ #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
+ #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
+ #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
ioctl is available, for the guest to set its hypercall page.
@@ -7647,6 +7763,14 @@ The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries
of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority
field set to indicate 2 level event channel delivery.
+The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates that KVM supports
+injecting event channel events directly into the guest with the
+KVM_XEN_HVM_EVTCHN_SEND ioctl. It also indicates support for the
+KVM_XEN_ATTR_TYPE_EVTCHN/XEN_VERSION HVM attributes and the
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes
+related to event channel delivery, timers, and the XENVER_version
+interception.
+
8.31 KVM_CAP_PPC_MULTITCE
-------------------------
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 3c368b639c04..96e4e9842dfc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -126,6 +126,7 @@ KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
+KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
new file mode 100644
index 000000000000..fdfd8e06fee6
--- /dev/null
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate
+ * both DECLARE/DEFINE_STATIC_CALL() invocations and
+ * "static_call_update()" calls.
+ *
+ * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have
+ * a NULL definition, for example if "static_call_cond()" will be used
+ * at the call sites.
+ */
+KVM_X86_PMU_OP(pmc_perf_hw_id)
+KVM_X86_PMU_OP(pmc_is_enabled)
+KVM_X86_PMU_OP(pmc_idx_to_pmc)
+KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
+KVM_X86_PMU_OP(msr_idx_to_pmc)
+KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
+KVM_X86_PMU_OP(is_valid_msr)
+KVM_X86_PMU_OP(get_msr)
+KVM_X86_PMU_OP(set_msr)
+KVM_X86_PMU_OP(refresh)
+KVM_X86_PMU_OP(init)
+KVM_X86_PMU_OP(reset)
+KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
+KVM_X86_PMU_OP_OPTIONAL(cleanup)
+
+#undef KVM_X86_PMU_OP
+#undef KVM_X86_PMU_OP_OPTIONAL
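
For context, a sketch of how such a header is typically consumed; this
mirrors the existing kvm-x86-ops.h pattern and the pmu.c hunk later in this
series rather than quoting new code from the patch:

  /* Defining KVM_X86_PMU_OP() before the #include stamps out one entry per hook. */
  #define KVM_X86_PMU_OP(func) \
          DECLARE_STATIC_CALL(kvm_x86_pmu_##func, *(((struct kvm_pmu_ops *)0)->func));
  #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
  #include <asm/kvm-x86-pmu-ops.h>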
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e0c0f0e1f754..e1c695f72b8b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -443,14 +443,6 @@ struct kvm_mmu {
u8 shadow_root_level;
u8 ept_ad;
bool direct_map;
- struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
-
- /*
- * Bitmap; bit set = permission fault
- * Byte index: page fault error code [4:1]
- * Bit index: pte permissions in ACC_* format
- */
- u8 permissions[16];
/*
* The pkru_mask indicates if protection key checks are needed. It
@@ -460,6 +452,15 @@ struct kvm_mmu {
*/
u32 pkru_mask;
+ struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
+
+ /*
+ * Bitmap; bit set = permission fault
+ * Byte index: page fault error code [4:1]
+ * Bit index: pte permissions in ACC_* format
+ */
+ u8 permissions[16];
+
u64 *pae_root;
u64 *pml4_root;
u64 *pml5_root;
@@ -607,16 +608,21 @@ struct kvm_vcpu_hv {
struct kvm_vcpu_xen {
u64 hypercall_rip;
u32 current_runstate;
- bool vcpu_info_set;
- bool vcpu_time_info_set;
- bool runstate_set;
- struct gfn_to_hva_cache vcpu_info_cache;
- struct gfn_to_hva_cache vcpu_time_info_cache;
- struct gfn_to_hva_cache runstate_cache;
+ u8 upcall_vector;
+ struct gfn_to_pfn_cache vcpu_info_cache;
+ struct gfn_to_pfn_cache vcpu_time_info_cache;
+ struct gfn_to_pfn_cache runstate_cache;
u64 last_steal;
u64 runstate_entry_time;
u64 runstate_times[4];
unsigned long evtchn_pending_sel;
+ u32 vcpu_id; /* The Xen / ACPI vCPU ID */
+ u32 timer_virq;
+ u64 timer_expires; /* In guest epoch */
+ atomic_t timer_pending;
+ struct hrtimer timer;
+ int poll_evtchn;
+ struct timer_list poll_timer;
};
struct kvm_vcpu_arch {
@@ -753,8 +759,7 @@ struct kvm_vcpu_arch {
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
- struct gfn_to_hva_cache pv_time;
- bool pv_time_enabled;
+ struct gfn_to_pfn_cache pv_time;
/* set guest stopped flag in pvclock flags field */
bool pvclock_set_guest_stopped_request;
@@ -1024,9 +1029,12 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
+ u32 xen_version;
bool long_mode;
u8 upcall_vector;
struct gfn_to_pfn_cache shinfo_cache;
+ struct idr evtchn_ports;
+ unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
};
enum kvm_irqchip_mode {
@@ -1119,6 +1127,8 @@ struct kvm_arch {
u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
+ u32 default_tsc_khz;
+
seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
u64 master_kernel_ns;
@@ -1455,8 +1465,6 @@ struct kvm_x86_ops {
int cpu_dirty_log_size;
void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
- /* pmu operations of sub-arch */
- const struct kvm_pmu_ops *pmu_ops;
const struct kvm_x86_nested_ops *nested_ops;
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
@@ -1498,6 +1506,11 @@ struct kvm_x86_ops {
int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+
+ /*
+ * Returns vCPU specific APICv inhibit reasons
+ */
+ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_nested_ops {
@@ -1527,6 +1540,7 @@ struct kvm_x86_init_ops {
unsigned int (*handle_intel_pt_intr)(void);
struct kvm_x86_ops *runtime_ops;
+ struct kvm_pmu_ops *pmu_ops;
};
struct kvm_arch_async_pf {
@@ -1548,20 +1562,6 @@ extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
-static inline void kvm_ops_static_call_update(void)
-{
-#define __KVM_X86_OP(func) \
- static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
- WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
- static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
- (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-}
-
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -1799,6 +1799,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
bool kvm_apicv_activated(struct kvm *kvm);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set);
@@ -1988,6 +1989,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_CD_NW_CLEARED | \
KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
- KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)
+ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index f78e2b3501a1..35f222aa66bf 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -382,6 +382,103 @@ do { \
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm_volatile_goto("\n" \
+ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+ _ASM_EXTABLE_UA(1b, %l[label]) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*_ptr), \
+ [old] "+a" (__old) \
+ : [new] ltype (__new) \
+ : "memory" \
+ : label); \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+
+#ifdef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm_volatile_goto("\n" \
+ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
+ _ASM_EXTABLE_UA(1b, %l[label]) \
+ : CC_OUT(z) (success), \
+ "+A" (__old), \
+ [ptr] "+m" (*_ptr) \
+ : "b" ((u32)__new), \
+ "c" ((u32)((u64)__new >> 32)) \
+ : "memory" \
+ : label); \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+#endif // CONFIG_X86_32
+#else // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+ int __err = 0; \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm volatile("\n" \
+ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+ CC_SET(z) \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \
+ %[errout]) \
+ : CC_OUT(z) (success), \
+ [errout] "+r" (__err), \
+ [ptr] "+m" (*_ptr), \
+ [old] "+a" (__old) \
+ : [new] ltype (__new) \
+ : "memory", "cc"); \
+ if (unlikely(__err)) \
+ goto label; \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+
+#ifdef CONFIG_X86_32
+/*
+ * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
+ * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
+ * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses
+ * both ESI and EDI for the memory operand, compilation will fail if the error
+ * is an input+output as there will be no register available for input.
+ */
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \
+ int __result; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm volatile("\n" \
+ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
+ "mov $0, %%ecx\n\t" \
+ "setz %%cl\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
+ : [result]"=c" (__result), \
+ "+A" (__old), \
+ [ptr] "+m" (*_ptr) \
+ : "b" ((u32)__new), \
+ "c" ((u32)((u64)__new >> 32)) \
+ : "memory", "cc"); \
+ if (unlikely(__result < 0)) \
+ goto label; \
+ if (unlikely(!__result)) \
+ *_old = __old; \
+ likely(__result); })
+#endif // CONFIG_X86_32
+#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+
/* FIXME: this hack is definitely wrong -AK */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))
@@ -474,6 +571,51 @@ do { \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+extern void __try_cmpxchg_user_wrong_size(void);
+
+#ifndef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label) \
+ __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
+#endif
+
+/*
+ * Force the pointer to u<size> to match the size expected by the asm helper.
+ * clang/LLVM compiles all cases and only discards the unused paths after
+ * processing errors, which breaks i386 if the pointer is an 8-byte value.
+ */
+#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \
+ bool __ret; \
+ __chk_user_ptr(_ptr); \
+ switch (sizeof(*(_ptr))) { \
+ case 1: __ret = __try_cmpxchg_user_asm("b", "q", \
+ (__force u8 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 2: __ret = __try_cmpxchg_user_asm("w", "r", \
+ (__force u16 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 4: __ret = __try_cmpxchg_user_asm("l", "r", \
+ (__force u32 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
+ (_nval), _label); \
+ break; \
+ default: __try_cmpxchg_user_wrong_size(); \
+ } \
+ __ret; })
+
+/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
+#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \
+ int __ret = -EFAULT; \
+ __uaccess_begin_nospec(); \
+ __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label); \
+_label: \
+ __uaccess_end(); \
+ __ret; \
+ })
+
/*
* We want the unsafe accessors to always be inlined and use
* the error labels - thus the macro games.
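
Usage sketch for the new primitive, mirroring the paging_tmpl.h conversion
later in this series; the wrapper instantiates the fault label itself and
"returns" 0 on success, 1 if the compare fails (with *old refreshed), or
-EFAULT if the access faults:

  /* Hypothetical caller: atomically update a guest PTE living in user memory. */
  static int update_gpte(u64 __user *ptep_user, u64 old_pte, u64 new_pte)
  {
          return __try_cmpxchg_user(ptep_user, &old_pte, new_pte, fault);
  }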
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0ffaa3156a4e..6c343c6a1855 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -543,16 +543,14 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_ACC_READ_BIT 0
#define EPT_VIOLATION_ACC_WRITE_BIT 1
#define EPT_VIOLATION_ACC_INSTR_BIT 2
-#define EPT_VIOLATION_READABLE_BIT 3
-#define EPT_VIOLATION_WRITABLE_BIT 4
-#define EPT_VIOLATION_EXECUTABLE_BIT 5
+#define EPT_VIOLATION_RWX_SHIFT 3
+#define EPT_VIOLATION_GVA_IS_VALID_BIT 7
#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT)
-#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT)
-#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT)
-#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT)
+#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
/*
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index bf6e96011dfe..21614807a2cb 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -428,11 +428,12 @@ struct kvm_sync_regs {
struct kvm_vcpu_events events;
};
-#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
-#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
-#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
-#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
-#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index b14533af7676..9b698215d261 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -5,7 +5,7 @@
#include <asm/ia32.h>
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
#include <asm/kvm_para.h>
#endif
@@ -20,7 +20,7 @@ int main(void)
BLANK();
#endif
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
BLANK();
#endif
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a22deb58f86d..d0bb2b3fb305 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -752,6 +752,42 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
}
#endif
+#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
+bool __kvm_vcpu_is_preempted(long cpu);
+
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+ASM_ENDBR
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+ASM_RET
+".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
+".popsection");
+
+#endif
+
static void __init kvm_guest_init(void)
{
int i;
@@ -764,6 +800,9 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
static_call_update(pv_steal_clock, kvm_steal_clock);
+
+ pv_ops.lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -1005,40 +1044,6 @@ static void kvm_wait(u8 *ptr, u8 val)
}
}
-#ifdef CONFIG_X86_32
-__visible bool __kvm_vcpu_is_preempted(long cpu)
-{
- struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
-
- return !!(src->preempted & KVM_VCPU_PREEMPTED);
-}
-PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
-
-#else
-
-#include <asm/asm-offsets.h>
-
-extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
-
-/*
- * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
- * restoring to/from the stack.
- */
-asm(
-".pushsection .text;"
-".global __raw_callee_save___kvm_vcpu_is_preempted;"
-".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
-"__raw_callee_save___kvm_vcpu_is_preempted:"
-ASM_ENDBR
-"movq __per_cpu_offset(,%rdi,8), %rax;"
-"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
-"setne %al;"
-ASM_RET
-".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
-".popsection");
-
-#endif
-
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -1082,10 +1087,6 @@ void __init kvm_spinlock_init(void)
pv_ops.lock.wait = kvm_wait;
pv_ops.lock.kick = kvm_kick_cpu;
- if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
- pv_ops.lock.vcpu_is_preempted =
- PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
- }
/*
* When PV spinlock is enabled which is preferred over
* virt_spin_lock(), virt_spin_lock_key's value is meaningless.
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index c5caa7311bd8..16333ba1904b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -239,7 +239,7 @@ static void __init kvmclock_init_mem(void)
static int __init kvm_setup_vsyscall_timeinfo(void)
{
- if (!kvm_para_available() || !kvmclock)
+ if (!kvm_para_available() || !kvmclock || nopv)
return 0;
kvmclock_init_mem();
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index be99dc86293d..e1bb6218bb96 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -252,7 +252,6 @@ int kvm_pic_read_irq(struct kvm *kvm)
*/
irq2 = 7;
intno = s->pics[1].irq_base + irq2;
- irq = irq2 + 8;
} else
intno = s->pics[0].irq_base + irq;
} else {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 172b05343cfd..f371f1292ca3 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -22,10 +22,14 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
+ int r = 0;
+
if (lapic_in_kernel(vcpu))
- return apic_has_pending_timer(vcpu);
+ r = apic_has_pending_timer(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ r += kvm_xen_has_pending_timer(vcpu);
- return 0;
+ return r;
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu))
kvm_inject_apic_timer_irqs(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_inject_timer_irqs(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6e0dab04320e..0687162c4f22 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -181,7 +181,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
if (!level)
return -1;
- return kvm_xen_set_evtchn_fast(e, kvm);
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
#endif
default:
break;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 64a2a7e2be90..77785587332e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1866,17 +1866,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
-static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
- if (ret < 0) {
+ if (ret < 0)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return false;
- }
-
- return !!ret;
+ return ret;
}
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
@@ -1998,7 +1995,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
for_each_sp(pages, sp, parents, i) {
kvm_unlink_unsync_page(vcpu->kvm, sp);
- flush |= kvm_sync_page(vcpu, sp, &invalid_list);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
mmu_pages_clear_parents(&parents);
}
if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
@@ -2039,6 +2036,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
+ int ret;
int collisions = 0;
LIST_HEAD(invalid_list);
@@ -2091,11 +2089,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
* If the sync fails, the page is zapped. If so, break
* in order to rebuild it.
*/
- if (!kvm_sync_page(vcpu, sp, &invalid_list))
+ ret = kvm_sync_page(vcpu, sp, &invalid_list);
+ if (ret < 0)
break;
WARN_ON(!list_empty(&invalid_list));
- kvm_flush_remote_tlbs(vcpu->kvm);
+ if (ret > 0)
+ kvm_flush_remote_tlbs(vcpu->kvm);
}
__clear_sp_write_flooding_count(sp);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 01fee5f67ac3..7d4377f1ef2a 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -144,42 +144,6 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
}
-static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- pt_element_t __user *ptep_user, unsigned index,
- pt_element_t orig_pte, pt_element_t new_pte)
-{
- signed char r;
-
- if (!user_access_begin(ptep_user, sizeof(pt_element_t)))
- return -EFAULT;
-
-#ifdef CMPXCHG
- asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n"
- "setnz %b[r]\n"
- "2:"
- _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
- : [ptr] "+m" (*ptep_user),
- [old] "+a" (orig_pte),
- [r] "=q" (r)
- : [new] "r" (new_pte)
- : "memory");
-#else
- asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n"
- "setnz %b[r]\n"
- "2:"
- _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
- : [ptr] "+m" (*ptep_user),
- [old] "+A" (orig_pte),
- [r] "=q" (r)
- : [new_lo] "b" ((u32)new_pte),
- [new_hi] "c" ((u32)(new_pte >> 32))
- : "memory");
-#endif
-
- user_access_end();
- return r;
-}
-
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
@@ -278,7 +242,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (unlikely(!walker->pte_writable[level - 1]))
continue;
- ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+ ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
if (ret)
return ret;
@@ -515,14 +479,21 @@ error:
* The other bits are set to 0.
*/
if (!(errcode & PFERR_RSVD_MASK)) {
- vcpu->arch.exit_qualification &= 0x180;
+ vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
if (write_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
if (user_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
if (fetch_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
- vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
+
+ /*
+ * Note, pte_access holds the raw RWX bits from the EPTE, not
+ * ACC_*_MASK flags!
+ */
+ vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
+ EPT_VIOLATION_RWX_SHIFT;
}
#endif
walker->fault.address = addr;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index eca39f56c231..618f529f1c4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -49,6 +49,32 @@
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
*/
+static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
+
+#define KVM_X86_PMU_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
+ *(((struct kvm_pmu_ops *)0)->func));
+#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
+{
+ memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
+
+#define __KVM_X86_PMU_OP(func) \
+ static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
+#define KVM_X86_PMU_OP(func) \
+ WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
+#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+#undef __KVM_X86_PMU_OP
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
+}
+
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
@@ -213,7 +239,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
ARCH_PERFMON_EVENTSEL_CMASK |
HSW_IN_TX |
HSW_IN_TX_CHECKPOINTED))) {
- config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
+ config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
if (config != PERF_COUNT_HW_MAX)
type = PERF_TYPE_HARDWARE;
}
@@ -263,7 +289,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
pmc->current_config = (u64)ctrl;
pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
+ static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc),
!(en_field & 0x2), /* exclude user */
!(en_field & 0x1), /* exclude kernel */
pmi);
@@ -272,7 +298,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx);
if (!pmc)
return;
@@ -294,7 +320,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
int bit;
for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
if (unlikely(!pmc || !pmc->perf_event)) {
clear_bit(bit, pmu->reprogram_pmi);
@@ -316,7 +342,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
/* check if idx is a valid index to access PMU */
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
- return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
+ return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -366,7 +392,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
+ pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
if (!pmc)
return 1;
@@ -382,22 +408,21 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu)) {
- if (kvm_x86_ops.pmu_ops->deliver_pmi)
- kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
+ static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
}
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
- return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
- kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr);
+ return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
+ static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}
static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
if (pmc)
__set_bit(pmc->idx, pmu->pmc_in_use);
@@ -405,13 +430,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
+ return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
}
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
- return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info);
+ return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
}
/* refresh PMU settings. This function generally is called when underlying
@@ -420,7 +445,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops.pmu_ops->refresh(vcpu);
+ static_call(kvm_x86_pmu_refresh)(vcpu);
}
void kvm_pmu_reset(struct kvm_vcpu *vcpu)
@@ -428,7 +453,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
irq_work_sync(&pmu->irq_work);
- kvm_x86_ops.pmu_ops->reset(vcpu);
+ static_call(kvm_x86_pmu_reset)(vcpu);
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
@@ -436,7 +461,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
memset(pmu, 0, sizeof(*pmu));
- kvm_x86_ops.pmu_ops->init(vcpu);
+ static_call(kvm_x86_pmu_init)(vcpu);
init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
pmu->event_count = 0;
pmu->need_cleanup = false;
@@ -468,14 +493,13 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmu->pmc_in_use, X86_PMC_IDX_MAX);
for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
- pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
pmc_stop_counter(pmc);
}
- if (kvm_x86_ops.pmu_ops->cleanup)
- kvm_x86_ops.pmu_ops->cleanup(vcpu);
+ static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
@@ -505,7 +529,7 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
unsigned int config;
pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
- config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
+ config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
pmc->eventsel = old_eventsel;
return config == perf_hw_id;
}
@@ -533,7 +557,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
int i;
for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
- pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
continue;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 9e66fba1d6a3..2a53b6c9495c 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -39,6 +39,8 @@ struct kvm_pmu_ops {
void (*cleanup)(struct kvm_vcpu *vcpu);
};
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -86,11 +88,6 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
return pmc->type == KVM_PMC_FIXED;
}
-static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
-{
- return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc);
-}
-
static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
u64 data)
{
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 421619540ff9..9b859218af59 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -165,9 +165,8 @@ free_avic:
return err;
}
-void avic_init_vmcb(struct vcpu_svm *svm)
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
- struct vmcb *vmcb = svm->vmcb;
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
@@ -357,6 +356,13 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
return 1;
}
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return APICV_INHIBIT_REASON_NESTED;
+ return 0;
+}
+
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96bab464967f..caa691229b71 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -36,40 +36,43 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
- if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ if (vmcb->control.exit_code != SVM_EXIT_NPF) {
/*
* TODO: track the cause of the nested page fault, and
* correctly fill in the high bits of exit_info_1.
*/
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = (1ULL << 32);
- svm->vmcb->control.exit_info_2 = fault->address;
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = (1ULL << 32);
+ vmcb->control.exit_info_2 = fault->address;
}
- svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
- svm->vmcb->control.exit_info_1 |= fault->error_code;
+ vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ vmcb->control.exit_info_1 |= fault->error_code;
nested_svm_vmexit(svm);
}
static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- WARN_ON(!is_guest_mode(vcpu));
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ WARN_ON(!is_guest_mode(vcpu));
if (vmcb12_is_intercept(&svm->nested.ctl,
INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
- !svm->nested.nested_run_pending) {
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = fault->error_code;
- svm->vmcb->control.exit_info_2 = fault->address;
- nested_svm_vmexit(svm);
- } else {
- kvm_inject_page_fault(vcpu, fault);
- }
+ !svm->nested.nested_run_pending) {
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = fault->error_code;
+ vmcb->control.exit_info_2 = fault->address;
+ nested_svm_vmexit(svm);
+ } else {
+ kvm_inject_page_fault(vcpu, fault);
+ }
}
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
@@ -121,6 +124,20 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+ if (!svm->v_vmload_vmsave_enabled)
+ return true;
+
+ if (!nested_npt_enabled(svm))
+ return true;
+
+ if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ return true;
+
+ return false;
+}
+
void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h;
@@ -162,8 +179,17 @@ void recalc_intercepts(struct vcpu_svm *svm)
if (!intercept_smi)
vmcb_clr_intercept(c, INTERCEPT_SMI);
- vmcb_set_intercept(c, INTERCEPT_VMLOAD);
- vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ if (nested_vmcb_needs_vls_intercept(svm)) {
+ /*
+ * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+ * we must intercept these instructions to correctly
+ * emulate them in case L1 doesn't intercept them.
+ */
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ } else {
+ WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ }
}
/*
@@ -413,6 +439,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
*/
mask &= ~V_IRQ_MASK;
}
+
+ if (nested_vgif_enabled(svm))
+ mask |= V_GIF_MASK;
+
svm->nested.ctl.int_ctl &= ~mask;
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}
@@ -454,11 +484,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
vmcb12->control.exit_int_info = exit_int_info;
}
-static inline bool nested_npt_enabled(struct vcpu_svm *svm)
-{
- return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
-}
-
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
/*
@@ -515,6 +540,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
bool new_vmcb12 = false;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
nested_vmcb02_compute_g_pat(svm);
@@ -526,18 +553,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
- svm->vmcb->save.es = vmcb12->save.es;
- svm->vmcb->save.cs = vmcb12->save.cs;
- svm->vmcb->save.ss = vmcb12->save.ss;
- svm->vmcb->save.ds = vmcb12->save.ds;
- svm->vmcb->save.cpl = vmcb12->save.cpl;
- vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+ vmcb02->save.es = vmcb12->save.es;
+ vmcb02->save.cs = vmcb12->save.cs;
+ vmcb02->save.ss = vmcb12->save.ss;
+ vmcb02->save.ds = vmcb12->save.ds;
+ vmcb02->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(vmcb02, VMCB_SEG);
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
- svm->vmcb->save.gdtr = vmcb12->save.gdtr;
- svm->vmcb->save.idtr = vmcb12->save.idtr;
- vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+ vmcb02->save.gdtr = vmcb12->save.gdtr;
+ vmcb02->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(vmcb02, VMCB_DT);
}
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
@@ -554,47 +581,59 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = vmcb12->save.rax;
- svm->vmcb->save.rsp = vmcb12->save.rsp;
- svm->vmcb->save.rip = vmcb12->save.rip;
+ vmcb02->save.rax = vmcb12->save.rax;
+ vmcb02->save.rsp = vmcb12->save.rsp;
+ vmcb02->save.rip = vmcb12->save.rip;
/* These bits will be set properly on the first execution when new_vmc12 is true */
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
- svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
- vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+ vmcb_mark_dirty(vmcb02, VMCB_DR);
+ }
+
+ if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ /*
+ * Reserved bits of DEBUGCTL are ignored. Be consistent with
+ * svm_set_msr's definition of reserved bits.
+ */
+ svm_copy_lbrs(vmcb02, vmcb12);
+ vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+ svm_update_lbrv(&svm->vcpu);
+
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb02, vmcb01);
}
}
static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
- const u32 int_ctl_vmcb01_bits =
- V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK;
-
- const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+ u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+ u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
/*
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
* exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
*/
- /*
- * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
- * avic_physical_id.
- */
- WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+ if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ else
+ int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
/* Copied from vmcb01. msrpm_base can be overwritten later. */
- svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
- svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
- svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+ vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
+ vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
/* Done at vmrun: asid. */
/* Also overwritten later if necessary. */
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+ vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
/* nested_cr3. */
if (nested_npt_enabled(svm))
@@ -605,21 +644,53 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
svm->nested.ctl.tsc_offset,
svm->tsc_ratio_msr);
- svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+ vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
nested_svm_update_tsc_ratio_msr(vcpu);
}
- svm->vmcb->control.int_ctl =
+ vmcb02->control.int_ctl =
(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
- (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits);
-
- svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
- svm->vmcb->control.int_state = svm->nested.ctl.int_state;
- svm->vmcb->control.event_inj = svm->nested.ctl.event_inj;
- svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
+
+ vmcb02->control.int_vector = svm->nested.ctl.int_vector;
+ vmcb02->control.int_state = svm->nested.ctl.int_state;
+ vmcb02->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ vmcb02->control.virt_ext = vmcb01->control.virt_ext &
+ LBR_CTL_ENABLE_MASK;
+ if (svm->lbrv_enabled)
+ vmcb02->control.virt_ext |=
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (!nested_vmcb_needs_vls_intercept(svm))
+ vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ if (kvm_pause_in_guest(svm->vcpu.kvm)) {
+ /* use guest values since host doesn't use them */
+ vmcb02->control.pause_filter_count =
+ svm->pause_filter_enabled ?
+ svm->nested.ctl.pause_filter_count : 0;
+
+ vmcb02->control.pause_filter_thresh =
+ svm->pause_threshold_enabled ?
+ svm->nested.ctl.pause_filter_thresh : 0;
+
+ } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ /* use host values when guest doesn't use them */
+ vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
+ vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+ } else {
+ /*
+ * Intercept every PAUSE otherwise and
+ * ignore both host and guest values
+ */
+ vmcb02->control.pause_filter_count = 0;
+ vmcb02->control.pause_filter_thresh = 0;
+ }
nested_svm_transition_tlb_flush(vcpu);
@@ -688,6 +759,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
svm_set_gif(svm, true);
+ if (kvm_vcpu_apicv_active(vcpu))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
return 0;
}
@@ -698,6 +772,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
struct vmcb *vmcb12;
struct kvm_host_map map;
u64 vmcb12_gpa;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
if (!svm->nested.hsave_msr) {
kvm_inject_gp(vcpu, 0);
@@ -741,14 +816,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
* Since vmcb01 is not in use, we can use it to store some of the L1
* state.
*/
- svm->vmcb01.ptr->save.efer = vcpu->arch.efer;
- svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu);
- svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4;
- svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
- svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu);
+ vmcb01->save.efer = vcpu->arch.efer;
+ vmcb01->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb01->save.cr4 = vcpu->arch.cr4;
+ vmcb01->save.rflags = kvm_get_rflags(vcpu);
+ vmcb01->save.rip = kvm_rip_read(vcpu);
if (!npt_enabled)
- svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
+ vmcb01->save.cr3 = kvm_read_cr3(vcpu);
svm->nested.nested_run_pending = 1;
@@ -814,14 +889,12 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
int nested_svm_vmexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
struct vmcb *vmcb12;
- struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
int rc;
- /* Triple faults in L2 should never escape. */
- WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
-
rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
if (rc) {
if (rc == -EINVAL)
@@ -843,57 +916,68 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
/* Give the current vmcb to the guest */
- vmcb12->save.es = vmcb->save.es;
- vmcb12->save.cs = vmcb->save.cs;
- vmcb12->save.ss = vmcb->save.ss;
- vmcb12->save.ds = vmcb->save.ds;
- vmcb12->save.gdtr = vmcb->save.gdtr;
- vmcb12->save.idtr = vmcb->save.idtr;
+ vmcb12->save.es = vmcb02->save.es;
+ vmcb12->save.cs = vmcb02->save.cs;
+ vmcb12->save.ss = vmcb02->save.ss;
+ vmcb12->save.ds = vmcb02->save.ds;
+ vmcb12->save.gdtr = vmcb02->save.gdtr;
+ vmcb12->save.idtr = vmcb02->save.idtr;
vmcb12->save.efer = svm->vcpu.arch.efer;
vmcb12->save.cr0 = kvm_read_cr0(vcpu);
vmcb12->save.cr3 = kvm_read_cr3(vcpu);
- vmcb12->save.cr2 = vmcb->save.cr2;
+ vmcb12->save.cr2 = vmcb02->save.cr2;
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
vmcb12->save.rflags = kvm_get_rflags(vcpu);
vmcb12->save.rip = kvm_rip_read(vcpu);
vmcb12->save.rsp = kvm_rsp_read(vcpu);
vmcb12->save.rax = kvm_rax_read(vcpu);
- vmcb12->save.dr7 = vmcb->save.dr7;
+ vmcb12->save.dr7 = vmcb02->save.dr7;
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
- vmcb12->save.cpl = vmcb->save.cpl;
+ vmcb12->save.cpl = vmcb02->save.cpl;
- vmcb12->control.int_state = vmcb->control.int_state;
- vmcb12->control.exit_code = vmcb->control.exit_code;
- vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi;
- vmcb12->control.exit_info_1 = vmcb->control.exit_info_1;
- vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
+ vmcb12->control.int_state = vmcb02->control.int_state;
+ vmcb12->control.exit_code = vmcb02->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
nested_save_pending_event_to_vmcb12(svm, vmcb12);
if (svm->nrips_enabled)
- vmcb12->control.next_rip = vmcb->control.next_rip;
+ vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
+ vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01);
+ if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ svm_copy_lbrs(vmcb12, vmcb02);
+ svm_update_lbrv(vcpu);
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb01, vmcb02);
+ svm_update_lbrv(vcpu);
+ }
+
/*
* On vmexit the GIF is set to false and
* no event can be injected in L1.
*/
svm_set_gif(svm, false);
- svm->vmcb->control.exit_int_info = 0;
+ vmcb01->control.exit_int_info = 0;
svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
- if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
- vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
}
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
@@ -907,13 +991,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
/*
* Restore processor state that had been saved in vmcb01
*/
- kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
- svm_set_efer(vcpu, svm->vmcb->save.efer);
- svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
- svm_set_cr4(vcpu, svm->vmcb->save.cr4);
- kvm_rax_write(vcpu, svm->vmcb->save.rax);
- kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
- kvm_rip_write(vcpu, svm->vmcb->save.rip);
+ kvm_set_rflags(vcpu, vmcb01->save.rflags);
+ svm_set_efer(vcpu, vmcb01->save.efer);
+ svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, vmcb01->save.cr4);
+ kvm_rax_write(vcpu, vmcb01->save.rax);
+ kvm_rsp_write(vcpu, vmcb01->save.rsp);
+ kvm_rip_write(vcpu, vmcb01->save.rip);
svm->vcpu.arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(&svm->vcpu);
@@ -931,7 +1015,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_uninit_mmu_context(vcpu);
- rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true);
+ rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
if (rc)
return 1;
@@ -949,9 +1033,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
	 * right now so that it can be accounted for before we execute
* L1's next instruction.
*/
- if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+ if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
+ /*
+ * Un-inhibit the AVIC right away, so that other vCPUs can start
+	 * to benefit from it immediately.
+ */
+ if (kvm_apicv_activated(vcpu->kvm))
+ kvm_vcpu_update_apicv(vcpu);
+
return 0;
}
@@ -1162,12 +1253,13 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
{
unsigned int nr = svm->vcpu.arch.exception.nr;
+ struct vmcb *vmcb = svm->vmcb;
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ vmcb->control.exit_code_hi = 0;
if (svm->vcpu.arch.exception.has_error_code)
- svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
+ vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
/*
* EXITINFO2 is undefined for all exception intercepts other
@@ -1175,11 +1267,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
*/
if (nr == PF_VECTOR) {
if (svm->vcpu.arch.exception.nested_apf)
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
else if (svm->vcpu.arch.exception.has_payload)
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
else
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
} else if (nr == DB_VECTOR) {
/* See inject_pending_event. */
kvm_deliver_exception_payload(&svm->vcpu);
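
As an aside on the nested LBR handling above, here is a minimal, illustrative sketch of the L1 side, assuming L1 has been granted X86_FEATURE_LBRV and uses the kernel's struct vmcb layout from <asm/svm.h>; the helper name is hypothetical, not a definitive L1 implementation:

    #include <asm/svm.h>

    /* Ask L0 (KVM) to virtualize LBRs on behalf of the L2 guest. */
    static void l1_request_nested_lbrv(struct vmcb *vmcb12)
    {
            /* LBR_CTL_ENABLE_MASK is bit 0 of the VIRT_EXT control field. */
            vmcb12->control.virt_ext |= LBR_CTL_ENABLE_MASK;
    }

With that bit set, the #VMEXIT path above copies the live LBR state from vmcb02 into vmcb12, so L1 reads back what L2 actually recorded.
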
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 24eb935b6f85..57ab4739eb19 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -319,7 +319,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
}
}
-struct kvm_pmu_ops amd_pmu_ops = {
+struct kvm_pmu_ops amd_pmu_ops __initdata = {
.pmc_perf_hw_id = amd_pmc_perf_hw_id,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
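
Marking amd_pmu_ops __initdata works because the struct is no longer referenced at runtime: it is handed over via kvm_x86_init_ops.pmu_ops (see the svm_init_ops hunk later in this patch) and copied once during hardware setup. A hedged sketch of the consumer side, mirroring the kvm_ops_update() pattern added in x86.c below; the real helper lives in arch/x86/kvm/pmu.c and is not shown in this diff, so treat the body as an approximation:

    void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
    {
            /* Snapshot the vendor ops, then retarget the PMU static calls. */
            memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

    #define __KVM_X86_PMU_OP(func) \
            static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
    #define KVM_X86_PMU_OP(func) \
            WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
    #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
    #include <asm/kvm-x86-pmu-ops.h>
    #undef __KVM_X86_PMU_OP
    }
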
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 537aaddc852f..9ce162119b23 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2738,8 +2738,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- ret = -EINVAL;
- break;
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
}
default:
/* Error, keep GHCB MSR value as-is */
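
The new exit lets the VMM observe a guest-requested termination instead of just getting -EINVAL. A minimal userspace sketch of handling it (illustrative only; the function name is hypothetical and it assumes a uapi with the system_event ndata/data[] fields this series relies on):

    #include <stdio.h>
    #include <linux/kvm.h>

    /* Called after KVM_RUN returns, with run pointing at the vCPU's kvm_run mapping. */
    static int handle_sev_term(struct kvm_run *run)
    {
            if (run->exit_reason != KVM_EXIT_SYSTEM_EVENT ||
                run->system_event.type != KVM_SYSTEM_EVENT_SEV_TERM)
                    return 0;

            /* data[0] carries the GHCB GPA, as filled in above. */
            fprintf(stderr, "SEV-ES guest asked to terminate, GHCB GPA %#llx\n",
                    (unsigned long long)run->system_event.data[0]);
            return 1;       /* caller tears the VM down */
    }
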
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index bd4c64b362d2..22bbd69495ad 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -62,8 +62,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-
static bool erratum_383_found __read_mostly;
u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
@@ -172,7 +170,7 @@ static int vls = true;
module_param(vls, int, 0444);
/* enable/disable Virtual GIF */
-static int vgif = true;
+int vgif = true;
module_param(vgif, int, 0444);
/* enable/disable LBR virtualization */
@@ -189,6 +187,9 @@ module_param(tsc_scaling, int, 0444);
static bool avic;
module_param(avic, bool, 0444);
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
+
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -790,6 +791,17 @@ static void init_msrpm_offsets(void)
}
}
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
+ to_vmcb->save.br_from = from_vmcb->save.br_from;
+ to_vmcb->save.br_to = from_vmcb->save.br_to;
+ to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
+ to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
+
+ vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -799,6 +811,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+ /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
}
static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
@@ -810,6 +826,67 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+
+ /*
+ * Move the LBR msrs back to the vmcb01 to avoid copying them
+ * on nested guest entries.
+ */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+}
+
+static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+{
+ /*
+ * If the LBR virtualization is disabled, the LBR msrs are always
+ * kept in the vmcb01 to avoid copying them on nested guest entries.
+ *
+ * If nested, and the LBR virtualization is enabled/disabled, the msrs
+ * are moved between the vmcb01 and vmcb02 as needed.
+ */
+ struct vmcb *vmcb =
+ (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
+ svm->vmcb : svm->vmcb01.ptr;
+
+ switch (index) {
+ case MSR_IA32_DEBUGCTLMSR:
+ return vmcb->save.dbgctl;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ return vmcb->save.br_from;
+ case MSR_IA32_LASTBRANCHTOIP:
+ return vmcb->save.br_to;
+ case MSR_IA32_LASTINTFROMIP:
+ return vmcb->save.last_excp_from;
+ case MSR_IA32_LASTINTTOIP:
+ return vmcb->save.last_excp_to;
+ default:
+ KVM_BUG(false, svm->vcpu.kvm,
+ "%s: Unknown MSR 0x%x", __func__, index);
+ return 0;
+ }
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
+ DEBUGCTLMSR_LBR;
+
+ bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
+ LBR_CTL_ENABLE_MASK);
+
+ if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
+ if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
+ enable_lbrv = true;
+
+ if (enable_lbrv == current_enable_lbrv)
+ return;
+
+ if (enable_lbrv)
+ svm_enable_lbrv(vcpu);
+ else
+ svm_disable_lbrv(vcpu);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -831,6 +908,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
+ if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ return;
+
control->pause_filter_count = __grow_ple_window(old,
pause_filter_count,
pause_filter_count_grow,
@@ -849,6 +929,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
+ if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ return;
+
control->pause_filter_count =
__shrink_ple_window(old,
pause_filter_count,
@@ -960,6 +1043,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+
+ svm->v_vmload_vmsave_enabled = false;
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -979,8 +1064,9 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
- struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct vmcb_control_area *control = &vmcb->control;
+ struct vmcb_save_area *save = &vmcb->save;
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1104,7 +1190,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
if (kvm_vcpu_apicv_active(vcpu))
- avic_init_vmcb(svm);
+ avic_init_vmcb(svm, vmcb);
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1122,10 +1208,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
}
}
- svm_hv_init_vmcb(svm->vmcb);
+ svm_hv_init_vmcb(vmcb);
init_vmcb_after_set_cpuid(vcpu);
- vmcb_mark_all_dirty(svm->vmcb);
+ vmcb_mark_all_dirty(vmcb);
enable_gif(svm);
}
@@ -1380,7 +1466,7 @@ static void svm_set_vintr(struct vcpu_svm *svm)
/*
* The following fields are ignored when AVIC is enabled
*/
- WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+ WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
svm_set_intercept(svm, INTERCEPT_VINTR);
@@ -2142,7 +2228,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* Likewise, clear the VINTR intercept, we will set it
* again while processing KVM_REQ_EVENT if needed.
*/
- if (vgif_enabled(svm))
+ if (vgif)
svm_clr_intercept(svm, INTERCEPT_STGI);
if (svm_is_intercept(svm, INTERCEPT_VINTR))
svm_clear_vintr(svm);
@@ -2160,7 +2246,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* in use, we still rely on the VINTR intercept (rather than
* STGI) to detect an open interrupt window.
*/
- if (!vgif_enabled(svm))
+ if (!vgif)
svm_clear_vintr(svm);
}
}
@@ -2575,25 +2661,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_TSC_AUX:
msr_info->data = svm->tsc_aux;
break;
- /*
- * Nobody will change the following 5 values in the VMCB so we can
- * safely return them on rdmsr. They will always be 0 until LBRV is
- * implemented.
- */
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = svm->vmcb->save.dbgctl;
- break;
case MSR_IA32_LASTBRANCHFROMIP:
- msr_info->data = svm->vmcb->save.br_from;
- break;
case MSR_IA32_LASTBRANCHTOIP:
- msr_info->data = svm->vmcb->save.br_to;
- break;
case MSR_IA32_LASTINTFROMIP:
- msr_info->data = svm->vmcb->save.last_excp_from;
- break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm->vmcb->save.last_excp_to;
+ msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -2839,12 +2912,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- svm->vmcb->save.dbgctl = data;
- vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
- if (data & (1ULL<<0))
- svm_enable_lbrv(vcpu);
+ if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
+ svm->vmcb->save.dbgctl = data;
else
- svm_disable_lbrv(vcpu);
+ svm->vmcb01.ptr->save.dbgctl = data;
+
+ svm_update_lbrv(vcpu);
+
break;
case MSR_VM_HSAVE_PA:
/*
@@ -2901,9 +2975,16 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
svm_clear_vintr(to_svm(vcpu));
/*
- * For AVIC, the only reason to end up here is ExtINTs.
+ * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
* In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it.
+ *
+	 * If running nested, still remove the VM wide AVIC inhibit to
+	 * support the case in which the interrupt window was requested when
+	 * the vCPU was not running nested.
+	 *
+	 * All vCPUs that still run nested will keep their AVIC inhibited
+	 * due to the per-vCPU AVIC inhibition.
*/
kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
@@ -2914,7 +2995,6 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
static int pause_interception(struct kvm_vcpu *vcpu)
{
bool in_kernel;
-
/*
* CPL is not made available for an SEV-ES guest, therefore
* vcpu->arch.preempted_in_kernel can never be true. Just
@@ -2922,8 +3002,7 @@ static int pause_interception(struct kvm_vcpu *vcpu)
*/
in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
- if (!kvm_pause_in_guest(vcpu->kvm))
- grow_ple_window(vcpu);
+ grow_ple_window(vcpu);
kvm_vcpu_on_spin(vcpu, in_kernel);
return kvm_skip_emulated_instruction(vcpu);
@@ -3496,14 +3575,20 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
* enabled, the STGI interception will not occur. Enable the irq
* window under the assumption that the hardware will set the GIF.
*/
- if (vgif_enabled(svm) || gif_set(svm)) {
+ if (vgif || gif_set(svm)) {
/*
* IRQ window is not needed when AVIC is enabled,
* unless we have pending ExtINT since it cannot be injected
- * via AVIC. In such case, we need to temporarily disable AVIC,
+ * via AVIC. In such case, KVM needs to temporarily disable AVIC,
* and fallback to injecting IRQ via V_IRQ.
+ *
+ * If running nested, AVIC is already locally inhibited
+ * on this vCPU, therefore there is no need to request
+ * the VM wide AVIC inhibition.
*/
- kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+ if (!is_guest_mode(vcpu))
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
svm_set_vintr(svm);
}
}
@@ -3516,7 +3601,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
- if (vgif_enabled(svm))
+ if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
return; /* STGI will cause a vm exit */
}
@@ -3946,6 +4031,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+ svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
+
+ svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+
+ svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+
+ svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+
+ svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
svm_recalc_instruction_intercepts(vcpu, svm);
@@ -3963,13 +4059,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
*/
if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
-
- /*
- * Currently, AVIC does not work with nested virtualization.
- * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
- */
- if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED);
}
init_vmcb_after_set_cpuid(vcpu);
}
@@ -4224,7 +4313,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
- ret = nested_svm_vmexit(svm);
+ ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
if (ret)
return ret;
@@ -4321,7 +4410,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (!gif_set(svm)) {
- if (vgif_enabled(svm))
+ if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
/* STGI will cause a vm exit */
} else {
@@ -4605,7 +4694,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.sched_in = svm_sched_in,
- .pmu_ops = &amd_pmu_ops,
.nested_ops = &svm_nested_ops,
.deliver_interrupt = svm_deliver_interrupt,
@@ -4632,6 +4720,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+ .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
};
/*
@@ -4695,6 +4784,20 @@ static __init void svm_set_cpu_caps(void)
if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+ if (vls)
+ kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
+ if (lbrv)
+ kvm_cpu_cap_set(X86_FEATURE_LBRV);
+
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
+ kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+ if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+ kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
+ if (vgif)
+ kvm_cpu_cap_set(X86_FEATURE_VGIF);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
@@ -4806,15 +4909,20 @@ static __init int svm_hardware_setup(void)
nrips = false;
}
- enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
+ enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
if (enable_apicv) {
- pr_info("AVIC enabled\n");
+ if (!boot_cpu_has(X86_FEATURE_AVIC)) {
+			pr_warn("AVIC is not supported in CPUID but force enabled\n");
+			pr_warn("Your system might crash and burn\n");
+ } else
+ pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
} else {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
+ svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
}
if (vls) {
@@ -4879,6 +4987,7 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
.check_processor_compatibility = svm_check_processor_compat,
.runtime_ops = &svm_x86_ops,
+ .pmu_ops = &amd_pmu_ops,
};
static int __init svm_init(void)
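
For context on the DEBUGCTL rework above: LBR recording is now toggled purely by the guest's MSR write, with svm_update_lbrv() deciding which VMCB owns the state. A guest-side sketch, illustrative only, using the architectural MSR number rather than any kernel helper (the function names are hypothetical):

    #include <stdint.h>

    #define MSR_IA32_DEBUGCTLMSR    0x000001d9
    #define DEBUGCTLMSR_LBR         (1ULL << 0)

    static inline void wrmsr64(uint32_t msr, uint64_t val)
    {
            asm volatile("wrmsr" : : "c"(msr),
                         "a"((uint32_t)val), "d"((uint32_t)(val >> 32)));
    }

    /* Writing DEBUGCTL.LBR from the guest is what drives svm_update_lbrv() in KVM. */
    static void guest_enable_lbr(void)
    {
            wrmsr64(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR);
    }
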
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f77a7d2d39dd..e246793cbeae 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -33,6 +33,7 @@
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
+extern int vgif;
extern bool intercept_smi;
/*
@@ -231,9 +232,14 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
- /* cached guest cpuid flags for faster access */
+ /* optional nested SVM features that are enabled for this guest */
bool nrips_enabled : 1;
bool tsc_scaling_enabled : 1;
+ bool v_vmload_vmsave_enabled : 1;
+ bool lbrv_enabled : 1;
+ bool pause_filter_enabled : 1;
+ bool pause_threshold_enabled : 1;
+ bool vgif_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
@@ -452,44 +458,70 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
return vmcb_is_intercept(&svm->vmcb->control, bit);
}
-static inline bool vgif_enabled(struct vcpu_svm *svm)
+static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{
- return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+ return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
+{
+ if (!vgif)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
+ return svm->nested.vmcb02.ptr;
+ else
+ return svm->vmcb01.ptr;
}
static inline void enable_gif(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- svm->vmcb->control.int_ctl |= V_GIF_MASK;
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl |= V_GIF_MASK;
else
svm->vcpu.arch.hflags |= HF_GIF_MASK;
}
static inline void disable_gif(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl &= ~V_GIF_MASK;
else
svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
}
static inline bool gif_set(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_GIF_MASK);
else
return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+ return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
/* svm.c */
#define MSR_INVALID 0xffffffffU
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
extern bool dump_invalid_vmcb;
u32 svm_msrpm_offset(u32 msr);
u32 *svm_vcpu_alloc_msrpm(void);
void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
void svm_vcpu_free_msrpm(u32 *msrpm);
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+void svm_update_lbrv(struct kvm_vcpu *vcpu);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
@@ -574,7 +606,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
-void avic_init_vmcb(struct vcpu_svm *svm);
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
@@ -592,6 +624,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
/* sev.c */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f18744f7ff82..838ac7ab5950 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3695,12 +3695,34 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
}
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+ struct vmcs12 *vmcs12,
+ u32 vm_exit_reason, u32 exit_intr_info)
{
u32 idt_vectoring;
unsigned int nr;
- if (vcpu->arch.exception.injected) {
+ /*
+ * Per the SDM, VM-Exits due to double and triple faults are never
+ * considered to occur during event delivery, even if the double/triple
+ * fault is the result of an escalating vectoring issue.
+ *
+ * Note, the SDM qualifies the double fault behavior with "The original
+ * event results in a double-fault exception". It's unclear why the
+ * qualification exists since exits due to double fault can occur only
+ * while vectoring a different exception (injected events are never
+ * subject to interception), i.e. there's _always_ an original event.
+ *
+ * The SDM also uses NMI as a confusing example for the "original event
+ * causes the VM exit directly" clause. NMI isn't special in any way,
+ * the same rule applies to all events that cause an exit directly.
+ * NMI is an odd choice for the example because NMIs can only occur on
+ * instruction boundaries, i.e. they _can't_ occur during vectoring.
+ */
+ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
+ ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
+ is_double_fault(exit_intr_info))) {
+ vmcs12->idt_vectoring_info_field = 0;
+ } else if (vcpu->arch.exception.injected) {
nr = vcpu->arch.exception.nr;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
@@ -3733,6 +3755,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
idt_vectoring |= INTR_TYPE_EXT_INTR;
vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else {
+ vmcs12->idt_vectoring_info_field = 0;
}
}
@@ -4202,12 +4226,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (to_vmx(vcpu)->exit_reason.enclave_mode)
vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
vmcs12->exit_qualification = exit_qualification;
- vmcs12->vm_exit_intr_info = exit_intr_info;
-
- vmcs12->idt_vectoring_info_field = 0;
- vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ /*
+ * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
+ * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
+ * exit info fields are unmodified.
+ */
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
vmcs12->launch_state = 1;
@@ -4219,7 +4243,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Transfer the event that L0 or L1 may wanted to inject into
* L2 to IDT_VECTORING_INFO_FIELD.
*/
- vmcs12_save_pending_event(vcpu, vmcs12);
+ vmcs12_save_pending_event(vcpu, vmcs12,
+ vm_exit_reason, exit_intr_info);
+
+ vmcs12->vm_exit_intr_info = exit_intr_info;
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
* According to spec, there's no need to store the guest's
@@ -4518,9 +4547,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
- /* Similarly, triple faults in L2 should never escape. */
- WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
-
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
/*
* KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bc3f8512bb64..9db662399487 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -723,7 +723,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
intel_pmu_release_guest_lbr_event(vcpu);
}
-struct kvm_pmu_ops intel_pmu_ops = {
+struct kvm_pmu_ops intel_pmu_ops __initdata = {
.pmc_perf_hw_id = intel_pmc_perf_hw_id,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index e325c290a816..2b9d7a7e83f7 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -104,6 +104,11 @@ static inline bool is_breakpoint(u32 intr_info)
return is_exception_n(intr_info, BP_VECTOR);
}
+static inline bool is_double_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, DF_VECTOR);
+}
+
static inline bool is_page_fault(u32 intr_info)
{
return is_exception_n(intr_info, PF_VECTOR);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 04d170c4b61e..df0b70ccd289 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2444,7 +2444,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_exec_control) < 0)
return -EIO;
#ifdef CONFIG_X86_64
- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
#endif
@@ -4380,7 +4380,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_secondary_exec_ctrls())
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
- if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+ if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -5405,9 +5405,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
? PFERR_FETCH_MASK : 0;
/* ept page table entry is present? */
- error_code |= (exit_qualification &
- (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
- EPT_VIOLATION_EXECUTABLE))
+ error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
? PFERR_PRESENT_MASK : 0;
error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
@@ -7818,7 +7816,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
- .pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
.pi_update_irte = vmx_pi_update_irte,
@@ -8072,6 +8069,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.handle_intel_pt_intr = NULL,
.runtime_ops = &vmx_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
};
static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 07d789b1d366..1bc67995cc8a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -961,11 +961,13 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
- (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
- (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
- vcpu->arch.pkru != vcpu->arch.host_pkru)
+ vcpu->arch.pkru != vcpu->arch.host_pkru &&
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
write_pkru(vcpu->arch.pkru);
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
@@ -974,13 +976,15 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return;
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
- (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
- (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
write_pkru(vcpu->arch.host_pkru);
}
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
@@ -2249,14 +2253,13 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
- vcpu->arch.pv_time_enabled = false;
- if (!(system_time & 1))
- return;
-
- if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
- &vcpu->arch.pv_time, system_time & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info)))
- vcpu->arch.pv_time_enabled = true;
+ if (system_time & 1) {
+ kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+ KVM_HOST_USES_PFN, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info));
+ } else {
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+ }
return;
}
@@ -2961,63 +2964,55 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return data.clock;
}
-static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
- struct gfn_to_hva_cache *cache,
- unsigned int offset)
+static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
+ struct gfn_to_pfn_cache *gpc,
+ unsigned int offset)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
- struct pvclock_vcpu_time_info guest_hv_clock;
+ struct pvclock_vcpu_time_info *guest_hv_clock;
+ unsigned long flags;
- if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
- &guest_hv_clock, offset, sizeof(guest_hv_clock))))
- return;
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ offset + sizeof(*guest_hv_clock))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ offset + sizeof(*guest_hv_clock)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
- /* This VCPU is paused, but it's legal for a guest to read another
+ guest_hv_clock = (void *)(gpc->khva + offset);
+
+ /*
+ * This VCPU is paused, but it's legal for a guest to read another
* VCPU's kvmclock, so we really have to follow the specification where
* it says that version is odd if data is being modified, and even after
* it is consistent.
- *
- * Version field updates must be kept separate. This is because
- * kvm_write_guest_cached might use a "rep movs" instruction, and
- * writes within a string instruction are weakly ordered. So there
- * are three writes overall.
- *
- * As a small optimization, only write the version field in the first
- * and third write. The vcpu->pv_time cache is still valid, because the
- * version field is the first in the struct.
*/
- BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
- if (guest_hv_clock.version & 1)
- ++guest_hv_clock.version; /* first time write, random junk */
-
- vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock.version));
+ guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+ vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
if (vcpu->pvclock_set_guest_stopped_request) {
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
vcpu->pvclock_set_guest_stopped_request = false;
}
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+ memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+ smp_wmb();
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock));
+ guest_hv_clock->version = ++vcpu->hv_clock.version;
- smp_wmb();
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ read_unlock_irqrestore(&gpc->lock, flags);
- vcpu->hv_clock.version++;
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock.version));
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -3106,13 +3101,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
- if (vcpu->pv_time_enabled)
- kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
- if (vcpu->xen.vcpu_info_set)
- kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
- offsetof(struct compat_vcpu_info, time));
- if (vcpu->xen.vcpu_time_info_set)
- kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
+ if (vcpu->pv_time.active)
+ kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+ if (vcpu->xen.vcpu_info_cache.active)
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (vcpu->xen.vcpu_time_info_cache.active)
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -3300,7 +3295,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- vcpu->arch.pv_time_enabled = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
vcpu->arch.time = 0;
}
@@ -4284,7 +4279,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
KVM_XEN_HVM_CONFIG_SHARED_INFO |
- KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
+ KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
break;
@@ -4331,6 +4327,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = boot_cpu_has(X86_FEATURE_XSAVE);
break;
case KVM_CAP_TSC_CONTROL:
+ case KVM_CAP_VM_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
case KVM_CAP_X2APIC_API:
@@ -5102,7 +5099,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.pv_time_enabled)
+ if (!vcpu->arch.pv_time.active)
return -EINVAL;
vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -6186,7 +6183,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
mutex_lock(&kvm->lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!vcpu->arch.pv_time_enabled)
+ if (!vcpu->arch.pv_time.active)
continue;
ret = kvm_set_guest_paused(vcpu);
@@ -6513,6 +6510,15 @@ set_pit2_out:
r = kvm_xen_hvm_set_attr(kvm, &xha);
break;
}
+ case KVM_XEN_HVM_EVTCHN_SEND: {
+ struct kvm_irq_routing_xen_evtchn uxe;
+
+ r = -EFAULT;
+ if (copy_from_user(&uxe, argp, sizeof(uxe)))
+ goto out;
+ r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
+ break;
+ }
#endif
case KVM_SET_CLOCK:
r = kvm_vm_ioctl_set_clock(kvm, argp);
@@ -6520,6 +6526,28 @@ set_pit2_out:
case KVM_GET_CLOCK:
r = kvm_vm_ioctl_get_clock(kvm, argp);
break;
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ user_tsc_khz = (u32)arg;
+
+ if (kvm_has_tsc_control &&
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
+ goto out;
+
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = READ_ONCE(kvm->arch.default_tsc_khz);
+ goto out;
+ }
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (!kvm_x86_ops.mem_enc_ioctl)
@@ -7229,15 +7257,8 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
exception, &write_emultor);
}
-#define CMPXCHG_TYPE(t, ptr, old, new) \
- (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
-
-#ifdef CONFIG_X86_64
-# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
-#else
-# define CMPXCHG64(ptr, old, new) \
- (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
-#endif
+#define emulator_try_cmpxchg_user(t, ptr, old, new) \
+ (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
@@ -7246,12 +7267,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
unsigned int bytes,
struct x86_exception *exception)
{
- struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u64 page_line_mask;
+ unsigned long hva;
gpa_t gpa;
- char *kaddr;
- bool exchanged;
+ int r;
/* guests cmpxchg8b have to be emulated atomically */
if (bytes > 8 || (bytes & (bytes - 1)))
@@ -7275,31 +7295,32 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
goto emul_write;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
+ hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
+	if (kvm_is_error_hva(hva))
goto emul_write;
- kaddr = map.hva + offset_in_page(gpa);
+ hva += offset_in_page(gpa);
switch (bytes) {
case 1:
- exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u8, hva, old, new);
break;
case 2:
- exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u16, hva, old, new);
break;
case 4:
- exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u32, hva, old, new);
break;
case 8:
- exchanged = CMPXCHG64(kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u64, hva, old, new);
break;
default:
BUG();
}
- kvm_vcpu_unmap(vcpu, &map, true);
-
- if (!exchanged)
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
+ if (r)
return X86EMUL_CMPXCHG_FAILED;
kvm_page_track_write(vcpu, gpa, new, bytes);
@@ -8789,22 +8810,22 @@ static int kvmclock_cpu_online(unsigned int cpu)
static void kvm_timer_init(void)
{
- max_tsc_khz = tsc_khz;
-
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy *policy;
- int cpu;
-
- cpu = get_cpu();
- policy = cpufreq_cpu_get(cpu);
- if (policy) {
- if (policy->cpuinfo.max_freq)
- max_tsc_khz = policy->cpuinfo.max_freq;
- cpufreq_cpu_put(policy);
+ max_tsc_khz = tsc_khz;
+
+ if (IS_ENABLED(CONFIG_CPU_FREQ)) {
+ struct cpufreq_policy *policy;
+ int cpu;
+
+ cpu = get_cpu();
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ if (policy->cpuinfo.max_freq)
+ max_tsc_khz = policy->cpuinfo.max_freq;
+ cpufreq_cpu_put(policy);
+ }
+ put_cpu();
}
- put_cpu();
-#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
@@ -9089,6 +9110,14 @@ bool kvm_apicv_activated(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_apicv_activated);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
+{
+ ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
+ ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+
+ return (vm_reasons | vcpu_reasons) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
enum kvm_apicv_inhibit reason, bool set)
@@ -9266,6 +9295,17 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
+ /*
+ * If the quirk is disabled, synthesize a #UD and let the guest pick up
+ * the pieces.
+ */
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+ ctxt->exception.error_code_valid = false;
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->have_exception = true;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
@@ -9763,7 +9803,8 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
down_read(&vcpu->kvm->arch.apicv_update_lock);
- activate = kvm_apicv_activated(vcpu->kvm);
+ activate = kvm_vcpu_apicv_activated(vcpu);
+
if (vcpu->arch.apicv_active == activate)
goto out;
@@ -10166,7 +10207,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
	 * per-VM state, and responding vCPUs must wait for the update
* to complete before servicing KVM_REQ_APICV_UPDATE.
*/
- WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+ WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
@@ -10364,6 +10405,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
break;
kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
+ if (kvm_xen_has_pending_events(vcpu))
+ kvm_xen_inject_pending_events(vcpu);
+
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
@@ -11249,9 +11293,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+ kvm_xen_init_vcpu(vcpu);
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
+ kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
kvm_init_mmu(vcpu);
vcpu_put(vcpu);
@@ -11306,6 +11351,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
+ kvm_xen_destroy_vcpu(vcpu);
kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
@@ -11567,6 +11613,24 @@ void kvm_arch_hardware_disable(void)
drop_user_return_notifiers();
}
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
int kvm_arch_hardware_setup(void *opaque)
{
struct kvm_x86_init_ops *ops = opaque;
@@ -11581,8 +11645,7 @@ int kvm_arch_hardware_setup(void *opaque)
if (r != 0)
return r;
- memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
- kvm_ops_static_call_update();
+ kvm_ops_update(ops);
kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
@@ -11698,6 +11761,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
pvclock_update_vm_gtod_copy(kvm);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+ kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
kvm->arch.guest_can_read_msr_platform_info = true;
kvm->arch.enable_pmu = enable_pmu;
@@ -12179,6 +12243,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
return true;
+ if (kvm_xen_has_pending_events(vcpu))
+ return true;
+
return false;
}
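
The VM-scoped KVM_SET_TSC_KHZ/KVM_GET_TSC_KHZ handling above lets a VMM set the default guest TSC frequency once, before any vCPU is created, instead of per vCPU. An illustrative userspace sketch (error handling elided; vm_fd and the helper names are assumptions, with vm_fd an already-open VM file descriptor):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int set_default_guest_tsc_khz(int vm_fd, unsigned int khz)
    {
            /* Passing 0 falls back to the host's tsc_khz, per the code above. */
            return ioctl(vm_fd, KVM_SET_TSC_KHZ, khz);
    }

    static int get_default_guest_tsc_khz(int vm_fd)
    {
            return ioctl(vm_fd, KVM_GET_TSC_KHZ, 0);
    }
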
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index bf6cc25eee76..610beba35907 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -9,17 +9,25 @@
#include "x86.h"
#include "xen.h"
#include "hyperv.h"
+#include "lapic.h"
+#include <linux/eventfd.h>
#include <linux/kvm_host.h>
#include <linux/sched/stat.h>
#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/version.h>
#include <xen/interface/event_channel.h>
+#include <xen/interface/sched.h>
#include "trace.h"
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
+
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
@@ -102,6 +110,66 @@ out:
return ret;
}
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
+ struct kvm_xen_evtchn e;
+
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ kvm_xen_set_evtchn(&e, vcpu->kvm);
+
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ }
+}
+
+static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
+ arch.xen.timer);
+ if (atomic_read(&vcpu->arch.xen.timer_pending))
+ return HRTIMER_NORESTART;
+
+ atomic_inc(&vcpu->arch.xen.timer_pending);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+{
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ vcpu->arch.xen.timer_expires = guest_abs;
+
+ if (delta_ns <= 0) {
+ xen_timer_callback(&vcpu->arch.xen.timer);
+ } else {
+ ktime_t ktime_now = ktime_get();
+ hrtimer_start(&vcpu->arch.xen.timer,
+ ktime_add_ns(ktime_now, delta_ns),
+ HRTIMER_MODE_ABS_HARD);
+ }
+}
+
+static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+}
+
+static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+ vcpu->arch.xen.timer.function = xen_timer_callback;
+}
+
static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
@@ -133,27 +201,36 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
- struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
- struct kvm_memslots *slots = kvm_memslots(v->kvm);
- bool atomic = (state == RUNSTATE_runnable);
- uint64_t state_entry_time;
- int __user *user_state;
- uint64_t __user *user_times;
+ struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
+ uint64_t *user_times;
+ unsigned long flags;
+ size_t user_len;
+ int *user_state;
kvm_xen_update_runstate(v, state);
- if (!vx->runstate_set)
+ if (!vx->runstate_cache.active)
return;
- if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
- kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
- return;
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+ user_len = sizeof(struct vcpu_runstate_info);
+ else
+ user_len = sizeof(struct compat_vcpu_runstate_info);
- /* We made sure it fits in a single page */
- BUG_ON(!ghc->memslot);
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ user_len)) {
+ read_unlock_irqrestore(&gpc->lock, flags);
- if (atomic)
- pagefault_disable();
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (state == RUNSTATE_runnable)
+ return;
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
/*
* The only difference between 32-bit and 64-bit versions of the
@@ -167,38 +244,33 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
- user_state = (int __user *)ghc->hva;
-
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
-
- user_times = (uint64_t __user *)(ghc->hva +
- offsetof(struct compat_vcpu_runstate_info,
- state_entry_time));
#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
-
- if (v->kvm->arch.xen.long_mode)
- user_times = (uint64_t __user *)(ghc->hva +
- offsetof(struct vcpu_runstate_info,
- state_entry_time));
#endif
+
+ user_state = gpc->khva;
+
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+ user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ else
+ user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+
/*
* First write the updated state_entry_time at the appropriate
* location determined by 'offset'.
*/
- state_entry_time = vx->runstate_entry_time;
- state_entry_time |= XEN_RUNSTATE_UPDATE;
-
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
- sizeof(state_entry_time));
+ sizeof(user_times[0]));
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
- sizeof(state_entry_time));
+ sizeof(user_times[0]));
- if (__put_user(state_entry_time, user_times))
- goto out;
+ user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
smp_wmb();
/*
@@ -212,8 +284,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- if (__put_user(vx->current_runstate, user_state))
- goto out;
+ *user_state = vx->current_runstate;
/*
* Write the actual runstate times immediately after the
@@ -228,42 +299,114 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
- goto out;
+ memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
smp_wmb();
/*
* Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
* runstate_entry_time field.
*/
- state_entry_time &= ~XEN_RUNSTATE_UPDATE;
- __put_user(state_entry_time, user_times);
+ user_times[0] &= ~XEN_RUNSTATE_UPDATE;
smp_wmb();
- out:
- mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+ read_unlock_irqrestore(&gpc->lock, flags);
- if (atomic)
- pagefault_enable();
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
}
-int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+{
+ struct kvm_lapic_irq irq = { };
+ int r;
+
+ irq.dest_id = v->vcpu_id;
+ irq.vector = v->arch.xen.upcall_vector;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.level = 1;
+
+ /* The fast version will always work for physical unicast */
+ WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+}
+
+/*
+ * On event channel delivery, the vcpu_info may not have been accessible.
+ * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
+ * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
+ * Do so now that we can sleep in the context of the vCPU to bring the
+ * page in, and refresh the pfn cache for it.
+ */
+void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
{
unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
- bool atomic = in_atomic() || !task_is_running(current);
- int err;
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
+
+ if (!evtchn_pending_sel)
+ return;
+
+ /*
+ * Yes, this is an open-coded loop. But that's just what put_user()
+ * does anyway. Page it in and retry the instruction. We're just a
+ * little more honest about it.
+ */
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ /* Now gpc->khva is a valid kernel address for the vcpu_info */
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ struct vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orq %0, %1\n"
+ "notq %0\n"
+ LOCK_PREFIX "andq %0, %2\n"
+ : "=r" (evtchn_pending_sel),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ } else {
+ u32 evtchn_pending_sel32 = evtchn_pending_sel;
+ struct compat_vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orl %0, %1\n"
+ "notl %0\n"
+ LOCK_PREFIX "andl %0, %2\n"
+ : "=r" (evtchn_pending_sel32),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel32));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ }
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (v->arch.xen.upcall_vector)
+ kvm_xen_inject_vcpu_vector(v);
+
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
u8 rc = 0;
/*
* If the global upcall vector (HVMIRQ_callback_vector) is set and
* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
*/
- struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
- struct kvm_memslots *slots = kvm_memslots(v->kvm);
- bool ghc_valid = slots->generation == ghc->generation &&
- !kvm_is_error_hva(ghc->hva) && ghc->memslot;
-
- unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
/* No need for compat handling here */
BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
@@ -273,101 +416,35 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
BUILD_BUG_ON(sizeof(rc) !=
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
- /*
- * For efficiency, this mirrors the checks for using the valid
- * cache in kvm_read_guest_offset_cached(), but just uses
- * __get_user() instead. And falls back to the slow path.
- */
- if (!evtchn_pending_sel && ghc_valid) {
- /* Fast path */
- pagefault_disable();
- err = __get_user(rc, (u8 __user *)ghc->hva + offset);
- pagefault_enable();
- if (!err)
- return rc;
- }
-
- /* Slow path */
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
- /*
- * This function gets called from kvm_vcpu_block() after setting the
- * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
- * from a HLT. So we really mustn't sleep. If the page ended up absent
- * at that point, just return 1 in order to trigger an immediate wake,
- * and we'll end up getting called again from a context where we *can*
- * fault in the page and wait for it.
- */
- if (atomic)
- return 1;
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ */
+ if (in_atomic() || !task_is_running(current))
+ return 1;
- if (!ghc_valid) {
- err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
- if (err || !ghc->memslot) {
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
*/
return 0;
}
+ read_lock_irqsave(&gpc->lock, flags);
}
- /*
- * Now we have a valid (protected by srcu) userspace HVA in
- * ghc->hva which points to the struct vcpu_info. If there
- * are any bits in the in-kernel evtchn_pending_sel then
- * we need to write those to the guest vcpu_info and set
- * its evtchn_upcall_pending flag. If there aren't any bits
- * to add, we only want to *check* evtchn_upcall_pending.
- */
- if (evtchn_pending_sel) {
- bool long_mode = v->kvm->arch.xen.long_mode;
-
- if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
- return 0;
-
- if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
- struct vcpu_info __user *vi = (void __user *)ghc->hva;
-
- /* Attempt to set the evtchn_pending_sel bits in the
- * guest, and if that succeeds then clear the same
- * bits in the in-kernel version. */
- asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
- "\tnotq %0\n"
- "\t" LOCK_PREFIX "andq %0, %2\n"
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- : "=r" (evtchn_pending_sel),
- "+m" (vi->evtchn_pending_sel),
- "+m" (v->arch.xen.evtchn_pending_sel)
- : "0" (evtchn_pending_sel));
- } else {
- struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
- u32 evtchn_pending_sel32 = evtchn_pending_sel;
-
- /* Attempt to set the evtchn_pending_sel bits in the
- * guest, and if that succeeds then clear the same
- * bits in the in-kernel version. */
- asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
- "\tnotl %0\n"
- "\t" LOCK_PREFIX "andl %0, %2\n"
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- : "=r" (evtchn_pending_sel32),
- "+m" (vi->evtchn_pending_sel),
- "+m" (v->arch.xen.evtchn_pending_sel)
- : "0" (evtchn_pending_sel32));
- }
- rc = 1;
- unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
-
- err:
- user_access_end();
-
- mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
- } else {
- __get_user(rc, (u8 __user *)ghc->hva + offset);
- }
-
+ rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
+ read_unlock_irqrestore(&gpc->lock, flags);
return rc;
}
@@ -375,36 +452,51 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
- mutex_lock(&kvm->lock);
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
r = -EINVAL;
} else {
+ mutex_lock(&kvm->lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
+ mutex_unlock(&kvm->lock);
r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ mutex_lock(&kvm->lock);
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+ mutex_unlock(&kvm->lock);
break;
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
+ mutex_lock(&kvm->lock);
kvm->arch.xen.upcall_vector = data->u.vector;
+ mutex_unlock(&kvm->lock);
r = 0;
}
break;
+ case KVM_XEN_ATTR_TYPE_EVTCHN:
+ r = kvm_xen_setattr_evtchn(kvm, data);
+ break;
+
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ mutex_lock(&kvm->lock);
+ kvm->arch.xen.xen_version = data->u.xen_version;
+ mutex_unlock(&kvm->lock);
+ r = 0;
+ break;
+
default:
break;
}
- mutex_unlock(&kvm->lock);
return r;
}
@@ -433,6 +525,11 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ data->u.xen_version = kvm->arch.xen.xen_version;
+ r = 0;
+ break;
+
default:
break;
}
@@ -457,48 +554,34 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
offsetof(struct compat_vcpu_info, time));
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.vcpu_info_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}
- /* It must fit within a single page */
- if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
- r = -EINVAL;
- break;
- }
-
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_info_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct vcpu_info));
- if (!r) {
- vcpu->arch.xen.vcpu_info_set = true;
+ if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
+
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.vcpu_time_info_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}
- /* It must fit within a single page */
- if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
- r = -EINVAL;
- break;
- }
-
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_time_info_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct pvclock_vcpu_time_info));
- if (!r) {
- vcpu->arch.xen.vcpu_time_info_set = true;
+ if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
@@ -507,24 +590,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.runstate_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache);
r = 0;
break;
}
- /* It must fit within a single page */
- if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
- r = -EINVAL;
- break;
- }
-
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.runstate_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct vcpu_runstate_info));
- if (!r) {
- vcpu->arch.xen.runstate_set = true;
- }
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -622,6 +697,46 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = 0;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ if (data->u.vcpu_id >= KVM_MAX_VCPUS)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ if (data->u.timer.port) {
+ if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
+ r = -EINVAL;
+ break;
+ }
+ vcpu->arch.xen.timer_virq = data->u.timer.port;
+ kvm_xen_init_timer(vcpu);
+
+ /* Restart the timer if it's set */
+ if (data->u.timer.expires_ns)
+ kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
+ data->u.timer.expires_ns -
+ get_kvmclock_ns(vcpu->kvm));
+ } else if (kvm_xen_timer_enabled(vcpu)) {
+ kvm_xen_stop_timer(vcpu);
+ vcpu->arch.xen.timer_virq = 0;
+ }
+
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.upcall_vector = data->u.vector;
+ r = 0;
+ }
+ break;
+
default:
break;
}
@@ -639,7 +754,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
- if (vcpu->arch.xen.vcpu_info_set)
+ if (vcpu->arch.xen.vcpu_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
data->u.gpa = GPA_INVALID;
@@ -647,7 +762,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
- if (vcpu->arch.xen.vcpu_time_info_set)
+ if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
else
data->u.gpa = GPA_INVALID;
@@ -659,7 +774,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = -EOPNOTSUPP;
break;
}
- if (vcpu->arch.xen.runstate_set) {
+ if (vcpu->arch.xen.runstate_cache.active) {
data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
r = 0;
}
@@ -697,6 +812,23 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = -EINVAL;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ data->u.timer.port = vcpu->arch.xen.timer_virq;
+ data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+ data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = vcpu->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
default:
break;
}
@@ -777,7 +909,11 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
- if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
+ /* Only some feature flags need to be *enabled* by userspace */
+ u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+
+ if (xhc->flags & ~permitted_flags)
return -EINVAL;
/*
@@ -802,18 +938,6 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
return 0;
}
-void kvm_xen_init_vm(struct kvm *kvm)
-{
-}
-
-void kvm_xen_destroy_vm(struct kvm *kvm)
-{
- kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
-
- if (kvm->arch.xen_hvm_config.msr)
- static_branch_slow_dec_deferred(&kvm_xen_enabled);
-}
-
static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
kvm_rax_write(vcpu, result);
@@ -830,10 +954,268 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}
+static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+ evtchn_port_t *ports)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ unsigned long *pending_bits;
+ unsigned long flags;
+ bool ret = true;
+ int idx, i;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ idx = srcu_read_lock(&kvm->srcu);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ goto out_rcu;
+
+ ret = false;
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ }
+
+ for (i = 0; i < nr_ports; i++) {
+ if (test_bit(ports[i], pending_bits)) {
+ ret = true;
+ break;
+ }
+ }
+
+ out_rcu:
+ srcu_read_unlock(&kvm->srcu, idx);
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ return ret;
+}
+
+static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
+ u64 param, u64 *r)
+{
+ int idx, i;
+ struct sched_poll sched_poll;
+ evtchn_port_t port, *ports;
+ gpa_t gpa;
+
+ if (!longmode || !lapic_in_kernel(vcpu) ||
+ !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
+ return false;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
+ sizeof(sched_poll))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ if (unlikely(sched_poll.nr_ports > 1)) {
+		/* Xen (unofficially) limits the number of pollers to 128 */
+ if (sched_poll.nr_ports > 128) {
+ *r = -EINVAL;
+ return true;
+ }
+
+ ports = kmalloc_array(sched_poll.nr_ports,
+ sizeof(*ports), GFP_KERNEL);
+ if (!ports) {
+ *r = -ENOMEM;
+ return true;
+ }
+ } else
+ ports = &port;
+
+ for (i = 0; i < sched_poll.nr_ports; i++) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu,
+ (gva_t)(sched_poll.ports + i),
+ NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa,
+ &ports[i], sizeof(port))) {
+ *r = -EFAULT;
+ goto out;
+ }
+ }
+
+ if (sched_poll.nr_ports == 1)
+ vcpu->arch.xen.poll_evtchn = port;
+ else
+ vcpu->arch.xen.poll_evtchn = -1;
+
+ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+
+ if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+ if (sched_poll.timeout)
+ mod_timer(&vcpu->arch.xen.poll_timer,
+ jiffies + nsecs_to_jiffies(sched_poll.timeout));
+
+ kvm_vcpu_halt(vcpu);
+
+ if (sched_poll.timeout)
+ del_timer(&vcpu->arch.xen.poll_timer);
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+ }
+
+ vcpu->arch.xen.poll_evtchn = 0;
+ *r = 0;
+out:
+ /* Really, this is only needed in case of timeout */
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+
+ if (unlikely(sched_poll.nr_ports > 1))
+ kfree(ports);
+ return true;
+}
+
+static void cancel_evtchn_poll(struct timer_list *t)
+{
+ struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
+
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+}
+
+static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
+ int cmd, u64 param, u64 *r)
+{
+ switch (cmd) {
+ case SCHEDOP_poll:
+ if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
+ return true;
+ fallthrough;
+ case SCHEDOP_yield:
+ kvm_vcpu_on_spin(vcpu, true);
+ *r = 0;
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+struct compat_vcpu_set_singleshot_timer {
+ uint64_t timeout_abs_ns;
+ uint32_t flags;
+} __attribute__((packed));
+
+static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
+ int vcpu_id, u64 param, u64 *r)
+{
+ struct vcpu_set_singleshot_timer oneshot;
+ s64 delta;
+ gpa_t gpa;
+ int idx;
+
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ switch (cmd) {
+ case VCPUOP_set_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ /*
+ * The only difference for 32-bit compat is the 4 bytes of
+ * padding after the interesting part of the structure. So
+ * for a faithful emulation of Xen we have to *try* to copy
+ * the padding and return -EFAULT if we can't. Otherwise we
+ * might as well just have copied the 12-byte 32-bit struct.
+ */
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
+ offsetof(struct vcpu_set_singleshot_timer, flags));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, flags));
+
+ if (!gpa ||
+ kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) :
+ sizeof(struct compat_vcpu_set_singleshot_timer))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
+ if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
+ *r = -ETIME;
+ return true;
+ }
+
+ kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+ *r = 0;
+ return true;
+
+ case VCPUOP_stop_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+ kvm_xen_stop_timer(vcpu);
+ *r = 0;
+ return true;
+ }
+
+ return false;
+}
+
+static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
+ u64 *r)
+{
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ if (timeout) {
+ uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
+ int64_t delta = timeout - guest_now;
+
+ /* Xen has a 'Linux workaround' in do_set_timer_op() which
+ * checks for negative absolute timeout values (caused by
+ * integer overflow), and for values about 13 days in the
+ * future (2^50ns) which would be caused by jiffies
+ * overflow. For those cases, it sets the timeout 100ms in
+ * the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep
+ * churning CPU time by waking it up).
+ */
+ if (unlikely((int64_t)timeout < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ timeout = guest_now + delta;
+ }
+
+ kvm_xen_start_timer(vcpu, timeout, delta);
+ } else {
+ kvm_xen_stop_timer(vcpu);
+ }
+
+ *r = 0;
+ return true;
+}
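
The comment in kvm_xen_hcall_set_timer_op() above describes the clamping rule in prose; the hedged, standalone restatement below shows the same arithmetic with made-up clock values. It is an illustration, not KVM code.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC	1000000LL

/* Xen's 'Linux workaround' rule: negative absolute timeouts, or timeouts more
 * than about 2^50 ns (~13 days) ahead of the clock, become now + 100ms. */
static uint64_t clamp_timer_op(uint64_t timeout, uint64_t guest_now)
{
	int64_t delta = timeout - guest_now;

	if ((int64_t)timeout < 0 ||
	    (delta > 0 && (uint32_t)(delta >> 50) != 0))
		return guest_now + 100 * NSEC_PER_MSEC;

	return timeout;
}

int main(void)
{
	uint64_t now = 1000000000ULL;	/* arbitrary kvmclock value */

	/* A sane timeout passes through; an absurd one is pulled in to now+100ms. */
	printf("%llu\n", (unsigned long long)clamp_timer_op(now + 500, now));
	printf("%llu\n", (unsigned long long)clamp_timer_op(now + (1ULL << 51), now));
	return 0;
}
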
+
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
{
bool longmode;
- u64 input, params[6];
+ u64 input, params[6], r = -ENOSYS;
+ bool handled = false;
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -864,6 +1246,40 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
params[3], params[4], params[5]);
+ switch (input) {
+ case __HYPERVISOR_xen_version:
+ if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
+ r = vcpu->kvm->arch.xen.xen_version;
+ handled = true;
+ }
+ break;
+ case __HYPERVISOR_event_channel_op:
+ if (params[0] == EVTCHNOP_send)
+ handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
+ break;
+ case __HYPERVISOR_sched_op:
+ handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
+ params[1], &r);
+ break;
+ case __HYPERVISOR_vcpu_op:
+ handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
+ params[2], &r);
+ break;
+ case __HYPERVISOR_set_timer_op: {
+ u64 timeout = params[0];
+ /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
+ if (!longmode)
+ timeout |= params[1] << 32;
+ handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (handled)
+ return kvm_xen_hypercall_set_result(vcpu, r);
+
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
@@ -890,14 +1306,28 @@ static inline int max_evtchn_port(struct kvm *kvm)
return COMPAT_EVTCHN_2L_NR_CHANNELS;
}
+static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+{
+ int poll_evtchn = vcpu->arch.xen.poll_evtchn;
+
+ if ((poll_evtchn == port || poll_evtchn == -1) &&
+ test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
/*
- * This follows the kvm_set_irq() API, so it returns:
+ * The return value from this function is propagated to the kvm_set_irq() API,
+ * so it returns:
* < 0 Interrupt was ignored (masked or not delivered for other reasons)
* = 0 Interrupt was coalesced (previous irq is still pending)
* > 0 Number of CPUs interrupt was delivered to
+ *
+ * It is also called directly from kvm_arch_set_irq_inatomic(), where the
+ * only check on its return value is a comparison with -EWOULDBLOCK.
*/
-int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm)
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
struct kvm_vcpu *vcpu;
@@ -905,23 +1335,29 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
unsigned long flags;
int port_word_bit;
bool kick_vcpu = false;
- int idx;
- int rc;
+ int vcpu_idx, idx, rc;
- vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
- if (!vcpu)
- return -1;
+ vcpu_idx = READ_ONCE(xe->vcpu_idx);
+ if (vcpu_idx >= 0)
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ else {
+ vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
+ if (!vcpu)
+ return -EINVAL;
+ WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu));
+ }
- if (!vcpu->arch.xen.vcpu_info_set)
- return -1;
+ if (!vcpu->arch.xen.vcpu_info_cache.active)
+ return -EINVAL;
- if (e->xen_evtchn.port >= max_evtchn_port(kvm))
- return -1;
+ if (xe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
rc = -EWOULDBLOCK;
- read_lock_irqsave(&gpc->lock, flags);
idx = srcu_read_lock(&kvm->srcu);
+
+ read_lock_irqsave(&gpc->lock, flags);
if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
goto out_rcu;
@@ -929,12 +1365,12 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
struct shared_info *shinfo = gpc->khva;
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
- port_word_bit = e->xen_evtchn.port / 64;
+ port_word_bit = xe->port / 64;
} else {
struct compat_shared_info *shinfo = gpc->khva;
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
- port_word_bit = e->xen_evtchn.port / 32;
+ port_word_bit = xe->port / 32;
}
/*
@@ -944,39 +1380,68 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
* already set, then we kick the vCPU in question to write to the
* *real* evtchn_pending_sel in its own guest vcpu_info struct.
*/
- if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
+ if (test_and_set_bit(xe->port, pending_bits)) {
rc = 0; /* It was already raised */
- } else if (test_bit(e->xen_evtchn.port, mask_bits)) {
- rc = -1; /* Masked */
+ } else if (test_bit(xe->port, mask_bits)) {
+ rc = -ENOTCONN; /* Masked */
+ kvm_xen_check_poller(vcpu, xe->port);
} else {
- rc = 1; /* Delivered. But was the vCPU waking already? */
- if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
- kick_vcpu = true;
+ rc = 1; /* Delivered to the bitmap in shared_info. */
+ /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
+ read_unlock_irqrestore(&gpc->lock, flags);
+ gpc = &vcpu->arch.xen.vcpu_info_cache;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+ /*
+ * Could not access the vcpu_info. Set the bit in-kernel
+ * and prod the vCPU to deliver it for itself.
+ */
+ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+ goto out_rcu;
+ }
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ } else {
+ struct compat_vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit,
+ (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ }
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
+ kvm_xen_inject_vcpu_vector(vcpu);
+ kick_vcpu = false;
+ }
}
out_rcu:
- srcu_read_unlock(&kvm->srcu, idx);
read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
if (kick_vcpu) {
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
}
return rc;
}
-/* This is the version called from kvm_set_irq() as the .set function */
-static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
- int irq_source_id, int level, bool line_status)
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
bool mm_borrowed = false;
int rc;
- if (!level)
- return -1;
-
- rc = kvm_xen_set_evtchn_fast(e, kvm);
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
if (rc != -EWOULDBLOCK)
return rc;
@@ -1020,7 +1485,7 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
int idx;
- rc = kvm_xen_set_evtchn_fast(e, kvm);
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
if (rc != -EWOULDBLOCK)
break;
@@ -1037,11 +1502,27 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm
return rc;
}
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ if (!level)
+ return -EINVAL;
+
+ return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
+}
+
+/*
+ * Set up an event channel interrupt from the KVM IRQ routing table.
+ * Used e.g. for PIRQs from passed-through physical devices.
+ */
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
+ struct kvm_vcpu *vcpu;
+
if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
return -EINVAL;
@@ -1049,10 +1530,328 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
return -EINVAL;
+ /*
+ * Xen gives us interesting mappings from vCPU index to APIC ID,
+ * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
+ * to find it. Do that once at setup time, instead of every time.
+ * But beware that on live update / live migration, the routing
+ * table might be reinstated before the vCPU threads have finished
+ * recreating their vCPUs.
+ */
+ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+ if (vcpu)
+ e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu);
+ else
+ e->xen_evtchn.vcpu_idx = -1;
+
e->xen_evtchn.port = ue->u.xen_evtchn.port;
- e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+ e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
e->set = evtchn_set_fn;
return 0;
}
+
+/*
+ * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
+ */
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
+{
+ struct kvm_xen_evtchn e;
+ int ret;
+
+ if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ e.port = uxe->port;
+ e.vcpu_id = uxe->vcpu;
+ e.vcpu_idx = -1;
+ e.priority = uxe->priority;
+
+ ret = kvm_xen_set_evtchn(&e, kvm);
+
+ /*
+ * None of that 'return 1 if it actually got delivered' nonsense.
+ * We don't care if it was masked (-ENOTCONN) either.
+ */
+ if (ret > 0 || ret == -ENOTCONN)
+ ret = 0;
+
+ return ret;
+}
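
A hedged sketch of the userspace side of this ioctl, assuming uapi headers that already carry the definitions added later in this patch (KVM_XEN_HVM_EVTCHN_SEND, struct kvm_irq_routing_xen_evtchn). The vm_fd value is a placeholder for a Xen-enabled VM file descriptor, and error handling beyond perror() is trimmed.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

static int send_evtchn(int vm_fd, unsigned int port, unsigned int vcpu_id)
{
	struct kvm_irq_routing_xen_evtchn uxe = {
		.port = port,
		.vcpu = vcpu_id,
		/* Only 2-level event channels are supported for now. */
		.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
	};

	/* Returns 0 even if the port was masked (-ENOTCONN is eaten by KVM). */
	return ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &uxe);
}

int main(void)
{
	int vm_fd = -1;	/* placeholder so the sketch builds standalone */

	if (send_evtchn(vm_fd, 15, 0))
		perror("KVM_XEN_HVM_EVTCHN_SEND");
	return 0;
}
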
+
+/*
+ * Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
+ */
+struct evtchnfd {
+ u32 send_port;
+ u32 type;
+ union {
+ struct kvm_xen_evtchn port;
+ struct {
+ u32 port; /* zero */
+ struct eventfd_ctx *ctx;
+ } eventfd;
+ } deliver;
+};
+
+/*
+ * Update target vCPU or priority for a registered sending channel.
+ */
+static int kvm_xen_eventfd_update(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct evtchnfd *evtchnfd;
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
+ mutex_unlock(&kvm->lock);
+
+ if (!evtchnfd)
+ return -ENOENT;
+
+ /* For an UPDATE, nothing may change except the priority/vcpu */
+ if (evtchnfd->type != data->u.evtchn.type)
+ return -EINVAL;
+
+ /*
+ * Port cannot change, and if it's zero that was an eventfd
+ * which can't be changed either.
+ */
+ if (!evtchnfd->deliver.port.port ||
+ evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ }
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+/*
+ * Configure the target (eventfd or local port delivery) for sending on
+ * a given event channel.
+ */
+static int kvm_xen_eventfd_assign(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct eventfd_ctx *eventfd = NULL;
+ struct evtchnfd *evtchnfd = NULL;
+ int ret = -EINVAL;
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
+ if (!evtchnfd)
+ return -ENOMEM;
+
+ switch(data->u.evtchn.type) {
+ case EVTCHNSTAT_ipi:
+ /* IPI must map back to the same port# */
+ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
+ goto out; /* -EINVAL */
+ break;
+
+ case EVTCHNSTAT_interdomain:
+ if (data->u.evtchn.deliver.port.port) {
+ if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
+ goto out; /* -EINVAL */
+ } else {
+ eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto out;
+ }
+ }
+ break;
+
+ case EVTCHNSTAT_virq:
+ case EVTCHNSTAT_closed:
+ case EVTCHNSTAT_unbound:
+ case EVTCHNSTAT_pirq:
+ default: /* Unknown event channel type */
+ goto out; /* -EINVAL */
+ }
+
+ evtchnfd->send_port = data->u.evtchn.send_port;
+ evtchnfd->type = data->u.evtchn.type;
+ if (eventfd) {
+ evtchnfd->deliver.eventfd.ctx = eventfd;
+ } else {
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ goto out; /* -EINVAL; */
+
+ evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ }
+
+ mutex_lock(&kvm->lock);
+ ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
+ GFP_KERNEL);
+ mutex_unlock(&kvm->lock);
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+out:
+ if (eventfd)
+ eventfd_ctx_put(eventfd);
+ kfree(evtchnfd);
+ return ret;
+}
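
For the eventfd branch of kvm_xen_eventfd_assign() above, userspace registration could look roughly like the hedged sketch below. EVTCHNSTAT_interdomain comes from Xen's event_channel.h (value 2, as the selftest later in this patch also defines it); vm_fd is a placeholder and error handling is trimmed.

#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <stdio.h>

#define EVTCHNSTAT_interdomain	2	/* from Xen's event_channel.h */

static int assign_evtchn_to_eventfd(int vm_fd, unsigned int send_port, int efd)
{
	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
		.u.evtchn.send_port = send_port,
		.u.evtchn.type = EVTCHNSTAT_interdomain,
		.u.evtchn.flags = 0,
		.u.evtchn.deliver.eventfd.port = 0,	/* zero selects eventfd delivery */
		.u.evtchn.deliver.eventfd.fd = efd,
	};

	return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
}

int main(void)
{
	int efd = eventfd(0, 0);
	int vm_fd = -1;	/* placeholder so the sketch builds standalone */

	if (assign_evtchn_to_eventfd(vm_fd, 197, efd))
		perror("KVM_XEN_HVM_SET_ATTR");
	return 0;
}
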
+
+static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
+{
+ struct evtchnfd *evtchnfd;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
+ mutex_unlock(&kvm->lock);
+
+ if (!evtchnfd)
+ return -ENOENT;
+
+ if (kvm)
+ synchronize_srcu(&kvm->srcu);
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ return 0;
+}
+
+static int kvm_xen_eventfd_reset(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd;
+ int i;
+
+ mutex_lock(&kvm->lock);
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
+ synchronize_srcu(&kvm->srcu);
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ mutex_unlock(&kvm->lock);
+
+ return 0;
+}
+
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
+ return kvm_xen_eventfd_reset(kvm);
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
+ return kvm_xen_eventfd_deassign(kvm, port);
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
+ return kvm_xen_eventfd_update(kvm, data);
+ if (data->u.evtchn.flags)
+ return -EINVAL;
+
+ return kvm_xen_eventfd_assign(kvm, data);
+}
+
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
+{
+ struct evtchnfd *evtchnfd;
+ struct evtchn_send send;
+ gpa_t gpa;
+ int idx;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /* The evtchn_ports idr is protected by vcpu->kvm->srcu */
+ evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
+ if (!evtchnfd)
+ return false;
+
+ if (evtchnfd->deliver.port.port) {
+ int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
+ if (ret < 0 && ret != -ENOTCONN)
+ return false;
+ } else {
+ eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+ }
+
+ *r = 0;
+ return true;
+}
+
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
+ vcpu->arch.xen.poll_evtchn = 0;
+ timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
+}
+
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_stop_timer(vcpu);
+
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache);
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache);
+ del_timer_sync(&vcpu->arch.xen.poll_timer);
+}
+
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+ idr_init(&kvm->arch.xen.evtchn_ports);
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd;
+ int i;
+
+ kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ idr_destroy(&kvm->arch.xen.evtchn_ports);
+
+ if (kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
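
None of the new paths above are reachable until userspace opts in. A hedged sketch of that opt-in follows, using the existing KVM_XEN_HVM_CONFIG ioctl plus the KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag added by this patch; the MSR index mirrors the selftest's choice and vm_fd is a placeholder.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Opt in to kernel-accelerated event channel delivery (and hypercall
 * interception).  0x40000000 is the conventional Xen hypercall MSR used
 * by the selftests, not a requirement of the API. */
static int enable_xen_evtchn_send(int vm_fd)
{
	struct kvm_xen_hvm_config cfg = {
		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
			 KVM_XEN_HVM_CONFIG_EVTCHN_SEND,
		.msr = 0x40000000,
	};

	return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
}

int main(void)
{
	int vm_fd = -1;	/* placeholder so the sketch builds standalone */

	if (enable_xen_evtchn_send(vm_fd))
		perror("KVM_XEN_HVM_CONFIG");
	return 0;
}
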
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index adbcc9ed59db..ee5c4ae0755c 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -15,16 +15,19 @@
extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt);
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
void kvm_xen_init_vm(struct kvm *kvm);
void kvm_xen_destroy_vm(struct kvm *kvm);
-
-int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu);
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu);
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
struct kvm *kvm);
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
@@ -46,11 +49,33 @@ static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
{
if (static_branch_unlikely(&kvm_xen_enabled.key) &&
- vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->kvm->arch.xen.upcall_vector)
return __kvm_xen_has_interrupt(vcpu);
return 0;
}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.evtchn_pending_sel;
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return !!vcpu->arch.xen.timer_virq;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu))
+ return atomic_read(&vcpu->arch.xen.timer_pending);
+
+ return 0;
+}
+
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu);
#else
static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{
@@ -65,6 +90,14 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm)
{
}
+static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return false;
@@ -79,6 +112,29 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
{
return 0;
}
+
+static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
#endif
int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3f9b22c4983a..252ee4a61b58 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -611,7 +611,8 @@ struct kvm_hv_sint {
struct kvm_xen_evtchn {
u32 port;
- u32 vcpu;
+ u32 vcpu_id;
+ int vcpu_idx;
u32 priority;
};
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6a184d260c7f..e10d131edd80 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -444,6 +444,7 @@ struct kvm_run {
#define KVM_SYSTEM_EVENT_SHUTDOWN 1
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
+#define KVM_SYSTEM_EVENT_SEV_TERM 4
__u32 type;
__u32 ndata;
union {
@@ -1150,7 +1151,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_MEM_OP_EXTENSION 211
#define KVM_CAP_PMU_CAPABILITY 212
#define KVM_CAP_DISABLE_QUIRKS2 213
-/* #define KVM_CAP_VM_TSC_CONTROL 214 */
+#define KVM_CAP_VM_TSC_CONTROL 214
#define KVM_CAP_SYSTEM_EVENT_DATA 215
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1240,6 +1241,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
struct kvm_xen_hvm_config {
__u32 flags;
@@ -1478,7 +1480,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
/* Available with KVM_CAP_PPC_GET_PVINFO */
#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
-/* Available with KVM_CAP_TSC_CONTROL */
+/* Available with KVM_CAP_TSC_CONTROL for a vCPU, or with
+ * KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */
#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
/* Available with KVM_CAP_PCI_2_3 */
@@ -1694,6 +1697,32 @@ struct kvm_xen_hvm_attr {
struct {
__u64 gfn;
} shared_info;
+ struct {
+ __u32 send_port;
+ __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+ __u32 flags;
+#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0)
+#define KVM_XEN_EVTCHN_UPDATE (1 << 1)
+#define KVM_XEN_EVTCHN_RESET (1 << 2)
+ /*
+ * Events sent by the guest are either looped back to
+ * the guest itself (potentially on a different port#)
+ * or signalled via an eventfd.
+ */
+ union {
+ struct {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+ } port;
+ struct {
+ __u32 port; /* Zero for eventfd */
+ __s32 fd;
+ } eventfd;
+ __u32 padding[4];
+ } deliver;
+ } evtchn;
+ __u32 xen_version;
__u64 pad[8];
} u;
};
@@ -1702,11 +1731,17 @@ struct kvm_xen_hvm_attr {
#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0
#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1
#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3
+#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4
/* Per-vCPU Xen attributes */
#define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
#define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr)
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_HVM_EVTCHN_SEND _IOW(KVMIO, 0xd0, struct kvm_irq_routing_xen_evtchn)
+
#define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2)
#define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2)
@@ -1724,6 +1759,13 @@ struct kvm_xen_vcpu_attr {
__u64 time_blocked;
__u64 time_offline;
} runstate;
+ __u32 vcpu_id;
+ struct {
+ __u32 port;
+ __u32 priority;
+ __u64 expires_ns;
+ } timer;
+ __u8 vector;
} u;
};
@@ -1734,6 +1776,10 @@ struct kvm_xen_vcpu_attr {
#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3
#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4
#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6
+#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7
+#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
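
A hedged sketch of driving the new per-vCPU timer attribute from userspace with the u.timer layout above; vcpu_fd and the port value are placeholders, and an expires_ns of zero merely registers the timer port without arming it (per the set_attr handling earlier in this patch).

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

static int set_xen_timer(int vcpu_fd, unsigned int port, unsigned long long expires_ns)
{
	struct kvm_xen_vcpu_attr attr = {
		.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
		.u.timer.port = port,
		.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
		.u.timer.expires_ns = expires_ns,	/* 0 = registered but not armed */
	};

	return ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &attr);
}

int main(void)
{
	int vcpu_fd = -1;	/* placeholder so the sketch builds standalone */

	if (set_xen_timer(vcpu_fd, 13, 0))
		perror("KVM_XEN_VCPU_SET_ATTR");
	return 0;
}
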
diff --git a/init/Kconfig b/init/Kconfig
index ddcbefe535e9..b19e2eeaae80 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -77,6 +77,11 @@ config CC_HAS_ASM_GOTO_OUTPUT
depends on CC_HAS_ASM_GOTO
def_bool $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null)
+config CC_HAS_ASM_GOTO_TIED_OUTPUT
+ depends on CC_HAS_ASM_GOTO_OUTPUT
+	# Detect buggy gcc and clang, fixed in gcc-11 and clang-14.
+	def_bool $(success,echo 'int foo(int *x) { asm goto (".long (%l[bar]) - .\n": "+m"(*x) ::: bar); return *x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null)
+
config TOOLS_SUPPORT_RELR
def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh)
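
For readability, the one-line compile probe above expands to roughly the following standalone C file, meant to be compiled (cc -c), not run. It exercises an asm goto whose memory output operand is tied to an input, the construct that gcc before 11 and clang before 14 mishandle.

/* Equivalent of the CC_HAS_ASM_GOTO_TIED_OUTPUT probe: asm goto with a
 * tied ("+m") output operand plus a jump label. */
int foo(int *x)
{
	asm goto (".long (%l[bar]) - .\n"
		  : "+m" (*x)
		  :
		  :
		  : bar);
	return *x;
bar:
	return 0;
}
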
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 0b0e4402bba6..56140068b763 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -16,6 +16,7 @@
/x86_64/debug_regs
/x86_64/evmcs_test
/x86_64/emulator_error_test
+/x86_64/fix_hypercall_test
/x86_64/get_msr_index_features
/x86_64/kvm_clock_test
/x86_64/kvm_pv_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 681b173aa87c..af582d168621 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -48,6 +48,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
+TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@@ -65,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
+TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync
TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
new file mode 100644
index 000000000000..1f5c32146f3d
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Tests for the KVM_X86_QUIRK_FIX_HYPERCALL_INSN hypercall-patching quirk
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <linux/stringify.h>
+#include <stdint.h>
+
+#include "apic.h"
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+static bool ud_expected;
+
+static void guest_ud_handler(struct ex_regs *regs)
+{
+ GUEST_ASSERT(ud_expected);
+ GUEST_DONE();
+}
+
+extern unsigned char svm_hypercall_insn;
+static uint64_t svm_do_sched_yield(uint8_t apic_id)
+{
+ uint64_t ret;
+
+ asm volatile("mov %1, %%rax\n\t"
+ "mov %2, %%rbx\n\t"
+ "svm_hypercall_insn:\n\t"
+ "vmmcall\n\t"
+ "mov %%rax, %0\n\t"
+ : "=r"(ret)
+ : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id)
+ : "rax", "rbx", "memory");
+
+ return ret;
+}
+
+extern unsigned char vmx_hypercall_insn;
+static uint64_t vmx_do_sched_yield(uint8_t apic_id)
+{
+ uint64_t ret;
+
+ asm volatile("mov %1, %%rax\n\t"
+ "mov %2, %%rbx\n\t"
+ "vmx_hypercall_insn:\n\t"
+ "vmcall\n\t"
+ "mov %%rax, %0\n\t"
+ : "=r"(ret)
+ : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id)
+ : "rax", "rbx", "memory");
+
+ return ret;
+}
+
+static void assert_hypercall_insn(unsigned char *exp_insn, unsigned char *obs_insn)
+{
+ uint32_t exp = 0, obs = 0;
+
+ memcpy(&exp, exp_insn, sizeof(exp));
+ memcpy(&obs, obs_insn, sizeof(obs));
+
+ GUEST_ASSERT_EQ(exp, obs);
+}
+
+static void guest_main(void)
+{
+ unsigned char *native_hypercall_insn, *hypercall_insn;
+ uint8_t apic_id;
+
+ apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID));
+
+ if (is_intel_cpu()) {
+ native_hypercall_insn = &vmx_hypercall_insn;
+ hypercall_insn = &svm_hypercall_insn;
+ svm_do_sched_yield(apic_id);
+ } else if (is_amd_cpu()) {
+ native_hypercall_insn = &svm_hypercall_insn;
+ hypercall_insn = &vmx_hypercall_insn;
+ vmx_do_sched_yield(apic_id);
+ } else {
+ GUEST_ASSERT(0);
+ /* unreachable */
+ return;
+ }
+
+ GUEST_ASSERT(!ud_expected);
+ assert_hypercall_insn(native_hypercall_insn, hypercall_insn);
+ GUEST_DONE();
+}
+
+static void setup_ud_vector(struct kvm_vm *vm)
+{
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vm, VCPU_ID);
+ vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+}
+
+static void enter_guest(struct kvm_vm *vm)
+{
+ struct kvm_run *run;
+ struct ucall uc;
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ vcpu_run(vm, VCPU_ID);
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_SYNC:
+ pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]);
+ break;
+ case UCALL_DONE:
+ return;
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]);
+ default:
+ TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)",
+ uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason));
+ }
+}
+
+static void test_fix_hypercall(void)
+{
+ struct kvm_vm *vm;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_main);
+ setup_ud_vector(vm);
+
+ ud_expected = false;
+ sync_global_to_guest(vm, ud_expected);
+
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ enter_guest(vm);
+}
+
+static void test_fix_hypercall_disabled(void)
+{
+ struct kvm_enable_cap cap = {0};
+ struct kvm_vm *vm;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_main);
+ setup_ud_vector(vm);
+
+ cap.cap = KVM_CAP_DISABLE_QUIRKS2;
+ cap.args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN;
+ vm_enable_cap(vm, &cap);
+
+ ud_expected = true;
+ sync_global_to_guest(vm, ud_expected);
+
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ enter_guest(vm);
+}
+
+int main(void)
+{
+ if (!(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+		print_skip("KVM_X86_QUIRK_FIX_HYPERCALL_INSN not supported");
+ exit(KSFT_SKIP);
+ }
+
+ test_fix_hypercall();
+ test_fix_hypercall_disabled();
+}
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c
new file mode 100644
index 000000000000..f0083d8cfe98
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tsc_scaling_sync
+ *
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * Test that TSC scaling (KVM_CAP_VM_TSC_CONTROL) stays in sync across vCPUs
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <stdint.h>
+#include <time.h>
+#include <sched.h>
+#include <signal.h>
+#include <pthread.h>
+
+#define NR_TEST_VCPUS 20
+
+static struct kvm_vm *vm;
+pthread_spinlock_t create_lock;
+
+#define TEST_TSC_KHZ 2345678UL
+#define TEST_TSC_OFFSET 200000000
+
+uint64_t tsc_sync;
+static void guest_code(void)
+{
+ uint64_t start_tsc, local_tsc, tmp;
+
+ start_tsc = rdtsc();
+ do {
+ tmp = READ_ONCE(tsc_sync);
+ local_tsc = rdtsc();
+ WRITE_ONCE(tsc_sync, local_tsc);
+ if (unlikely(local_tsc < tmp))
+ GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0);
+
+ } while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ);
+
+ GUEST_DONE();
+}
+
+
+static void *run_vcpu(void *_cpu_nr)
+{
+ unsigned long cpu = (unsigned long)_cpu_nr;
+ unsigned long failures = 0;
+ static bool first_cpu_done;
+
+ /* The kernel is fine, but vm_vcpu_add_default() needs locking */
+ pthread_spin_lock(&create_lock);
+
+ vm_vcpu_add_default(vm, cpu, guest_code);
+
+ if (!first_cpu_done) {
+ first_cpu_done = true;
+ vcpu_set_msr(vm, cpu, MSR_IA32_TSC, TEST_TSC_OFFSET);
+ }
+
+ pthread_spin_unlock(&create_lock);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu_state(vm, cpu);
+ struct ucall uc;
+
+ vcpu_run(vm, cpu);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, cpu, &uc)) {
+ case UCALL_DONE:
+ goto out;
+
+ case UCALL_SYNC:
+ printf("Guest %ld sync %lx %lx %ld\n", cpu, uc.args[2], uc.args[3], uc.args[2] - uc.args[3]);
+ failures++;
+ break;
+
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+ out:
+ return (void *)failures;
+}
+
+int main(int argc, char *argv[])
+{
+ if (!kvm_check_cap(KVM_CAP_VM_TSC_CONTROL)) {
+ print_skip("KVM_CAP_VM_TSC_CONTROL not available");
+ exit(KSFT_SKIP);
+ }
+
+ vm = vm_create_default_with_vcpus(0, DEFAULT_STACK_PGS * NR_TEST_VCPUS, 0, guest_code, NULL);
+ vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ);
+
+ pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE);
+ pthread_t cpu_threads[NR_TEST_VCPUS];
+ unsigned long cpu;
+ for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++)
+ pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu);
+
+ unsigned long failures = 0;
+ for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) {
+ void *this_cpu_failures;
+ pthread_join(cpu_threads[cpu], &this_cpu_failures);
+ failures += (unsigned long)this_cpu_failures;
+ }
+
+ TEST_ASSERT(!failures, "TSC sync failed");
+ pthread_spin_destroy(&create_lock);
+ kvm_vm_free(vm);
+ return 0;
+}
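
Outside the selftest harness, the VM-wide TSC control exercised above boils down to two raw ioctls. A hedged sketch follows; vm_fd is a placeholder, and checking the capability on the VM fd (rather than on /dev/kvm as the test does) is just one valid option.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Set the default TSC frequency for a VM before any vCPUs are created. */
static int set_vm_tsc_khz(int vm_fd, unsigned long khz)
{
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL) <= 0)
		return -1;	/* not supported by this kernel */

	/* KVM_SET_TSC_KHZ takes the frequency as the ioctl argument itself. */
	return ioctl(vm_fd, KVM_SET_TSC_KHZ, khz);
}

int main(void)
{
	int vm_fd = -1;	/* placeholder so the sketch builds standalone */

	if (set_vm_tsc_khz(vm_fd, 2345678UL))
		perror("KVM_SET_TSC_KHZ");
	return 0;
}
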
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 865e17146815..5e6f40633ea6 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -39,12 +39,36 @@
#define EVTCHN_VECTOR 0x10
+#define EVTCHN_TEST1 15
+#define EVTCHN_TEST2 66
+#define EVTCHN_TIMER 13
+
static struct kvm_vm *vm;
#define XEN_HYPERCALL_MSR 0x40000000
#define MIN_STEAL_TIME 50000
+#define __HYPERVISOR_set_timer_op 15
+#define __HYPERVISOR_sched_op 29
+#define __HYPERVISOR_event_channel_op 32
+
+#define SCHEDOP_poll 3
+
+#define EVTCHNOP_send 4
+
+#define EVTCHNSTAT_interdomain 2
+
+struct evtchn_send {
+ u32 port;
+};
+
+struct sched_poll {
+ u32 *ports;
+ unsigned int nr_ports;
+ u64 timeout;
+};
+
struct pvclock_vcpu_time_info {
u32 version;
u32 pad0;
@@ -107,15 +131,25 @@ struct {
struct kvm_irq_routing_entry entries[2];
} irq_routes;
+bool guest_saw_irq;
+
static void evtchn_handler(struct ex_regs *regs)
{
struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
vi->evtchn_upcall_pending = 0;
vi->evtchn_pending_sel = 0;
+ guest_saw_irq = true;
GUEST_SYNC(0x20);
}
+static void guest_wait_for_irq(void)
+{
+ while (!guest_saw_irq)
+ __asm__ __volatile__ ("rep nop" : : : "memory");
+ guest_saw_irq = false;
+}
+
static void guest_code(void)
{
struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
@@ -128,6 +162,8 @@ static void guest_code(void)
/* Trigger an interrupt injection */
GUEST_SYNC(0);
+ guest_wait_for_irq();
+
/* Test having the host set runstates manually */
GUEST_SYNC(RUNSTATE_runnable);
GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
@@ -168,14 +204,132 @@ static void guest_code(void)
/* Now deliver an *unmasked* interrupt */
GUEST_SYNC(8);
- while (!si->evtchn_pending[1])
- __asm__ __volatile__ ("rep nop" : : : "memory");
+ guest_wait_for_irq();
/* Change memslots and deliver an interrupt */
GUEST_SYNC(9);
- for (;;)
- __asm__ __volatile__ ("rep nop" : : : "memory");
+ guest_wait_for_irq();
+
+ /* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */
+ GUEST_SYNC(10);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(11);
+
+ /* Our turn. Deliver event channel (to ourselves) with
+ * EVTCHNOP_send hypercall. */
+ unsigned long rax;
+ struct evtchn_send s = { .port = 127 };
+ __asm__ __volatile__ ("vmcall" :
+ "=a" (rax) :
+ "a" (__HYPERVISOR_event_channel_op),
+ "D" (EVTCHNOP_send),
+ "S" (&s));
+
+ GUEST_ASSERT(rax == 0);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(12);
+
+ /* Deliver "outbound" event channel to an eventfd which
+ * happens to be one of our own irqfds. */
+ s.port = 197;
+ __asm__ __volatile__ ("vmcall" :
+ "=a" (rax) :
+ "a" (__HYPERVISOR_event_channel_op),
+ "D" (EVTCHNOP_send),
+ "S" (&s));
+
+ GUEST_ASSERT(rax == 0);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(13);
+
+ /* Set a timer 100ms in the future. */
+ __asm__ __volatile__ ("vmcall" :
+ "=a" (rax) :
+ "a" (__HYPERVISOR_set_timer_op),
+ "D" (rs->state_entry_time + 100000000));
+ GUEST_ASSERT(rax == 0);
+
+ GUEST_SYNC(14);
+
+ /* Now wait for the timer */
+ guest_wait_for_irq();
+
+ GUEST_SYNC(15);
+
+ /* The host has 'restored' the timer. Just wait for it. */
+ guest_wait_for_irq();
+
+ GUEST_SYNC(16);
+
+ /* Poll for an event channel port which is already set */
+ u32 ports[1] = { EVTCHN_TIMER };
+ struct sched_poll p = {
+ .ports = ports,
+ .nr_ports = 1,
+ .timeout = 0,
+ };
+
+ __asm__ __volatile__ ("vmcall" :
+ "=a" (rax) :
+ "a" (__HYPERVISOR_sched_op),
+ "D" (SCHEDOP_poll),
+ "S" (&p));
+
+ GUEST_ASSERT(rax == 0);
+
+ GUEST_SYNC(17);
+
+ /* Poll for an unset port and wait for the timeout. */
+ p.timeout = 100000000;
+ __asm__ __volatile__ ("vmcall" :
+ "=a" (rax) :
+ "a" (__HYPERVISOR_sched_op),
+ "D" (SCHEDOP_poll),
+ "S" (&p));
+
+ GUEST_ASSERT(rax == 0);
+
+ GUEST_SYNC(18);
+
+ /* A timer will wake the masked port we're waiting on, while we poll */
+ p.timeout = 0;
+ __asm__ __volatile__ ("vmcall" :
+ "=a" (rax) :
+ "a" (__HYPERVISOR_sched_op),
+ "D" (SCHEDOP_poll),
+ "S" (&p));
+
+ GUEST_ASSERT(rax == 0);
+
+ GUEST_SYNC(19);
+
+	/* A timer will wake an *unmasked* port which should wake us with an
+ * actual interrupt, while we're polling on a different port. */
+ ports[0]++;
+ p.timeout = 0;
+ __asm__ __volatile__ ("vmcall" :
+ "=a" (rax) :
+ "a" (__HYPERVISOR_sched_op),
+ "D" (SCHEDOP_poll),
+ "S" (&p));
+
+ GUEST_ASSERT(rax == 0);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(20);
+
+ /* Timer should have fired already */
+ guest_wait_for_irq();
+
+ GUEST_SYNC(21);
}
static int cmp_timespec(struct timespec *a, struct timespec *b)
@@ -191,9 +345,13 @@ static int cmp_timespec(struct timespec *a, struct timespec *b)
else
return 0;
}
+struct vcpu_info *vinfo;
static void handle_alrm(int sig)
{
+ if (vinfo)
+ printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending);
+ vcpu_dump(stdout, vm, VCPU_ID, 0);
TEST_FAIL("IRQ delivery timed out");
}
@@ -213,6 +371,7 @@ int main(int argc, char *argv[])
bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
+ bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
clock_gettime(CLOCK_REALTIME, &min_ts);
@@ -233,6 +392,12 @@ int main(int argc, char *argv[])
.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
.msr = XEN_HYPERCALL_MSR,
};
+
+ /* Let the kernel know that we *will* use it for sending all
+ * event channels, which lets it intercept SCHEDOP_poll */
+ if (do_evtchn_tests)
+ hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+
vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
struct kvm_xen_hvm_attr lm = {
@@ -295,7 +460,7 @@ int main(int argc, char *argv[])
/* Unexpected, but not a KVM failure */
if (irq_fd[0] == -1 || irq_fd[1] == -1)
- do_eventfd_tests = false;
+ do_evtchn_tests = do_eventfd_tests = false;
}
if (do_eventfd_tests) {
@@ -303,13 +468,13 @@ int main(int argc, char *argv[])
irq_routes.entries[0].gsi = 32;
irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
- irq_routes.entries[0].u.xen_evtchn.port = 15;
+ irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1;
irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID;
irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
irq_routes.entries[1].gsi = 33;
irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
- irq_routes.entries[1].u.xen_evtchn.port = 66;
+ irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2;
irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID;
irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
@@ -330,7 +495,39 @@ int main(int argc, char *argv[])
sigaction(SIGALRM, &sa, NULL);
}
- struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
+ struct kvm_xen_vcpu_attr tmr = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
+ .u.timer.port = EVTCHN_TIMER,
+ .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+ .u.timer.expires_ns = 0
+ };
+
+ if (do_evtchn_tests) {
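+ /* Set up the outbound ports the guest will EVTCHNOP_send to:
+  * port 127 delivers back to EVTCHN_TEST1 on the local vCPU, and
+  * port 197 fires the irq_fd[1] eventfd. */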
+ struct kvm_xen_hvm_attr inj = {
+ .type = KVM_XEN_ATTR_TYPE_EVTCHN,
+ .u.evtchn.send_port = 127,
+ .u.evtchn.type = EVTCHNSTAT_interdomain,
+ .u.evtchn.flags = 0,
+ .u.evtchn.deliver.port.port = EVTCHN_TEST1,
+ .u.evtchn.deliver.port.vcpu = VCPU_ID + 1,
+ .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
+ /* Test migration to a different vCPU */
+ inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE;
+ inj.u.evtchn.deliver.port.vcpu = VCPU_ID;
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
+ inj.u.evtchn.send_port = 197;
+ inj.u.evtchn.deliver.eventfd.port = 0;
+ inj.u.evtchn.deliver.eventfd.fd = irq_fd[1];
+ inj.u.evtchn.flags = 0;
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
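+ /* Configure the guest's timer event channel port and priority before
+  * the guest arms a oneshot timer */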
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ }
+ vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
vinfo->evtchn_upcall_pending = 0;
struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
@@ -423,7 +620,7 @@ int main(int argc, char *argv[])
goto done;
if (verbose)
printf("Testing masked event channel\n");
- shinfo->evtchn_mask[0] = 0x8000;
+ shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1;
eventfd_write(irq_fd[0], 1UL);
alarm(1);
break;
@@ -440,6 +637,9 @@ int main(int argc, char *argv[])
break;
case 9:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[1] = 0;
if (verbose)
printf("Testing event channel after memslot change\n");
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
@@ -449,12 +649,153 @@ int main(int argc, char *argv[])
alarm(1);
break;
+ case 10:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ if (!do_evtchn_tests)
+ goto done;
+
+ shinfo->evtchn_pending[0] = 0;
+ if (verbose)
+ printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n");
+
+ struct kvm_irq_routing_xen_evtchn e;
+ e.port = EVTCHN_TEST2;
+ e.vcpu = VCPU_ID;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e);
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case 11:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[1] = 0;
+
+ if (verbose)
+ printf("Testing guest EVTCHNOP_send direct to evtchn\n");
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case 12:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[0] = 0;
+
+ if (verbose)
+ printf("Testing guest EVTCHNOP_send to eventfd\n");
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case 13:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[1] = 0;
+
+ if (verbose)
+ printf("Testing guest oneshot timer\n");
+ break;
+
+ case 14:
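+ /* Read back the armed oneshot timer so the next step can 'restore' it */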
+ memset(&tmr, 0, sizeof(tmr));
+ tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
+ TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER,
+ "Timer port not returned");
+ TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+ "Timer priority not returned");
+ TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time,
+ "Timer expiry not returned");
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case 15:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[0] = 0;
+
+ if (verbose)
+ printf("Testing restored oneshot timer\n");
+
+ tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case 16:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+
+ if (verbose)
+ printf("Testing SCHEDOP_poll with already pending event\n");
+ shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER;
+ alarm(1);
+ break;
+
+ case 17:
+ if (verbose)
+ printf("Testing SCHEDOP_poll timeout\n");
+ shinfo->evtchn_pending[0] = 0;
+ alarm(1);
+ break;
+
+ case 18:
+ if (verbose)
+ printf("Testing SCHEDOP_poll wake on masked event\n");
+
+ tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ alarm(1);
+ break;
+
+ case 19:
+ shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0;
+ if (verbose)
+ printf("Testing SCHEDOP_poll wake on unmasked event\n");
+
+ evtchn_irq_expected = true;
+ tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+
+ /* Read it back and check the pending time is reported correctly */
+ tmr.u.timer.expires_ns = 0;
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
+ TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
+ "Timer not reported pending");
+ alarm(1);
+ break;
+
+ case 20:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ /* Read timer and check it is no longer pending */
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
+ TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending");
+
+ shinfo->evtchn_pending[0] = 0;
+ if (verbose)
+ printf("Testing timer in the past\n");
+
+ evtchn_irq_expected = true;
+ tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL;
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ alarm(1);
+ break;
+
+ case 21:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ goto done;
+
case 0x20:
TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
evtchn_irq_expected = false;
- if (shinfo->evtchn_pending[1] &&
- shinfo->evtchn_pending[0])
- goto done;
break;
}
break;
@@ -467,6 +808,7 @@ int main(int argc, char *argv[])
}
done:
+ alarm(0);
clock_gettime(CLOCK_REALTIME, &max_ts);
/*