summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-05-29 06:02:25 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2021-05-29 06:02:25 -1000
commit224478289ca0e7abf06a3bc63b06c42a2bf84c69 (patch)
treed118312db6be190a9d03ba2e15180cfadf9977b5
parent866c4b8a18e26b7ae41c45b1af57c82a66089985 (diff)
parent000ac42953395a4f0a63d5db640c5e4c88a548c5 (diff)
downloadlinux-224478289ca0e7abf06a3bc63b06c42a2bf84c69.tar.bz2
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini: "ARM fixes: - Another state update on exit to userspace fix - Prevent the creation of mixed 32/64 VMs - Fix regression with irqbypass not restarting the guest on failed connect - Fix regression with debug register decoding resulting in overlapping access - Commit exception state on exit to usrspace - Fix the MMU notifier return values - Add missing 'static' qualifiers in the new host stage-2 code x86 fixes: - fix guest missed wakeup with assigned devices - fix WARN reported by syzkaller - do not use BIT() in UAPI headers - make the kvm_amd.avic parameter bool PPC fixes: - make halt polling heuristics consistent with other architectures selftests: - various fixes - new performance selftest memslot_perf_test - test UFFD minor faults in demand_paging_test" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (44 commits) selftests: kvm: fix overlapping addresses in memslot_perf_test KVM: X86: Kill off ctxt->ud KVM: X86: Fix warning caused by stale emulation context KVM: X86: Use kvm_get_linear_rip() in single-step and #DB/#BP interception KVM: x86/mmu: Fix comment mentioning skip_4k KVM: VMX: update vcpu posted-interrupt descriptor when assigning device KVM: rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK KVM: x86: add start_assignment hook to kvm_x86_ops KVM: LAPIC: Narrow the timer latency between wait_lapic_expire and world switch selftests: kvm: do only 1 memslot_perf_test run by default KVM: X86: Use _BITUL() macro in UAPI headers KVM: selftests: add shared hugetlbfs backing source type KVM: selftests: allow using UFFD minor faults for demand paging KVM: selftests: create alias mappings when using shared memory KVM: selftests: add shmem backing source type KVM: selftests: refactor vm_mem_backing_src_type flags KVM: selftests: allow different backing source types KVM: selftests: compute correct demand paging size KVM: selftests: simplify setup_demand_paging error handling KVM: selftests: Print a message if /dev/kvm is missing ...
-rw-r--r--Documentation/virt/kvm/vcpu-requests.rst8
-rw-r--r--arch/arm64/include/asm/kvm_asm.h3
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h5
-rw-r--r--arch/arm64/kvm/arm.c20
-rw-r--r--arch/arm64/kvm/hyp/exception.c18
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/adjust_pc.h18
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c8
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c3
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c3
-rw-r--r--arch/arm64/kvm/mmu.c12
-rw-r--r--arch/arm64/kvm/reset.c28
-rw-r--r--arch/arm64/kvm/sys_regs.c42
-rw-r--r--arch/powerpc/include/asm/kvm_host.h1
-rw-r--r--arch/powerpc/kvm/book3s_hv.c2
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h1
-rw-r--r--arch/x86/kvm/emulate.c5
-rw-r--r--arch/x86/kvm/hyperv.c8
-rw-r--r--arch/x86/kvm/kvm_emulate.h3
-rw-r--r--arch/x86/kvm/lapic.c16
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c6
-rw-r--r--arch/x86/kvm/svm/avic.c6
-rw-r--r--arch/x86/kvm/svm/svm.c4
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/vmx/capabilities.h3
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c14
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h1
-rw-r--r--arch/x86/kvm/vmx/vmx.c6
-rw-r--r--arch/x86/kvm/x86.c27
-rw-r--r--include/linux/kvm_host.h8
-rw-r--r--include/uapi/linux/kvm.h5
-rw-r--r--tools/include/uapi/linux/kvm.h5
-rw-r--r--tools/testing/selftests/kvm/.gitignore1
-rw-r--r--tools/testing/selftests/kvm/Makefile3
-rw-r--r--tools/testing/selftests/kvm/demand_paging_test.c174
-rw-r--r--tools/testing/selftests/kvm/hardware_disable_test.c32
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util.h4
-rw-r--r--tools/testing/selftests/kvm/include/test_util.h12
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c278
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util_internal.h17
-rw-r--r--tools/testing/selftests/kvm/lib/perf_test_util.c4
-rw-r--r--tools/testing/selftests/kvm/lib/rbtree.c1
-rw-r--r--tools/testing/selftests/kvm/lib/test_util.c51
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c16
-rw-r--r--tools/testing/selftests/kvm/memslot_modification_stress_test.c18
-rw-r--r--tools/testing/selftests/kvm/memslot_perf_test.c1037
-rw-r--r--tools/testing/selftests/kvm/x86_64/get_cpuid_test.c5
-rw-r--r--tools/testing/selftests/kvm/x86_64/get_msr_index_features.c8
-rw-r--r--virt/kvm/kvm_main.c6
-rw-r--r--virt/lib/irqbypass.c16
52 files changed, 1694 insertions, 287 deletions
diff --git a/Documentation/virt/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst
index 5feb3706a7ae..af1b37441e0a 100644
--- a/Documentation/virt/kvm/vcpu-requests.rst
+++ b/Documentation/virt/kvm/vcpu-requests.rst
@@ -118,10 +118,12 @@ KVM_REQ_MMU_RELOAD
necessary to inform each VCPU to completely refresh the tables. This
request is used for that.
-KVM_REQ_PENDING_TIMER
+KVM_REQ_UNBLOCK
- This request may be made from a timer handler run on the host on behalf
- of a VCPU. It informs the VCPU thread to inject a timer interrupt.
+ This request informs the vCPU to exit kvm_vcpu_block. It is used for
+ example from timer handlers that run on the host on behalf of a vCPU,
+ or in order to update the interrupt routing and ensure that assigned
+ devices will wake up the vCPU.
KVM_REQ_UNHALT
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index cf8df032b9c3..5e9b33cbac51 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -63,6 +63,7 @@
#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18
#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19
#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20
+#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21
#ifndef __ASSEMBLY__
@@ -201,6 +202,8 @@ extern void __kvm_timer_set_cntvoff(u64 cntvoff);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu);
+
extern u64 __vgic_v3_get_gic_config(void);
extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index f612c090f2e4..01b9857757f2 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -463,4 +463,9 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC;
}
+static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature)
+{
+ return test_bit(feature, vcpu->arch.features);
+}
+
#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1cb39c0803a4..e720148232a0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -720,11 +720,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
return ret;
}
- if (run->immediate_exit)
- return -EINTR;
-
vcpu_load(vcpu);
+ if (run->immediate_exit) {
+ ret = -EINTR;
+ goto out;
+ }
+
kvm_sigset_activate(vcpu);
ret = 1;
@@ -897,6 +899,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_sigset_deactivate(vcpu);
+out:
+ /*
+ * In the unlikely event that we are returning to userspace
+ * with pending exceptions or PC adjustment, commit these
+ * adjustments in order to give userspace a consistent view of
+ * the vcpu state. Note that this relies on __kvm_adjust_pc()
+ * being preempt-safe on VHE.
+ */
+ if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION |
+ KVM_ARM64_INCREMENT_PC)))
+ kvm_call_hyp(__kvm_adjust_pc, vcpu);
+
vcpu_put(vcpu);
return ret;
}
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 73629094f903..11541b94b328 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -296,7 +296,7 @@ static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
*vcpu_pc(vcpu) = vect_offset;
}
-void kvm_inject_exception(struct kvm_vcpu *vcpu)
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
if (vcpu_el1_is_32bit(vcpu)) {
switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) {
@@ -329,3 +329,19 @@ void kvm_inject_exception(struct kvm_vcpu *vcpu)
}
}
}
+
+/*
+ * Adjust the guest PC (and potentially exception state) depending on
+ * flags provided by the emulation code.
+ */
+void __kvm_adjust_pc(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) {
+ kvm_inject_exception(vcpu);
+ vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION |
+ KVM_ARM64_EXCEPT_MASK);
+ } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) {
+ kvm_skip_instr(vcpu);
+ vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC;
+ }
+}
diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h
index 61716359035d..4fdfeabefeb4 100644
--- a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h
+++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h
@@ -13,8 +13,6 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
-void kvm_inject_exception(struct kvm_vcpu *vcpu);
-
static inline void kvm_skip_instr(struct kvm_vcpu *vcpu)
{
if (vcpu_mode_is_32bit(vcpu)) {
@@ -44,22 +42,6 @@ static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu)
}
/*
- * Adjust the guest PC on entry, depending on flags provided by EL1
- * for the purpose of emulation (MMIO, sysreg) or exception injection.
- */
-static inline void __adjust_pc(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) {
- kvm_inject_exception(vcpu);
- vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION |
- KVM_ARM64_EXCEPT_MASK);
- } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) {
- kvm_skip_instr(vcpu);
- vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC;
- }
-}
-
-/*
* Skip an instruction while host sysregs are live.
* Assumes host is always 64-bit.
*/
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index f36420a80474..1632f001f4ed 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -28,6 +28,13 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu));
}
+static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
+
+ __kvm_adjust_pc(kern_hyp_va(vcpu));
+}
+
static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt)
{
__kvm_flush_vm_context();
@@ -170,6 +177,7 @@ typedef void (*hcall_t)(struct kvm_cpu_context *);
static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
+ HANDLE_FUNC(__kvm_adjust_pc),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index e342f7f4f4fb..4b60c0056c04 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -23,8 +23,8 @@
extern unsigned long hyp_nr_cpus;
struct host_kvm host_kvm;
-struct hyp_pool host_s2_mem;
-struct hyp_pool host_s2_dev;
+static struct hyp_pool host_s2_mem;
+static struct hyp_pool host_s2_dev;
/*
* Copies of the host's CPU features registers holding sanitized values.
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 7488f53b0aa2..a3d3a275344e 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -17,7 +17,6 @@
#include <nvhe/trap_handler.h>
struct hyp_pool hpool;
-struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
unsigned long hyp_nr_cpus;
#define hyp_percpu_size ((unsigned long)__per_cpu_end - \
@@ -27,6 +26,7 @@ static void *vmemmap_base;
static void *hyp_pgt_base;
static void *host_s2_mem_pgt_base;
static void *host_s2_dev_pgt_base;
+static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static int divide_memory_pool(void *virt, unsigned long size)
{
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index e9f6ea704d07..f7af9688c1f7 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -4,7 +4,6 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
-#include <hyp/adjust_pc.h>
#include <hyp/switch.h>
#include <hyp/sysreg-sr.h>
@@ -201,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
*/
__debug_save_host_buffers_nvhe(vcpu);
- __adjust_pc(vcpu);
+ __kvm_adjust_pc(vcpu);
/*
* We must restore the 32-bit state before the sysregs, thanks
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 7b8f7db5c1ed..b3229924d243 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -4,7 +4,6 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
-#include <hyp/adjust_pc.h>
#include <hyp/switch.h>
#include <linux/arm-smccc.h>
@@ -132,7 +131,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
__load_guest_stage2(vcpu->arch.hw_mmu);
__activate_traps(vcpu);
- __adjust_pc(vcpu);
+ __kvm_adjust_pc(vcpu);
sysreg_restore_guest_state_vhe(guest_ctxt);
__debug_switch_to_guest(vcpu);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c5d1f3c87dbd..c10207fed2f3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1156,13 +1156,13 @@ out_unlock:
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
if (!kvm->arch.mmu.pgt)
- return 0;
+ return false;
__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
(range->end - range->start) << PAGE_SHIFT,
range->may_block);
- return 0;
+ return false;
}
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -1170,7 +1170,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
kvm_pfn_t pfn = pte_pfn(range->pte);
if (!kvm->arch.mmu.pgt)
- return 0;
+ return false;
WARN_ON(range->end - range->start != 1);
@@ -1190,7 +1190,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
PAGE_SIZE, __pfn_to_phys(pfn),
KVM_PGTABLE_PROT_R, NULL);
- return 0;
+ return false;
}
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -1200,7 +1200,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
pte_t pte;
if (!kvm->arch.mmu.pgt)
- return 0;
+ return false;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
@@ -1213,7 +1213,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
if (!kvm->arch.mmu.pgt)
- return 0;
+ return false;
return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
range->start << PAGE_SHIFT);
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 956cdc240148..d37ebee085cf 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -166,6 +166,25 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
return 0;
}
+static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu *tmp;
+ bool is32bit;
+ int i;
+
+ is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT);
+ if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit)
+ return false;
+
+ /* Check that the vcpus are either all 32bit or all 64bit */
+ kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
+ if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit)
+ return false;
+ }
+
+ return true;
+}
+
/**
* kvm_reset_vcpu - sets core registers and sys_regs to reset value
* @vcpu: The VCPU pointer
@@ -217,13 +236,14 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
}
}
+ if (!vcpu_allowed_register_width(vcpu)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
switch (vcpu->arch.target) {
default:
if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) {
- if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) {
- ret = -EINVAL;
- goto out;
- }
pstate = VCPU_RESET_PSTATE_SVC;
} else {
pstate = VCPU_RESET_PSTATE_EL1;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 76ea2800c33e..1a7968ad078c 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -399,14 +399,14 @@ static bool trap_bvr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *rd)
{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+ u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm];
if (p->is_write)
reg_to_dbg(vcpu, p, rd, dbg_reg);
else
dbg_to_reg(vcpu, p, rd, dbg_reg);
- trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+ trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
return true;
}
@@ -414,7 +414,7 @@ static bool trap_bvr(struct kvm_vcpu *vcpu,
static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
- __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+ __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm];
if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0)
return -EFAULT;
@@ -424,7 +424,7 @@ static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
- __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+ __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm];
if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
return -EFAULT;
@@ -434,21 +434,21 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
static void reset_bvr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
- vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val;
+ vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val;
}
static bool trap_bcr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *rd)
{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+ u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm];
if (p->is_write)
reg_to_dbg(vcpu, p, rd, dbg_reg);
else
dbg_to_reg(vcpu, p, rd, dbg_reg);
- trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+ trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
return true;
}
@@ -456,7 +456,7 @@ static bool trap_bcr(struct kvm_vcpu *vcpu,
static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
- __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+ __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm];
if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0)
return -EFAULT;
@@ -467,7 +467,7 @@ static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
- __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+ __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm];
if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
return -EFAULT;
@@ -477,22 +477,22 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
static void reset_bcr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
- vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val;
+ vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val;
}
static bool trap_wvr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *rd)
{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+ u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm];
if (p->is_write)
reg_to_dbg(vcpu, p, rd, dbg_reg);
else
dbg_to_reg(vcpu, p, rd, dbg_reg);
- trace_trap_reg(__func__, rd->reg, p->is_write,
- vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]);
+ trace_trap_reg(__func__, rd->CRm, p->is_write,
+ vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]);
return true;
}
@@ -500,7 +500,7 @@ static bool trap_wvr(struct kvm_vcpu *vcpu,
static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
- __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+ __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm];
if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0)
return -EFAULT;
@@ -510,7 +510,7 @@ static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
- __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+ __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm];
if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
return -EFAULT;
@@ -520,21 +520,21 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
static void reset_wvr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
- vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val;
+ vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val;
}
static bool trap_wcr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *rd)
{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+ u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm];
if (p->is_write)
reg_to_dbg(vcpu, p, rd, dbg_reg);
else
dbg_to_reg(vcpu, p, rd, dbg_reg);
- trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+ trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
return true;
}
@@ -542,7 +542,7 @@ static bool trap_wcr(struct kvm_vcpu *vcpu,
static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
- __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+ __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm];
if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0)
return -EFAULT;
@@ -552,7 +552,7 @@ static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
- __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+ __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm];
if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
return -EFAULT;
@@ -562,7 +562,7 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
static void reset_wcr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
- vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val;
+ vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val;
}
static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1e83359f286b..7f2e90db2050 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -51,6 +51,7 @@
/* PPC-specific vcpu->requests bit members */
#define KVM_REQ_WATCHDOG KVM_ARCH_REQ(0)
#define KVM_REQ_EPR_EXIT KVM_ARCH_REQ(1)
+#define KVM_REQ_PENDING_TIMER KVM_ARCH_REQ(2)
#include <linux/mmu_notifier.h>
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 28a80d240b76..7360350e66ff 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3936,7 +3936,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
break;
}
cur = ktime_get();
- } while (single_task_running() && ktime_before(cur, stop));
+ } while (kvm_vcpu_can_poll(cur, stop));
spin_lock(&vc->lock);
vc->vcore_state = VCORE_INACTIVE;
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 323641097f63..e7bef91cee04 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -99,6 +99,7 @@ KVM_X86_OP_NULL(post_block)
KVM_X86_OP_NULL(vcpu_blocking)
KVM_X86_OP_NULL(vcpu_unblocking)
KVM_X86_OP_NULL(update_pi_irte)
+KVM_X86_OP_NULL(start_assignment)
KVM_X86_OP_NULL(apicv_post_state_restore)
KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt)
KVM_X86_OP_NULL(set_hv_timer)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 55efbacfc244..9c7ced0e3171 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1352,6 +1352,7 @@ struct kvm_x86_ops {
int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
+ void (*start_assignment)(struct kvm *kvm);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8a0ccdb56076..5e5de05a8fbf 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5111,7 +5111,7 @@ done:
return rc;
}
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
{
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
@@ -5322,7 +5322,8 @@ done_prefixes:
ctxt->execute = opcode.u.execute;
- if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD)))
+ if (unlikely(emulation_type & EMULTYPE_TRAP_UD) &&
+ likely(!(ctxt->d & EmulateOnUD)))
return EMULATION_FAILED;
if (unlikely(ctxt->d &
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index f98370a39936..f00830e5202f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1172,6 +1172,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
u64 gfn;
+ int idx;
if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
@@ -1190,9 +1191,16 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
hv->tsc_ref.tsc_sequence = 0;
+
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&kvm->srcu);
if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+ srcu_read_unlock(&kvm->srcu, idx);
out_unlock:
mutex_unlock(&hv->hv_lock);
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index f016838faedd..3e870bf9ca4d 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -314,7 +314,6 @@ struct x86_emulate_ctxt {
int interruptibility;
bool perm_ok; /* do not check permissions if true */
- bool ud; /* inject an #UD if host doesn't support insn */
bool tf; /* TF value before instruction (after for syscall/sysret) */
bool have_exception;
@@ -491,7 +490,7 @@ enum x86_intercept {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type);
bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c0ebef560bd1..8120e8614b92 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1598,11 +1598,19 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
+ if (lapic_timer_advance_dynamic) {
+ adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
+ /*
+ * If the timer fired early, reread the TSC to account for the
+ * overhead of the above adjustment to avoid waiting longer
+ * than is necessary.
+ */
+ if (guest_tsc < tsc_deadline)
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ }
+
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
-
- if (lapic_timer_advance_dynamic)
- adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
@@ -1661,7 +1669,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
}
atomic_inc(&apic->lapic_timer.pending);
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
if (from_timer_fn)
kvm_vcpu_kick(vcpu);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 95eeb5ac6a8a..237317b1eddd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1192,9 +1192,9 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
}
/*
- * Remove write access from all the SPTEs mapping GFNs [start, end). If
- * skip_4k is set, SPTEs that map 4k pages, will not be write-protected.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
+ * Remove write access from all SPTEs at or above min_level that map GFNs
+ * [start, end). Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
*/
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, int min_level)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 712b4e0de481..0e62e6a2438c 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -28,10 +28,8 @@
#include "svm.h"
/* enable / disable AVIC */
-int avic;
-#ifdef CONFIG_X86_LOCAL_APIC
-module_param(avic, int, S_IRUGO);
-#endif
+bool avic;
+module_param(avic, bool, S_IRUGO);
#define SVM_AVIC_DOORBELL 0xc001011b
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 05eca131eaf2..e088086f3de6 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1010,9 +1010,7 @@ static __init int svm_hardware_setup(void)
}
if (avic) {
- if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_AVIC) ||
- !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) {
avic = false;
} else {
pr_info("AVIC enabled\n");
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 2c9ece618b29..2908c6ab5bb4 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -480,7 +480,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-extern int avic;
+extern bool avic;
static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
{
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 8dee8a5fbc17..aa0e7872fcc9 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -90,8 +90,7 @@ static inline bool cpu_has_vmx_preemption_timer(void)
static inline bool cpu_has_vmx_posted_intr(void)
{
- return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
- vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}
static inline bool cpu_has_load_ia32_efer(void)
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 459748680daf..5f81ef092bd4 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -238,6 +238,20 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
+ * Bail out of the block loop if the VM has an assigned
+ * device, but the blocking vCPU didn't reconfigure the
+ * PI.NV to the wakeup vector, i.e. the assigned device
+ * came along after the initial check in pi_pre_block().
+ */
+void vmx_pi_start_assignment(struct kvm *kvm)
+{
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
+}
+
+/*
* pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 0bdc41391c5b..7f7b2326caf5 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -95,5 +95,6 @@ void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
bool set);
+void vmx_pi_start_assignment(struct kvm *kvm);
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4bceb5ca3a89..50b42d7a8a11 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4843,7 +4843,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 intr_info, ex_no, error_code;
- unsigned long cr2, rip, dr6;
+ unsigned long cr2, dr6;
u32 vect_info;
vect_info = vmx->idt_vectoring_info;
@@ -4933,8 +4933,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- rip = kvm_rip_read(vcpu);
- kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = ex_no;
break;
case AC_VECTOR:
@@ -7721,6 +7720,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.nested_ops = &vmx_nested_ops,
.update_pi_irte = pi_update_irte,
+ .start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbc4e04e67ad..b594275d49b5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3105,6 +3105,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
st->preempted & KVM_VCPU_FLUSH_TLB);
if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
+ } else {
+ st->preempted = 0;
}
vcpu->arch.st.preempted = 0;
@@ -7226,6 +7228,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
@@ -7561,14 +7568,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
kvm_vcpu_check_breakpoint(vcpu, &r))
return r;
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
+ r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
@@ -8360,6 +8360,9 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
vcpu->stat.directed_yield_attempted++;
+ if (single_task_running())
+ goto no_yield;
+
rcu_read_lock();
map = rcu_dereference(vcpu->kvm->arch.apic_map);
@@ -9496,7 +9499,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (r <= 0)
break;
- kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
@@ -10115,8 +10118,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
kvm_update_dr7(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
/*
* Trigger an rflags update that will inject or remove the trace
@@ -11499,7 +11501,8 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
- atomic_inc(&kvm->arch.assigned_device_count);
+ if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
+ static_call_cond(kvm_x86_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2f34487e21f2..76102efbf079 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -10,6 +10,7 @@
#include <linux/spinlock.h>
#include <linux/signal.h>
#include <linux/sched.h>
+#include <linux/sched/stat.h>
#include <linux/bug.h>
#include <linux/minmax.h>
#include <linux/mm.h>
@@ -146,7 +147,7 @@ static inline bool is_error_page(struct page *page)
*/
#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_PENDING_TIMER 2
+#define KVM_REQ_UNBLOCK 2
#define KVM_REQ_UNHALT 3
#define KVM_REQUEST_ARCH_BASE 8
@@ -265,6 +266,11 @@ static inline bool kvm_vcpu_mapped(struct kvm_host_map *map)
return !!map->hva;
}
+static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop)
+{
+ return single_task_running() && !need_resched() && ktime_before(cur, stop);
+}
+
/*
* Sometimes a large or cross-page mmio needs to be broken up into separate
* exits for userspace servicing.
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3fd9a7e9d90c..79d9c44d1ad7 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -8,6 +8,7 @@
* Note: you must update KVM_API_VERSION if you change this interface.
*/
+#include <linux/const.h>
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/ioctl.h>
@@ -1879,8 +1880,8 @@ struct kvm_hyperv_eventfd {
* conversion after harvesting an entry. Also, it must not skip any
* dirty bits, so that dirty bits are always harvested in sequence.
*/
-#define KVM_DIRTY_GFN_F_DIRTY BIT(0)
-#define KVM_DIRTY_GFN_F_RESET BIT(1)
+#define KVM_DIRTY_GFN_F_DIRTY _BITUL(0)
+#define KVM_DIRTY_GFN_F_RESET _BITUL(1)
#define KVM_DIRTY_GFN_F_MASK 0x3
/*
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 3fd9a7e9d90c..79d9c44d1ad7 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -8,6 +8,7 @@
* Note: you must update KVM_API_VERSION if you change this interface.
*/
+#include <linux/const.h>
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/ioctl.h>
@@ -1879,8 +1880,8 @@ struct kvm_hyperv_eventfd {
* conversion after harvesting an entry. Also, it must not skip any
* dirty bits, so that dirty bits are always harvested in sequence.
*/
-#define KVM_DIRTY_GFN_F_DIRTY BIT(0)
-#define KVM_DIRTY_GFN_F_RESET BIT(1)
+#define KVM_DIRTY_GFN_F_DIRTY _BITUL(0)
+#define KVM_DIRTY_GFN_F_RESET _BITUL(1)
#define KVM_DIRTY_GFN_F_MASK 0x3
/*
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index bd83158e0e0b..524c857a049c 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -41,5 +41,6 @@
/kvm_create_max_vcpus
/kvm_page_table_test
/memslot_modification_stress_test
+/memslot_perf_test
/set_memory_region_test
/steal_time
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index e439d027939d..daaee1888b12 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -33,7 +33,7 @@ ifeq ($(ARCH),s390)
UNAME_M := s390x
endif
-LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c
+LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c
LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S
LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c
LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
@@ -74,6 +74,7 @@ TEST_GEN_PROGS_x86_64 += hardware_disable_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
TEST_GEN_PROGS_x86_64 += kvm_page_table_test
TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
+TEST_GEN_PROGS_x86_64 += memslot_perf_test
TEST_GEN_PROGS_x86_64 += set_memory_region_test
TEST_GEN_PROGS_x86_64 += steal_time
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 5f7a229c3af1..b74704305835 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -9,6 +9,7 @@
#define _GNU_SOURCE /* for pipe2 */
+#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
@@ -38,6 +39,7 @@
static int nr_vcpus = 1;
static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+static size_t demand_paging_size;
static char *guest_data_prototype;
static void *vcpu_worker(void *data)
@@ -71,36 +73,51 @@ static void *vcpu_worker(void *data)
return NULL;
}
-static int handle_uffd_page_request(int uffd, uint64_t addr)
+static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
{
- pid_t tid;
+ pid_t tid = syscall(__NR_gettid);
struct timespec start;
struct timespec ts_diff;
- struct uffdio_copy copy;
int r;
- tid = syscall(__NR_gettid);
+ clock_gettime(CLOCK_MONOTONIC, &start);
- copy.src = (uint64_t)guest_data_prototype;
- copy.dst = addr;
- copy.len = perf_test_args.host_page_size;
- copy.mode = 0;
+ if (uffd_mode == UFFDIO_REGISTER_MODE_MISSING) {
+ struct uffdio_copy copy;
- clock_gettime(CLOCK_MONOTONIC, &start);
+ copy.src = (uint64_t)guest_data_prototype;
+ copy.dst = addr;
+ copy.len = demand_paging_size;
+ copy.mode = 0;
- r = ioctl(uffd, UFFDIO_COPY, &copy);
- if (r == -1) {
- pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n",
- addr, tid, errno);
- return r;
+ r = ioctl(uffd, UFFDIO_COPY, &copy);
+ if (r == -1) {
+ pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n",
+ addr, tid, errno);
+ return r;
+ }
+ } else if (uffd_mode == UFFDIO_REGISTER_MODE_MINOR) {
+ struct uffdio_continue cont = {0};
+
+ cont.range.start = addr;
+ cont.range.len = demand_paging_size;
+
+ r = ioctl(uffd, UFFDIO_CONTINUE, &cont);
+ if (r == -1) {
+ pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n",
+ addr, tid, errno);
+ return r;
+ }
+ } else {
+ TEST_FAIL("Invalid uffd mode %d", uffd_mode);
}
ts_diff = timespec_elapsed(start);
- PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
+ PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid,
timespec_to_ns(ts_diff));
PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n",
- perf_test_args.host_page_size, addr, tid);
+ demand_paging_size, addr, tid);
return 0;
}
@@ -108,6 +125,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
bool quit_uffd_thread;
struct uffd_handler_args {
+ int uffd_mode;
int uffd;
int pipefd;
useconds_t delay;
@@ -169,7 +187,7 @@ static void *uffd_handler_thread_fn(void *arg)
if (r == -1) {
if (errno == EAGAIN)
continue;
- pr_info("Read of uffd gor errno %d", errno);
+ pr_info("Read of uffd got errno %d\n", errno);
return NULL;
}
@@ -184,7 +202,7 @@ static void *uffd_handler_thread_fn(void *arg)
if (delay)
usleep(delay);
addr = msg.arg.pagefault.address;
- r = handle_uffd_page_request(uffd, addr);
+ r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr);
if (r < 0)
return NULL;
pages++;
@@ -198,43 +216,53 @@ static void *uffd_handler_thread_fn(void *arg)
return NULL;
}
-static int setup_demand_paging(struct kvm_vm *vm,
- pthread_t *uffd_handler_thread, int pipefd,
- useconds_t uffd_delay,
- struct uffd_handler_args *uffd_args,
- void *hva, uint64_t len)
+static void setup_demand_paging(struct kvm_vm *vm,
+ pthread_t *uffd_handler_thread, int pipefd,
+ int uffd_mode, useconds_t uffd_delay,
+ struct uffd_handler_args *uffd_args,
+ void *hva, void *alias, uint64_t len)
{
+ bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
int uffd;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
+ uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY;
- uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
- if (uffd == -1) {
- pr_info("uffd creation failed\n");
- return -1;
+ PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n",
+ is_minor ? "MINOR" : "MISSING",
+ is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY");
+
+ /* In order to get minor faults, prefault via the alias. */
+ if (is_minor) {
+ size_t p;
+
+ expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE;
+
+ TEST_ASSERT(alias != NULL, "Alias required for minor faults");
+ for (p = 0; p < (len / demand_paging_size); ++p) {
+ memcpy(alias + (p * demand_paging_size),
+ guest_data_prototype, demand_paging_size);
+ }
}
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
+
uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
- if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
- pr_info("ioctl uffdio_api failed\n");
- return -1;
- }
+ TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1,
+ "ioctl UFFDIO_API failed: %" PRIu64,
+ (uint64_t)uffdio_api.api);
uffdio_register.range.start = (uint64_t)hva;
uffdio_register.range.len = len;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
- pr_info("ioctl uffdio_register failed\n");
- return -1;
- }
-
- if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) !=
- UFFD_API_RANGE_IOCTLS) {
- pr_info("unexpected userfaultfd ioctl set\n");
- return -1;
- }
+ uffdio_register.mode = uffd_mode;
+ TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1,
+ "ioctl UFFDIO_REGISTER failed");
+ TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
+ expected_ioctls, "missing userfaultfd ioctls");
+ uffd_args->uffd_mode = uffd_mode;
uffd_args->uffd = uffd;
uffd_args->pipefd = pipefd;
uffd_args->delay = uffd_delay;
@@ -243,13 +271,12 @@ static int setup_demand_paging(struct kvm_vm *vm,
PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
hva, hva + len);
-
- return 0;
}
struct test_params {
- bool use_uffd;
+ int uffd_mode;
useconds_t uffd_delay;
+ enum vm_mem_backing_src_type src_type;
bool partition_vcpu_memory_access;
};
@@ -267,14 +294,16 @@ static void run_test(enum vm_guest_mode mode, void *arg)
int r;
vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
- VM_MEM_SRC_ANONYMOUS);
+ p->src_type);
perf_test_args.wr_fract = 1;
- guest_data_prototype = malloc(perf_test_args.host_page_size);
+ demand_paging_size = get_backing_src_pagesz(p->src_type);
+
+ guest_data_prototype = malloc(demand_paging_size);
TEST_ASSERT(guest_data_prototype,
"Failed to allocate buffer for guest data pattern");
- memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size);
+ memset(guest_data_prototype, 0xAB, demand_paging_size);
vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
TEST_ASSERT(vcpu_threads, "Memory allocation failed");
@@ -282,7 +311,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
p->partition_vcpu_memory_access);
- if (p->use_uffd) {
+ if (p->uffd_mode) {
uffd_handler_threads =
malloc(nr_vcpus * sizeof(*uffd_handler_threads));
TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
@@ -296,6 +325,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vm_paddr_t vcpu_gpa;
void *vcpu_hva;
+ void *vcpu_alias;
uint64_t vcpu_mem_size;
@@ -310,8 +340,9 @@ static void run_test(enum vm_guest_mode mode, void *arg)
PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size);
- /* Cache the HVA pointer of the region */
+ /* Cache the host addresses of the region */
vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
+ vcpu_alias = addr_gpa2alias(vm, vcpu_gpa);
/*
* Set up user fault fd to handle demand paging
@@ -321,13 +352,11 @@ static void run_test(enum vm_guest_mode mode, void *arg)
O_CLOEXEC | O_NONBLOCK);
TEST_ASSERT(!r, "Failed to set up pipefd");
- r = setup_demand_paging(vm,
- &uffd_handler_threads[vcpu_id],
- pipefds[vcpu_id * 2],
- p->uffd_delay, &uffd_args[vcpu_id],
- vcpu_hva, vcpu_mem_size);
- if (r < 0)
- exit(-r);
+ setup_demand_paging(vm, &uffd_handler_threads[vcpu_id],
+ pipefds[vcpu_id * 2], p->uffd_mode,
+ p->uffd_delay, &uffd_args[vcpu_id],
+ vcpu_hva, vcpu_alias,
+ vcpu_mem_size);
}
}
@@ -355,7 +384,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("All vCPU threads joined\n");
- if (p->use_uffd) {
+ if (p->uffd_mode) {
char c;
/* Tell the user fault fd handler threads to quit */
@@ -377,7 +406,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
free(guest_data_prototype);
free(vcpu_threads);
- if (p->use_uffd) {
+ if (p->uffd_mode) {
free(uffd_handler_threads);
free(uffd_args);
free(pipefds);
@@ -387,17 +416,19 @@ static void run_test(enum vm_guest_mode mode, void *arg)
static void help(char *name)
{
puts("");
- printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n"
- " [-b memory] [-v vcpus] [-o]\n", name);
+ printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n"
+ " [-b memory] [-t type] [-v vcpus] [-o]\n", name);
guest_modes_help();
- printf(" -u: use User Fault FD to handle vCPU page\n"
- " faults.\n");
+ printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n"
+ " UFFD registration mode: 'MISSING' or 'MINOR'.\n");
printf(" -d: add a delay in usec to the User Fault\n"
" FD handler to simulate demand paging\n"
" overheads. Ignored without -u.\n");
printf(" -b: specify the size of the memory region which should be\n"
" demand paged by each vCPU. e.g. 10M or 3G.\n"
" Default: 1G\n");
+ printf(" -t: The type of backing memory to use. Default: anonymous\n");
+ backing_src_help();
printf(" -v: specify the number of vCPUs to run.\n");
printf(" -o: Overlap guest memory accesses instead of partitioning\n"
" them into a separate region of memory for each vCPU.\n");
@@ -409,19 +440,24 @@ int main(int argc, char *argv[])
{
int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
struct test_params p = {
+ .src_type = VM_MEM_SRC_ANONYMOUS,
.partition_vcpu_memory_access = true,
};
int opt;
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) {
+ while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
break;
case 'u':
- p.use_uffd = true;
+ if (!strcmp("MISSING", optarg))
+ p.uffd_mode = UFFDIO_REGISTER_MODE_MISSING;
+ else if (!strcmp("MINOR", optarg))
+ p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR;
+ TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'.");
break;
case 'd':
p.uffd_delay = strtoul(optarg, NULL, 0);
@@ -430,6 +466,9 @@ int main(int argc, char *argv[])
case 'b':
guest_percpu_mem_size = parse_size(optarg);
break;
+ case 't':
+ p.src_type = parse_backing_src_type(optarg);
+ break;
case 'v':
nr_vcpus = atoi(optarg);
TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
@@ -445,6 +484,11 @@ int main(int argc, char *argv[])
}
}
+ if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR &&
+ !backing_src_is_shared(p.src_type)) {
+ TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t");
+ }
+
for_each_guest_mode(run_test, &p);
return 0;
diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c
index 5aadf84c91c0..4b8db3bce610 100644
--- a/tools/testing/selftests/kvm/hardware_disable_test.c
+++ b/tools/testing/selftests/kvm/hardware_disable_test.c
@@ -132,6 +132,36 @@ static void run_test(uint32_t run)
TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run);
}
+void wait_for_child_setup(pid_t pid)
+{
+ /*
+ * Wait for the child to post to the semaphore, but wake up periodically
+ * to check if the child exited prematurely.
+ */
+ for (;;) {
+ const struct timespec wait_period = { .tv_sec = 1 };
+ int status;
+
+ if (!sem_timedwait(sem, &wait_period))
+ return;
+
+ /* Child is still running, keep waiting. */
+ if (pid != waitpid(pid, &status, WNOHANG))
+ continue;
+
+ /*
+ * Child is no longer running, which is not expected.
+ *
+ * If it exited with a non-zero status, we explicitly forward
+ * the child's status in case it exited with KSFT_SKIP.
+ */
+ if (WIFEXITED(status))
+ exit(WEXITSTATUS(status));
+ else
+ TEST_ASSERT(false, "Child exited unexpectedly");
+ }
+}
+
int main(int argc, char **argv)
{
uint32_t i;
@@ -148,7 +178,7 @@ int main(int argc, char **argv)
run_test(i); /* This function always exits */
pr_debug("%s: [%d] waiting semaphore\n", __func__, i);
- sem_wait(sem);
+ wait_for_child_setup(pid);
r = (rand() % DELAY_US_MAX) + 1;
pr_debug("%s: [%d] waiting %dus\n", __func__, i, r);
usleep(r);
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index a8f022794ce3..fcd8e3855111 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -77,6 +77,7 @@ struct vm_guest_mode_params {
};
extern const struct vm_guest_mode_params vm_guest_mode_params[];
+int open_kvm_dev_path_or_exit(void);
int kvm_check_cap(long cap);
int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
@@ -146,6 +147,7 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
+void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa);
/*
* Address Guest Virtual to Guest Physical
@@ -302,7 +304,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm);
unsigned int vm_get_page_size(struct kvm_vm *vm);
unsigned int vm_get_page_shift(struct kvm_vm *vm);
-unsigned int vm_get_max_gfn(struct kvm_vm *vm);
+uint64_t vm_get_max_gfn(struct kvm_vm *vm);
int vm_get_fd(struct kvm_vm *vm);
unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index fade3130eb01..d79be15dd3d2 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -17,6 +17,7 @@
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
+#include <sys/mman.h>
#include "kselftest.h"
static inline int _no_printf(const char *format, ...) { return 0; }
@@ -84,6 +85,8 @@ enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+ VM_MEM_SRC_SHMEM,
+ VM_MEM_SRC_SHARED_HUGETLB,
NUM_SRC_TYPES,
};
@@ -100,4 +103,13 @@ size_t get_backing_src_pagesz(uint32_t i);
void backing_src_help(void);
enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
+/*
+ * Whether or not the given source type is shared memory (as opposed to
+ * anonymous).
+ */
+static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t)
+{
+ return vm_mem_backing_src_alias(t)->flag & MAP_SHARED;
+}
+
#endif /* SELFTEST_KVM_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index fc83f6c5902d..28e528c19d28 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -32,6 +32,34 @@ static void *align(void *x, size_t size)
}
/*
+ * Open KVM_DEV_PATH if available, otherwise exit the entire program.
+ *
+ * Input Args:
+ * flags - The flags to pass when opening KVM_DEV_PATH.
+ *
+ * Return:
+ * The opened file descriptor of /dev/kvm.
+ */
+static int _open_kvm_dev_path_or_exit(int flags)
+{
+ int fd;
+
+ fd = open(KVM_DEV_PATH, flags);
+ if (fd < 0) {
+ print_skip("%s not available, is KVM loaded? (errno: %d)",
+ KVM_DEV_PATH, errno);
+ exit(KSFT_SKIP);
+ }
+
+ return fd;
+}
+
+int open_kvm_dev_path_or_exit(void)
+{
+ return _open_kvm_dev_path_or_exit(O_RDONLY);
+}
+
+/*
* Capability
*
* Input Args:
@@ -52,10 +80,7 @@ int kvm_check_cap(long cap)
int ret;
int kvm_fd;
- kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
- if (kvm_fd < 0)
- exit(KSFT_SKIP);
-
+ kvm_fd = open_kvm_dev_path_or_exit();
ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n"
" rc: %i errno: %i", ret, errno);
@@ -128,9 +153,7 @@ void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
static void vm_open(struct kvm_vm *vm, int perm)
{
- vm->kvm_fd = open(KVM_DEV_PATH, perm);
- if (vm->kvm_fd < 0)
- exit(KSFT_SKIP);
+ vm->kvm_fd = _open_kvm_dev_path_or_exit(perm);
if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) {
print_skip("immediate_exit not available");
@@ -203,7 +226,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
TEST_ASSERT(vm != NULL, "Insufficient Memory");
INIT_LIST_HEAD(&vm->vcpus);
- INIT_LIST_HEAD(&vm->userspace_mem_regions);
+ vm->regions.gpa_tree = RB_ROOT;
+ vm->regions.hva_tree = RB_ROOT;
+ hash_init(vm->regions.slot_hash);
vm->mode = mode;
vm->type = 0;
@@ -295,7 +320,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
*/
uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus;
uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2;
- uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages;
+ uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages;
struct kvm_vm *vm;
int i;
@@ -355,13 +380,14 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
*/
void kvm_vm_restart(struct kvm_vm *vmp, int perm)
{
+ int ctr;
struct userspace_mem_region *region;
vm_open(vmp, perm);
if (vmp->has_irqchip)
vm_create_irqchip(vmp);
- list_for_each_entry(region, &vmp->userspace_mem_regions, list) {
+ hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) {
int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
" rc: %i errno: %i\n"
@@ -424,14 +450,21 @@ uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm)
static struct userspace_mem_region *
userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
{
- struct userspace_mem_region *region;
+ struct rb_node *node;
- list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ for (node = vm->regions.gpa_tree.rb_node; node; ) {
+ struct userspace_mem_region *region =
+ container_of(node, struct userspace_mem_region, gpa_node);
uint64_t existing_start = region->region.guest_phys_addr;
uint64_t existing_end = region->region.guest_phys_addr
+ region->region.memory_size - 1;
if (start <= existing_end && end >= existing_start)
return region;
+
+ if (start < existing_start)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
}
return NULL;
@@ -546,11 +579,16 @@ void kvm_vm_release(struct kvm_vm *vmp)
}
static void __vm_mem_region_delete(struct kvm_vm *vm,
- struct userspace_mem_region *region)
+ struct userspace_mem_region *region,
+ bool unlink)
{
int ret;
- list_del(&region->list);
+ if (unlink) {
+ rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
+ rb_erase(&region->hva_node, &vm->regions.hva_tree);
+ hash_del(&region->slot_node);
+ }
region->region.memory_size = 0;
ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
@@ -569,14 +607,16 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,
*/
void kvm_vm_free(struct kvm_vm *vmp)
{
- struct userspace_mem_region *region, *tmp;
+ int ctr;
+ struct hlist_node *node;
+ struct userspace_mem_region *region;
if (vmp == NULL)
return;
/* Free userspace_mem_regions. */
- list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list)
- __vm_mem_region_delete(vmp, region);
+ hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node)
+ __vm_mem_region_delete(vmp, region, false);
/* Free sparsebit arrays. */
sparsebit_free(&vmp->vpages_valid);
@@ -658,13 +698,64 @@ int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
return 0;
}
+static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree,
+ struct userspace_mem_region *region)
+{
+ struct rb_node **cur, *parent;
+
+ for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) {
+ struct userspace_mem_region *cregion;
+
+ cregion = container_of(*cur, typeof(*cregion), gpa_node);
+ parent = *cur;
+ if (region->region.guest_phys_addr <
+ cregion->region.guest_phys_addr)
+ cur = &(*cur)->rb_left;
+ else {
+ TEST_ASSERT(region->region.guest_phys_addr !=
+ cregion->region.guest_phys_addr,
+ "Duplicate GPA in region tree");
+
+ cur = &(*cur)->rb_right;
+ }
+ }
+
+ rb_link_node(&region->gpa_node, parent, cur);
+ rb_insert_color(&region->gpa_node, gpa_tree);
+}
+
+static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree,
+ struct userspace_mem_region *region)
+{
+ struct rb_node **cur, *parent;
+
+ for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) {
+ struct userspace_mem_region *cregion;
+
+ cregion = container_of(*cur, typeof(*cregion), hva_node);
+ parent = *cur;
+ if (region->host_mem < cregion->host_mem)
+ cur = &(*cur)->rb_left;
+ else {
+ TEST_ASSERT(region->host_mem !=
+ cregion->host_mem,
+ "Duplicate HVA in region tree");
+
+ cur = &(*cur)->rb_right;
+ }
+ }
+
+ rb_link_node(&region->hva_node, parent, cur);
+ rb_insert_color(&region->hva_node, hva_tree);
+}
+
/*
* VM Userspace Memory Region Add
*
* Input Args:
* vm - Virtual Machine
- * backing_src - Storage source for this region.
- * NULL to use anonymous memory.
+ * src_type - Storage source for this region.
+ * NULL to use anonymous memory.
* guest_paddr - Starting guest physical address
* slot - KVM region slot
* npages - Number of physical pages
@@ -722,7 +813,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
(uint64_t) region->region.memory_size);
/* Confirm no region with the requested slot already exists. */
- list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
+ slot) {
if (region->region.slot != slot)
continue;
@@ -755,11 +847,30 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
if (alignment > 1)
region->mmap_size += alignment;
+ region->fd = -1;
+ if (backing_src_is_shared(src_type)) {
+ int memfd_flags = MFD_CLOEXEC;
+
+ if (src_type == VM_MEM_SRC_SHARED_HUGETLB)
+ memfd_flags |= MFD_HUGETLB;
+
+ region->fd = memfd_create("kvm_selftest", memfd_flags);
+ TEST_ASSERT(region->fd != -1,
+ "memfd_create failed, errno: %i", errno);
+
+ ret = ftruncate(region->fd, region->mmap_size);
+ TEST_ASSERT(ret == 0, "ftruncate failed, errno: %i", errno);
+
+ ret = fallocate(region->fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
+ region->mmap_size);
+ TEST_ASSERT(ret == 0, "fallocate failed, errno: %i", errno);
+ }
+
region->mmap_start = mmap(NULL, region->mmap_size,
PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS
- | vm_mem_backing_src_alias(src_type)->flag,
- -1, 0);
+ vm_mem_backing_src_alias(src_type)->flag,
+ region->fd, 0);
TEST_ASSERT(region->mmap_start != MAP_FAILED,
"test_malloc failed, mmap_start: %p errno: %i",
region->mmap_start, errno);
@@ -793,8 +904,23 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
ret, errno, slot, flags,
guest_paddr, (uint64_t) region->region.memory_size);
- /* Add to linked-list of memory regions. */
- list_add(&region->list, &vm->userspace_mem_regions);
+ /* Add to quick lookup data structures */
+ vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
+ vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
+ hash_add(vm->regions.slot_hash, &region->slot_node, slot);
+
+ /* If shared memory, create an alias. */
+ if (region->fd >= 0) {
+ region->mmap_alias = mmap(NULL, region->mmap_size,
+ PROT_READ | PROT_WRITE,
+ vm_mem_backing_src_alias(src_type)->flag,
+ region->fd, 0);
+ TEST_ASSERT(region->mmap_alias != MAP_FAILED,
+ "mmap of alias failed, errno: %i", errno);
+
+ /* Align host alias address */
+ region->host_alias = align(region->mmap_alias, alignment);
+ }
}
/*
@@ -817,10 +943,10 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot)
{
struct userspace_mem_region *region;
- list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
+ memslot)
if (region->region.slot == memslot)
return region;
- }
fprintf(stderr, "No mem region with the requested slot found,\n"
" requested slot: %u\n", memslot);
@@ -905,7 +1031,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
*/
void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
{
- __vm_mem_region_delete(vm, memslot2region(vm, slot));
+ __vm_mem_region_delete(vm, memslot2region(vm, slot), true);
}
/*
@@ -925,9 +1051,7 @@ static int vcpu_mmap_sz(void)
{
int dev_fd, ret;
- dev_fd = open(KVM_DEV_PATH, O_RDONLY);
- if (dev_fd < 0)
- exit(KSFT_SKIP);
+ dev_fd = open_kvm_dev_path_or_exit();
ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
TEST_ASSERT(ret >= sizeof(struct kvm_run),
@@ -1099,6 +1223,9 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
virt_pgd_alloc(vm, pgd_memslot);
+ vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages,
+ KVM_UTIL_MIN_PFN * vm->page_size,
+ data_memslot);
/*
* Find an unused range of virtual page addresses of at least
@@ -1108,11 +1235,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
/* Map the virtual pages. */
for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
- pages--, vaddr += vm->page_size) {
- vm_paddr_t paddr;
-
- paddr = vm_phy_page_alloc(vm,
- KVM_UTIL_MIN_PFN * vm->page_size, data_memslot);
+ pages--, vaddr += vm->page_size, paddr += vm->page_size) {
virt_pg_map(vm, vaddr, paddr, pgd_memslot);
@@ -1177,16 +1300,14 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
{
struct userspace_mem_region *region;
- list_for_each_entry(region, &vm->userspace_mem_regions, list) {
- if ((gpa >= region->region.guest_phys_addr)
- && (gpa <= (region->region.guest_phys_addr
- + region->region.memory_size - 1)))
- return (void *) ((uintptr_t) region->host_mem
- + (gpa - region->region.guest_phys_addr));
+ region = userspace_mem_region_find(vm, gpa, gpa);
+ if (!region) {
+ TEST_FAIL("No vm physical memory at 0x%lx", gpa);
+ return NULL;
}
- TEST_FAIL("No vm physical memory at 0x%lx", gpa);
- return NULL;
+ return (void *)((uintptr_t)region->host_mem
+ + (gpa - region->region.guest_phys_addr));
}
/*
@@ -1208,15 +1329,22 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
*/
vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
{
- struct userspace_mem_region *region;
+ struct rb_node *node;
- list_for_each_entry(region, &vm->userspace_mem_regions, list) {
- if ((hva >= region->host_mem)
- && (hva <= (region->host_mem
- + region->region.memory_size - 1)))
- return (vm_paddr_t) ((uintptr_t)
- region->region.guest_phys_addr
- + (hva - (uintptr_t) region->host_mem));
+ for (node = vm->regions.hva_tree.rb_node; node; ) {
+ struct userspace_mem_region *region =
+ container_of(node, struct userspace_mem_region, hva_node);
+
+ if (hva >= region->host_mem) {
+ if (hva <= (region->host_mem
+ + region->region.memory_size - 1))
+ return (vm_paddr_t)((uintptr_t)
+ region->region.guest_phys_addr
+ + (hva - (uintptr_t)region->host_mem));
+
+ node = node->rb_right;
+ } else
+ node = node->rb_left;
}
TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
@@ -1224,6 +1352,42 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
}
/*
+ * Address VM physical to Host Virtual *alias*.
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gpa - VM physical address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent address within the host virtual *alias* area, or NULL
+ * (without failing the test) if the guest memory is not shared (so
+ * no alias exists).
+ *
+ * When vm_create() and related functions are called with a shared memory
+ * src_type, we also create a writable, shared alias mapping of the
+ * underlying guest memory. This allows the host to manipulate guest memory
+ * without mapping that memory in the guest's address space. And, for
+ * userfaultfd-based demand paging, we can do so without triggering userfaults.
+ */
+void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+ struct userspace_mem_region *region;
+ uintptr_t offset;
+
+ region = userspace_mem_region_find(vm, gpa, gpa);
+ if (!region)
+ return NULL;
+
+ if (!region->host_alias)
+ return NULL;
+
+ offset = gpa - region->region.guest_phys_addr;
+ return (void *) ((uintptr_t) region->host_alias + offset);
+}
+
+/*
* VM Create IRQ Chip
*
* Input Args:
@@ -1822,6 +1986,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
*/
void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
+ int ctr;
struct userspace_mem_region *region;
struct vcpu *vcpu;
@@ -1829,7 +1994,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
fprintf(stream, "%*sMem Regions:\n", indent, "");
- list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
"host_virt: %p\n", indent + 2, "",
(uint64_t) region->region.guest_phys_addr,
@@ -2015,10 +2180,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm)
if (vm == NULL) {
/* Ensure that the KVM vendor-specific module is loaded. */
- f = fopen(KVM_DEV_PATH, "r");
- TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d",
- errno);
- fclose(f);
+ close(open_kvm_dev_path_or_exit());
}
f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r");
@@ -2041,7 +2203,7 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm)
return vm->page_shift;
}
-unsigned int vm_get_max_gfn(struct kvm_vm *vm)
+uint64_t vm_get_max_gfn(struct kvm_vm *vm)
{
return vm->max_gfn;
}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
index 91ce1b5d480b..a03febc24ba6 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -8,6 +8,9 @@
#ifndef SELFTEST_KVM_UTIL_INTERNAL_H
#define SELFTEST_KVM_UTIL_INTERNAL_H
+#include "linux/hashtable.h"
+#include "linux/rbtree.h"
+
#include "sparsebit.h"
struct userspace_mem_region {
@@ -16,9 +19,13 @@ struct userspace_mem_region {
int fd;
off_t offset;
void *host_mem;
+ void *host_alias;
void *mmap_start;
+ void *mmap_alias;
size_t mmap_size;
- struct list_head list;
+ struct rb_node gpa_node;
+ struct rb_node hva_node;
+ struct hlist_node slot_node;
};
struct vcpu {
@@ -31,6 +38,12 @@ struct vcpu {
uint32_t dirty_gfns_count;
};
+struct userspace_mem_regions {
+ struct rb_root gpa_tree;
+ struct rb_root hva_tree;
+ DECLARE_HASHTABLE(slot_hash, 9);
+};
+
struct kvm_vm {
int mode;
unsigned long type;
@@ -43,7 +56,7 @@ struct kvm_vm {
unsigned int va_bits;
uint64_t max_gfn;
struct list_head vcpus;
- struct list_head userspace_mem_regions;
+ struct userspace_mem_regions regions;
struct sparsebit *vpages_valid;
struct sparsebit *vpages_mapped;
bool has_irqchip;
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index 81490b9b4e32..abf381800a59 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2020, Google LLC.
*/
+#include <inttypes.h>
#include "kvm_util.h"
#include "perf_test_util.h"
@@ -80,7 +81,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
*/
TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
"Requested more guest memory than address space allows.\n"
- " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n",
+ " guest pages: %" PRIx64 " max gfn: %" PRIx64
+ " vcpus: %d wss: %" PRIx64 "]\n",
guest_num_pages, vm_get_max_gfn(vm), vcpus,
vcpu_memory_bytes);
diff --git a/tools/testing/selftests/kvm/lib/rbtree.c b/tools/testing/selftests/kvm/lib/rbtree.c
new file mode 100644
index 000000000000..a703f0194ea3
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/rbtree.c
@@ -0,0 +1 @@
+#include "../../../../lib/rbtree.c"
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 63d2bc7d757b..6ad6c8276b2e 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -168,70 +168,87 @@ size_t get_def_hugetlb_pagesz(void)
const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
{
+ static const int anon_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+ static const int anon_huge_flags = anon_flags | MAP_HUGETLB;
+
static const struct vm_mem_backing_src_alias aliases[] = {
[VM_MEM_SRC_ANONYMOUS] = {
.name = "anonymous",
- .flag = 0,
+ .flag = anon_flags,
},
[VM_MEM_SRC_ANONYMOUS_THP] = {
.name = "anonymous_thp",
- .flag = 0,
+ .flag = anon_flags,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
.name = "anonymous_hugetlb",
- .flag = MAP_HUGETLB,
+ .flag = anon_huge_flags,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
.name = "anonymous_hugetlb_16kb",
- .flag = MAP_HUGETLB | MAP_HUGE_16KB,
+ .flag = anon_huge_flags | MAP_HUGE_16KB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = {
.name = "anonymous_hugetlb_64kb",
- .flag = MAP_HUGETLB | MAP_HUGE_64KB,
+ .flag = anon_huge_flags | MAP_HUGE_64KB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = {
.name = "anonymous_hugetlb_512kb",
- .flag = MAP_HUGETLB | MAP_HUGE_512KB,
+ .flag = anon_huge_flags | MAP_HUGE_512KB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = {
.name = "anonymous_hugetlb_1mb",
- .flag = MAP_HUGETLB | MAP_HUGE_1MB,
+ .flag = anon_huge_flags | MAP_HUGE_1MB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = {
.name = "anonymous_hugetlb_2mb",
- .flag = MAP_HUGETLB | MAP_HUGE_2MB,
+ .flag = anon_huge_flags | MAP_HUGE_2MB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = {
.name = "anonymous_hugetlb_8mb",
- .flag = MAP_HUGETLB | MAP_HUGE_8MB,
+ .flag = anon_huge_flags | MAP_HUGE_8MB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = {
.name = "anonymous_hugetlb_16mb",
- .flag = MAP_HUGETLB | MAP_HUGE_16MB,
+ .flag = anon_huge_flags | MAP_HUGE_16MB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = {
.name = "anonymous_hugetlb_32mb",
- .flag = MAP_HUGETLB | MAP_HUGE_32MB,
+ .flag = anon_huge_flags | MAP_HUGE_32MB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = {
.name = "anonymous_hugetlb_256mb",
- .flag = MAP_HUGETLB | MAP_HUGE_256MB,
+ .flag = anon_huge_flags | MAP_HUGE_256MB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = {
.name = "anonymous_hugetlb_512mb",
- .flag = MAP_HUGETLB | MAP_HUGE_512MB,
+ .flag = anon_huge_flags | MAP_HUGE_512MB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = {
.name = "anonymous_hugetlb_1gb",
- .flag = MAP_HUGETLB | MAP_HUGE_1GB,
+ .flag = anon_huge_flags | MAP_HUGE_1GB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = {
.name = "anonymous_hugetlb_2gb",
- .flag = MAP_HUGETLB | MAP_HUGE_2GB,
+ .flag = anon_huge_flags | MAP_HUGE_2GB,
},
[VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = {
.name = "anonymous_hugetlb_16gb",
- .flag = MAP_HUGETLB | MAP_HUGE_16GB,
+ .flag = anon_huge_flags | MAP_HUGE_16GB,
+ },
+ [VM_MEM_SRC_SHMEM] = {
+ .name = "shmem",
+ .flag = MAP_SHARED,
+ },
+ [VM_MEM_SRC_SHARED_HUGETLB] = {
+ .name = "shared_hugetlb",
+ /*
+ * No MAP_HUGETLB, we use MFD_HUGETLB instead. Since
+ * we're using "file backed" memory, we need to specify
+ * this when the FD is created, not when the area is
+ * mapped.
+ */
+ .flag = MAP_SHARED,
},
};
_Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
@@ -250,10 +267,12 @@ size_t get_backing_src_pagesz(uint32_t i)
switch (i) {
case VM_MEM_SRC_ANONYMOUS:
+ case VM_MEM_SRC_SHMEM:
return getpagesize();
case VM_MEM_SRC_ANONYMOUS_THP:
return get_trans_hugepagesz();
case VM_MEM_SRC_ANONYMOUS_HUGETLB:
+ case VM_MEM_SRC_SHARED_HUGETLB:
return get_def_hugetlb_pagesz();
default:
return MAP_HUGE_PAGE_SIZE(flag);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index a8906e60a108..efe235044421 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -657,9 +657,7 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
return cpuid;
cpuid = allocate_kvm_cpuid2();
- kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
- if (kvm_fd < 0)
- exit(KSFT_SKIP);
+ kvm_fd = open_kvm_dev_path_or_exit();
ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
@@ -691,9 +689,7 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index)
buffer.header.nmsrs = 1;
buffer.entry.index = msr_index;
- kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
- if (kvm_fd < 0)
- exit(KSFT_SKIP);
+ kvm_fd = open_kvm_dev_path_or_exit();
r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
@@ -986,9 +982,7 @@ struct kvm_msr_list *kvm_get_msr_index_list(void)
struct kvm_msr_list *list;
int nmsrs, r, kvm_fd;
- kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
- if (kvm_fd < 0)
- exit(KSFT_SKIP);
+ kvm_fd = open_kvm_dev_path_or_exit();
nmsrs = kvm_get_num_msrs_fd(kvm_fd);
list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
@@ -1312,9 +1306,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void)
return cpuid;
cpuid = allocate_kvm_cpuid2();
- kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
- if (kvm_fd < 0)
- exit(KSFT_SKIP);
+ kvm_fd = open_kvm_dev_path_or_exit();
ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n",
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index 6096bf0a5b34..98351ba0933c 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -71,14 +71,22 @@ struct memslot_antagonist_args {
};
static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
- uint64_t nr_modifications, uint64_t gpa)
+ uint64_t nr_modifications)
{
+ const uint64_t pages = 1;
+ uint64_t gpa;
int i;
+ /*
+ * Add the dummy memslot just below the perf_test_util memslot, which is
+ * at the top of the guest physical address space.
+ */
+ gpa = guest_test_phys_mem - pages * vm_get_page_size(vm);
+
for (i = 0; i < nr_modifications; i++) {
usleep(delay);
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa,
- DUMMY_MEMSLOT_INDEX, 1, 0);
+ DUMMY_MEMSLOT_INDEX, pages, 0);
vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX);
}
@@ -120,11 +128,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Started all vCPUs\n");
add_remove_memslot(vm, p->memslot_modification_delay,
- p->nr_memslot_modifications,
- guest_test_phys_mem +
- (guest_percpu_mem_size * nr_vcpus) +
- perf_test_args.host_page_size +
- perf_test_args.guest_page_size);
+ p->nr_memslot_modifications);
run_vcpus = false;
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
new file mode 100644
index 000000000000..9307f25d8130
--- /dev/null
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -0,0 +1,1037 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A memslot-related performance benchmark.
+ *
+ * Copyright (C) 2021 Oracle and/or its affiliates.
+ *
+ * Basic guest setup / host vCPU thread code lifted from set_memory_region_test.
+ */
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/compiler.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define VCPU_ID 0
+
+#define MEM_SIZE ((512U << 20) + 4096)
+#define MEM_SIZE_PAGES (MEM_SIZE / 4096)
+#define MEM_GPA 0x10000000UL
+#define MEM_AUX_GPA MEM_GPA
+#define MEM_SYNC_GPA MEM_AUX_GPA
+#define MEM_TEST_GPA (MEM_AUX_GPA + 4096)
+#define MEM_TEST_SIZE (MEM_SIZE - 4096)
+static_assert(MEM_SIZE % 4096 == 0, "invalid mem size");
+static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size");
+
+/*
+ * 32 MiB is max size that gets well over 100 iterations on 509 slots.
+ * Considering that each slot needs to have at least one page up to
+ * 8194 slots in use can then be tested (although with slightly
+ * limited resolution).
+ */
+#define MEM_SIZE_MAP ((32U << 20) + 4096)
+#define MEM_SIZE_MAP_PAGES (MEM_SIZE_MAP / 4096)
+#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - 4096)
+#define MEM_TEST_MAP_SIZE_PAGES (MEM_TEST_MAP_SIZE / 4096)
+static_assert(MEM_SIZE_MAP % 4096 == 0, "invalid map test region size");
+static_assert(MEM_TEST_MAP_SIZE % 4096 == 0, "invalid map test region size");
+static_assert(MEM_TEST_MAP_SIZE_PAGES % 2 == 0, "invalid map test region size");
+static_assert(MEM_TEST_MAP_SIZE_PAGES > 2, "invalid map test region size");
+
+/*
+ * 128 MiB is min size that fills 32k slots with at least one page in each
+ * while at the same time gets 100+ iterations in such test
+ */
+#define MEM_TEST_UNMAP_SIZE (128U << 20)
+#define MEM_TEST_UNMAP_SIZE_PAGES (MEM_TEST_UNMAP_SIZE / 4096)
+/* 2 MiB chunk size like a typical huge page */
+#define MEM_TEST_UNMAP_CHUNK_PAGES (2U << (20 - 12))
+static_assert(MEM_TEST_UNMAP_SIZE <= MEM_TEST_SIZE,
+ "invalid unmap test region size");
+static_assert(MEM_TEST_UNMAP_SIZE % 4096 == 0,
+ "invalid unmap test region size");
+static_assert(MEM_TEST_UNMAP_SIZE_PAGES %
+ (2 * MEM_TEST_UNMAP_CHUNK_PAGES) == 0,
+ "invalid unmap test region size");
+
+/*
+ * For the move active test the middle of the test area is placed on
+ * a memslot boundary: half lies in the memslot being moved, half in
+ * other memslot(s).
+ *
+ * When running this test with 32k memslots (32764, really) each memslot
+ * contains 4 pages.
+ * The last one additionally contains the remaining 21 pages of memory,
+ * for the total size of 25 pages.
+ * Hence, the maximum size here is 50 pages.
+ */
+#define MEM_TEST_MOVE_SIZE_PAGES (50)
+#define MEM_TEST_MOVE_SIZE (MEM_TEST_MOVE_SIZE_PAGES * 4096)
+#define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE)
+static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE,
+ "invalid move test region size");
+
+#define MEM_TEST_VAL_1 0x1122334455667788
+#define MEM_TEST_VAL_2 0x99AABBCCDDEEFF00
+
+struct vm_data {
+ struct kvm_vm *vm;
+ pthread_t vcpu_thread;
+ uint32_t nslots;
+ uint64_t npages;
+ uint64_t pages_per_slot;
+ void **hva_slots;
+ bool mmio_ok;
+ uint64_t mmio_gpa_min;
+ uint64_t mmio_gpa_max;
+};
+
+struct sync_area {
+ atomic_bool start_flag;
+ atomic_bool exit_flag;
+ atomic_bool sync_flag;
+ void *move_area_ptr;
+};
+
+/*
+ * Technically, we need also for the atomic bool to be address-free, which
+ * is recommended, but not strictly required, by C11 for lockless
+ * implementations.
+ * However, in practice both GCC and Clang fulfill this requirement on
+ * all KVM-supported platforms.
+ */
+static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless");
+
+static sem_t vcpu_ready;
+
+static bool map_unmap_verify;
+
+static bool verbose;
+#define pr_info_v(...) \
+ do { \
+ if (verbose) \
+ pr_info(__VA_ARGS__); \
+ } while (0)
+
+static void *vcpu_worker(void *data)
+{
+ struct vm_data *vm = data;
+ struct kvm_run *run;
+ struct ucall uc;
+ uint64_t cmd;
+
+ run = vcpu_state(vm->vm, VCPU_ID);
+ while (1) {
+ vcpu_run(vm->vm, VCPU_ID);
+
+ if (run->exit_reason == KVM_EXIT_IO) {
+ cmd = get_ucall(vm->vm, VCPU_ID, &uc);
+ if (cmd != UCALL_SYNC)
+ break;
+
+ sem_post(&vcpu_ready);
+ continue;
+ }
+
+ if (run->exit_reason != KVM_EXIT_MMIO)
+ break;
+
+ TEST_ASSERT(vm->mmio_ok, "Unexpected mmio exit");
+ TEST_ASSERT(run->mmio.is_write, "Unexpected mmio read");
+ TEST_ASSERT(run->mmio.len == 8,
+ "Unexpected exit mmio size = %u", run->mmio.len);
+ TEST_ASSERT(run->mmio.phys_addr >= vm->mmio_gpa_min &&
+ run->mmio.phys_addr <= vm->mmio_gpa_max,
+ "Unexpected exit mmio address = 0x%llx",
+ run->mmio.phys_addr);
+ }
+
+ if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT)
+ TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0],
+ __FILE__, uc.args[1], uc.args[2]);
+
+ return NULL;
+}
+
+static void wait_for_vcpu(void)
+{
+ struct timespec ts;
+
+ TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts),
+ "clock_gettime() failed: %d\n", errno);
+
+ ts.tv_sec += 2;
+ TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts),
+ "sem_timedwait() failed: %d\n", errno);
+}
+
+static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages)
+{
+ uint64_t gpage, pgoffs;
+ uint32_t slot, slotoffs;
+ void *base;
+
+ TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate");
+ TEST_ASSERT(gpa < MEM_GPA + data->npages * 4096,
+ "Too high gpa to translate");
+ gpa -= MEM_GPA;
+
+ gpage = gpa / 4096;
+ pgoffs = gpa % 4096;
+ slot = min(gpage / data->pages_per_slot, (uint64_t)data->nslots - 1);
+ slotoffs = gpage - (slot * data->pages_per_slot);
+
+ if (rempages) {
+ uint64_t slotpages;
+
+ if (slot == data->nslots - 1)
+ slotpages = data->npages - slot * data->pages_per_slot;
+ else
+ slotpages = data->pages_per_slot;
+
+ TEST_ASSERT(!pgoffs,
+ "Asking for remaining pages in slot but gpa not page aligned");
+ *rempages = slotpages - slotoffs;
+ }
+
+ base = data->hva_slots[slot];
+ return (uint8_t *)base + slotoffs * 4096 + pgoffs;
+}
+
+static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot)
+{
+ TEST_ASSERT(slot < data->nslots, "Too high slot number");
+
+ return MEM_GPA + slot * data->pages_per_slot * 4096;
+}
+
+static struct vm_data *alloc_vm(void)
+{
+ struct vm_data *data;
+
+ data = malloc(sizeof(*data));
+ TEST_ASSERT(data, "malloc(vmdata) failed");
+
+ data->vm = NULL;
+ data->hva_slots = NULL;
+
+ return data;
+}
+
+static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
+ void *guest_code, uint64_t mempages,
+ struct timespec *slot_runtime)
+{
+ uint32_t max_mem_slots;
+ uint64_t rempages;
+ uint64_t guest_addr;
+ uint32_t slot;
+ struct timespec tstart;
+ struct sync_area *sync;
+
+ max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
+ TEST_ASSERT(max_mem_slots > 1,
+ "KVM_CAP_NR_MEMSLOTS should be greater than 1");
+ TEST_ASSERT(nslots > 1 || nslots == -1,
+ "Slot count cap should be greater than 1");
+ if (nslots != -1)
+ max_mem_slots = min(max_mem_slots, (uint32_t)nslots);
+ pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots);
+
+ TEST_ASSERT(mempages > 1,
+ "Can't test without any memory");
+
+ data->npages = mempages;
+ data->nslots = max_mem_slots - 1;
+ data->pages_per_slot = mempages / data->nslots;
+ if (!data->pages_per_slot) {
+ *maxslots = mempages + 1;
+ return false;
+ }
+
+ rempages = mempages % data->nslots;
+ data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
+ TEST_ASSERT(data->hva_slots, "malloc() fail");
+
+ data->vm = vm_create_default(VCPU_ID, 1024, guest_code);
+
+ pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
+ max_mem_slots - 1, data->pages_per_slot, rempages);
+
+ clock_gettime(CLOCK_MONOTONIC, &tstart);
+ for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) {
+ uint64_t npages;
+
+ npages = data->pages_per_slot;
+ if (slot == max_mem_slots - 1)
+ npages += rempages;
+
+ vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS,
+ guest_addr, slot, npages,
+ 0);
+ guest_addr += npages * 4096;
+ }
+ *slot_runtime = timespec_elapsed(tstart);
+
+ for (slot = 0, guest_addr = MEM_GPA; slot < max_mem_slots - 1; slot++) {
+ uint64_t npages;
+ uint64_t gpa;
+
+ npages = data->pages_per_slot;
+ if (slot == max_mem_slots - 2)
+ npages += rempages;
+
+ gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr,
+ slot + 1);
+ TEST_ASSERT(gpa == guest_addr,
+ "vm_phy_pages_alloc() failed\n");
+
+ data->hva_slots[slot] = addr_gpa2hva(data->vm, guest_addr);
+ memset(data->hva_slots[slot], 0, npages * 4096);
+
+ guest_addr += npages * 4096;
+ }
+
+ virt_map(data->vm, MEM_GPA, MEM_GPA, mempages, 0);
+
+ sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL);
+ atomic_init(&sync->start_flag, false);
+ atomic_init(&sync->exit_flag, false);
+ atomic_init(&sync->sync_flag, false);
+
+ data->mmio_ok = false;
+
+ return true;
+}
+
+static void launch_vm(struct vm_data *data)
+{
+ pr_info_v("Launching the test VM\n");
+
+ pthread_create(&data->vcpu_thread, NULL, vcpu_worker, data);
+
+ /* Ensure the guest thread is spun up. */
+ wait_for_vcpu();
+}
+
+static void free_vm(struct vm_data *data)
+{
+ kvm_vm_free(data->vm);
+ free(data->hva_slots);
+ free(data);
+}
+
+static void wait_guest_exit(struct vm_data *data)
+{
+ pthread_join(data->vcpu_thread, NULL);
+}
+
+static void let_guest_run(struct sync_area *sync)
+{
+ atomic_store_explicit(&sync->start_flag, true, memory_order_release);
+}
+
+static void guest_spin_until_start(void)
+{
+ struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+
+ while (!atomic_load_explicit(&sync->start_flag, memory_order_acquire))
+ ;
+}
+
+static void make_guest_exit(struct sync_area *sync)
+{
+ atomic_store_explicit(&sync->exit_flag, true, memory_order_release);
+}
+
+static bool _guest_should_exit(void)
+{
+ struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+
+ return atomic_load_explicit(&sync->exit_flag, memory_order_acquire);
+}
+
+#define guest_should_exit() unlikely(_guest_should_exit())
+
+/*
+ * noinline so we can easily see how much time the host spends waiting
+ * for the guest.
+ * For the same reason use alarm() instead of polling clock_gettime()
+ * to implement a wait timeout.
+ */
+static noinline void host_perform_sync(struct sync_area *sync)
+{
+ alarm(2);
+
+ atomic_store_explicit(&sync->sync_flag, true, memory_order_release);
+ while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire))
+ ;
+
+ alarm(0);
+}
+
+static bool guest_perform_sync(void)
+{
+ struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+ bool expected;
+
+ do {
+ if (guest_should_exit())
+ return false;
+
+ expected = true;
+ } while (!atomic_compare_exchange_weak_explicit(&sync->sync_flag,
+ &expected, false,
+ memory_order_acq_rel,
+ memory_order_relaxed));
+
+ return true;
+}
+
+static void guest_code_test_memslot_move(void)
+{
+ struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+ uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr);
+
+ GUEST_SYNC(0);
+
+ guest_spin_until_start();
+
+ while (!guest_should_exit()) {
+ uintptr_t ptr;
+
+ for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE;
+ ptr += 4096)
+ *(uint64_t *)ptr = MEM_TEST_VAL_1;
+
+ /*
+ * No host sync here since the MMIO exits are so expensive
+ * that the host would spend most of its time waiting for
+ * the guest and so instead of measuring memslot move
+ * performance we would measure the performance and
+ * likelihood of MMIO exits
+ */
+ }
+
+ GUEST_DONE();
+}
+
+static void guest_code_test_memslot_map(void)
+{
+ struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+
+ GUEST_SYNC(0);
+
+ guest_spin_until_start();
+
+ while (1) {
+ uintptr_t ptr;
+
+ for (ptr = MEM_TEST_GPA;
+ ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; ptr += 4096)
+ *(uint64_t *)ptr = MEM_TEST_VAL_1;
+
+ if (!guest_perform_sync())
+ break;
+
+ for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2;
+ ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; ptr += 4096)
+ *(uint64_t *)ptr = MEM_TEST_VAL_2;
+
+ if (!guest_perform_sync())
+ break;
+ }
+
+ GUEST_DONE();
+}
+
+static void guest_code_test_memslot_unmap(void)
+{
+ struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+
+ GUEST_SYNC(0);
+
+ guest_spin_until_start();
+
+ while (1) {
+ uintptr_t ptr = MEM_TEST_GPA;
+
+ /*
+ * We can afford to access (map) just a small number of pages
+ * per host sync as otherwise the host will spend
+ * a significant amount of its time waiting for the guest
+ * (instead of doing unmap operations), so this will
+ * effectively turn this test into a map performance test.
+ *
+ * Just access a single page to be on the safe side.
+ */
+ *(uint64_t *)ptr = MEM_TEST_VAL_1;
+
+ if (!guest_perform_sync())
+ break;
+
+ ptr += MEM_TEST_UNMAP_SIZE / 2;
+ *(uint64_t *)ptr = MEM_TEST_VAL_2;
+
+ if (!guest_perform_sync())
+ break;
+ }
+
+ GUEST_DONE();
+}
+
+static void guest_code_test_memslot_rw(void)
+{
+ GUEST_SYNC(0);
+
+ guest_spin_until_start();
+
+ while (1) {
+ uintptr_t ptr;
+
+ for (ptr = MEM_TEST_GPA;
+ ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096)
+ *(uint64_t *)ptr = MEM_TEST_VAL_1;
+
+ if (!guest_perform_sync())
+ break;
+
+ for (ptr = MEM_TEST_GPA + 4096 / 2;
+ ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) {
+ uint64_t val = *(uint64_t *)ptr;
+
+ GUEST_ASSERT_1(val == MEM_TEST_VAL_2, val);
+ *(uint64_t *)ptr = 0;
+ }
+
+ if (!guest_perform_sync())
+ break;
+ }
+
+ GUEST_DONE();
+}
+
+static bool test_memslot_move_prepare(struct vm_data *data,
+ struct sync_area *sync,
+ uint64_t *maxslots, bool isactive)
+{
+ uint64_t movesrcgpa, movetestgpa;
+
+ movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
+
+ if (isactive) {
+ uint64_t lastpages;
+
+ vm_gpa2hva(data, movesrcgpa, &lastpages);
+ if (lastpages < MEM_TEST_MOVE_SIZE_PAGES / 2) {
+ *maxslots = 0;
+ return false;
+ }
+ }
+
+ movetestgpa = movesrcgpa - (MEM_TEST_MOVE_SIZE / (isactive ? 2 : 1));
+ sync->move_area_ptr = (void *)movetestgpa;
+
+ if (isactive) {
+ data->mmio_ok = true;
+ data->mmio_gpa_min = movesrcgpa;
+ data->mmio_gpa_max = movesrcgpa + MEM_TEST_MOVE_SIZE / 2 - 1;
+ }
+
+ return true;
+}
+
+static bool test_memslot_move_prepare_active(struct vm_data *data,
+ struct sync_area *sync,
+ uint64_t *maxslots)
+{
+ return test_memslot_move_prepare(data, sync, maxslots, true);
+}
+
+static bool test_memslot_move_prepare_inactive(struct vm_data *data,
+ struct sync_area *sync,
+ uint64_t *maxslots)
+{
+ return test_memslot_move_prepare(data, sync, maxslots, false);
+}
+
+static void test_memslot_move_loop(struct vm_data *data, struct sync_area *sync)
+{
+ uint64_t movesrcgpa;
+
+ movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
+ vm_mem_region_move(data->vm, data->nslots - 1 + 1,
+ MEM_TEST_MOVE_GPA_DEST);
+ vm_mem_region_move(data->vm, data->nslots - 1 + 1, movesrcgpa);
+}
+
+static void test_memslot_do_unmap(struct vm_data *data,
+ uint64_t offsp, uint64_t count)
+{
+ uint64_t gpa, ctr;
+
+ for (gpa = MEM_TEST_GPA + offsp * 4096, ctr = 0; ctr < count; ) {
+ uint64_t npages;
+ void *hva;
+ int ret;
+
+ hva = vm_gpa2hva(data, gpa, &npages);
+ TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa);
+ npages = min(npages, count - ctr);
+ ret = madvise(hva, npages * 4096, MADV_DONTNEED);
+ TEST_ASSERT(!ret,
+ "madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64,
+ hva, gpa);
+ ctr += npages;
+ gpa += npages * 4096;
+ }
+ TEST_ASSERT(ctr == count,
+ "madvise(MADV_DONTNEED) should exactly cover all of the requested area");
+}
+
+static void test_memslot_map_unmap_check(struct vm_data *data,
+ uint64_t offsp, uint64_t valexp)
+{
+ uint64_t gpa;
+ uint64_t *val;
+
+ if (!map_unmap_verify)
+ return;
+
+ gpa = MEM_TEST_GPA + offsp * 4096;
+ val = (typeof(val))vm_gpa2hva(data, gpa, NULL);
+ TEST_ASSERT(*val == valexp,
+ "Guest written values should read back correctly before unmap (%"PRIu64" vs %"PRIu64" @ %"PRIx64")",
+ *val, valexp, gpa);
+ *val = 0;
+}
+
+static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync)
+{
+ /*
+ * Unmap the second half of the test area while guest writes to (maps)
+ * the first half.
+ */
+ test_memslot_do_unmap(data, MEM_TEST_MAP_SIZE_PAGES / 2,
+ MEM_TEST_MAP_SIZE_PAGES / 2);
+
+ /*
+ * Wait for the guest to finish writing the first half of the test
+ * area, verify the written value on the first and the last page of
+ * this area and then unmap it.
+ * Meanwhile, the guest is writing to (mapping) the second half of
+ * the test area.
+ */
+ host_perform_sync(sync);
+ test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1);
+ test_memslot_map_unmap_check(data,
+ MEM_TEST_MAP_SIZE_PAGES / 2 - 1,
+ MEM_TEST_VAL_1);
+ test_memslot_do_unmap(data, 0, MEM_TEST_MAP_SIZE_PAGES / 2);
+
+
+ /*
+ * Wait for the guest to finish writing the second half of the test
+ * area and verify the written value on the first and the last page
+ * of this area.
+ * The area will be unmapped at the beginning of the next loop
+ * iteration.
+ * Meanwhile, the guest is writing to (mapping) the first half of
+ * the test area.
+ */
+ host_perform_sync(sync);
+ test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES / 2,
+ MEM_TEST_VAL_2);
+ test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES - 1,
+ MEM_TEST_VAL_2);
+}
+
+static void test_memslot_unmap_loop_common(struct vm_data *data,
+ struct sync_area *sync,
+ uint64_t chunk)
+{
+ uint64_t ctr;
+
+ /*
+ * Wait for the guest to finish mapping page(s) in the first half
+ * of the test area, verify the written value and then perform unmap
+ * of this area.
+ * Meanwhile, the guest is writing to (mapping) page(s) in the second
+ * half of the test area.
+ */
+ host_perform_sync(sync);
+ test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1);
+ for (ctr = 0; ctr < MEM_TEST_UNMAP_SIZE_PAGES / 2; ctr += chunk)
+ test_memslot_do_unmap(data, ctr, chunk);
+
+ /* Likewise, but for the opposite host / guest areas */
+ host_perform_sync(sync);
+ test_memslot_map_unmap_check(data, MEM_TEST_UNMAP_SIZE_PAGES / 2,
+ MEM_TEST_VAL_2);
+ for (ctr = MEM_TEST_UNMAP_SIZE_PAGES / 2;
+ ctr < MEM_TEST_UNMAP_SIZE_PAGES; ctr += chunk)
+ test_memslot_do_unmap(data, ctr, chunk);
+}
+
+static void test_memslot_unmap_loop(struct vm_data *data,
+ struct sync_area *sync)
+{
+ test_memslot_unmap_loop_common(data, sync, 1);
+}
+
+static void test_memslot_unmap_loop_chunked(struct vm_data *data,
+ struct sync_area *sync)
+{
+ test_memslot_unmap_loop_common(data, sync, MEM_TEST_UNMAP_CHUNK_PAGES);
+}
+
+static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync)
+{
+ uint64_t gptr;
+
+ for (gptr = MEM_TEST_GPA + 4096 / 2;
+ gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096)
+ *(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2;
+
+ host_perform_sync(sync);
+
+ for (gptr = MEM_TEST_GPA;
+ gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) {
+ uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL);
+ uint64_t val = *vptr;
+
+ TEST_ASSERT(val == MEM_TEST_VAL_1,
+ "Guest written values should read back correctly (is %"PRIu64" @ %"PRIx64")",
+ val, gptr);
+ *vptr = 0;
+ }
+
+ host_perform_sync(sync);
+}
+
+struct test_data {
+ const char *name;
+ uint64_t mem_size;
+ void (*guest_code)(void);
+ bool (*prepare)(struct vm_data *data, struct sync_area *sync,
+ uint64_t *maxslots);
+ void (*loop)(struct vm_data *data, struct sync_area *sync);
+};
+
+static bool test_execute(int nslots, uint64_t *maxslots,
+ unsigned int maxtime,
+ const struct test_data *tdata,
+ uint64_t *nloops,
+ struct timespec *slot_runtime,
+ struct timespec *guest_runtime)
+{
+ uint64_t mem_size = tdata->mem_size ? : MEM_SIZE_PAGES;
+ struct vm_data *data;
+ struct sync_area *sync;
+ struct timespec tstart;
+ bool ret = true;
+
+ data = alloc_vm();
+ if (!prepare_vm(data, nslots, maxslots, tdata->guest_code,
+ mem_size, slot_runtime)) {
+ ret = false;
+ goto exit_free;
+ }
+
+ sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL);
+
+ if (tdata->prepare &&
+ !tdata->prepare(data, sync, maxslots)) {
+ ret = false;
+ goto exit_free;
+ }
+
+ launch_vm(data);
+
+ clock_gettime(CLOCK_MONOTONIC, &tstart);
+ let_guest_run(sync);
+
+ while (1) {
+ *guest_runtime = timespec_elapsed(tstart);
+ if (guest_runtime->tv_sec >= maxtime)
+ break;
+
+ tdata->loop(data, sync);
+
+ (*nloops)++;
+ }
+
+ make_guest_exit(sync);
+ wait_guest_exit(data);
+
+exit_free:
+ free_vm(data);
+
+ return ret;
+}
+
+static const struct test_data tests[] = {
+ {
+ .name = "map",
+ .mem_size = MEM_SIZE_MAP_PAGES,
+ .guest_code = guest_code_test_memslot_map,
+ .loop = test_memslot_map_loop,
+ },
+ {
+ .name = "unmap",
+ .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1,
+ .guest_code = guest_code_test_memslot_unmap,
+ .loop = test_memslot_unmap_loop,
+ },
+ {
+ .name = "unmap chunked",
+ .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1,
+ .guest_code = guest_code_test_memslot_unmap,
+ .loop = test_memslot_unmap_loop_chunked,
+ },
+ {
+ .name = "move active area",
+ .guest_code = guest_code_test_memslot_move,
+ .prepare = test_memslot_move_prepare_active,
+ .loop = test_memslot_move_loop,
+ },
+ {
+ .name = "move inactive area",
+ .guest_code = guest_code_test_memslot_move,
+ .prepare = test_memslot_move_prepare_inactive,
+ .loop = test_memslot_move_loop,
+ },
+ {
+ .name = "RW",
+ .guest_code = guest_code_test_memslot_rw,
+ .loop = test_memslot_rw_loop
+ },
+};
+
+#define NTESTS ARRAY_SIZE(tests)
+
+struct test_args {
+ int tfirst;
+ int tlast;
+ int nslots;
+ int seconds;
+ int runs;
+};
+
+static void help(char *name, struct test_args *targs)
+{
+ int ctr;
+
+ pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count]\n",
+ name);
+ pr_info(" -h: print this help screen.\n");
+ pr_info(" -v: enable verbose mode (not for benchmarking).\n");
+ pr_info(" -d: enable extra debug checks.\n");
+ pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n",
+ targs->nslots);
+ pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n",
+ targs->tfirst, NTESTS - 1);
+ pr_info(" -e: specify the last test to run (currently: %i; max %zu)\n",
+ targs->tlast, NTESTS - 1);
+ pr_info(" -l: specify the test length in seconds (currently: %i)\n",
+ targs->seconds);
+ pr_info(" -r: specify the number of runs per test (currently: %i)\n",
+ targs->runs);
+
+ pr_info("\nAvailable tests:\n");
+ for (ctr = 0; ctr < NTESTS; ctr++)
+ pr_info("%d: %s\n", ctr, tests[ctr].name);
+}
+
+static bool parse_args(int argc, char *argv[],
+ struct test_args *targs)
+{
+ int opt;
+
+ while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) {
+ switch (opt) {
+ case 'h':
+ default:
+ help(argv[0], targs);
+ return false;
+ case 'v':
+ verbose = true;
+ break;
+ case 'd':
+ map_unmap_verify = true;
+ break;
+ case 's':
+ targs->nslots = atoi(optarg);
+ if (targs->nslots <= 0 && targs->nslots != -1) {
+ pr_info("Slot count cap has to be positive or -1 for no cap\n");
+ return false;
+ }
+ break;
+ case 'f':
+ targs->tfirst = atoi(optarg);
+ if (targs->tfirst < 0) {
+ pr_info("First test to run has to be non-negative\n");
+ return false;
+ }
+ break;
+ case 'e':
+ targs->tlast = atoi(optarg);
+ if (targs->tlast < 0 || targs->tlast >= NTESTS) {
+ pr_info("Last test to run has to be non-negative and less than %zu\n",
+ NTESTS);
+ return false;
+ }
+ break;
+ case 'l':
+ targs->seconds = atoi(optarg);
+ if (targs->seconds < 0) {
+ pr_info("Test length in seconds has to be non-negative\n");
+ return false;
+ }
+ break;
+ case 'r':
+ targs->runs = atoi(optarg);
+ if (targs->runs <= 0) {
+ pr_info("Runs per test has to be positive\n");
+ return false;
+ }
+ break;
+ }
+ }
+
+ if (optind < argc) {
+ help(argv[0], targs);
+ return false;
+ }
+
+ if (targs->tfirst > targs->tlast) {
+ pr_info("First test to run cannot be greater than the last test to run\n");
+ return false;
+ }
+
+ return true;
+}
+
+struct test_result {
+ struct timespec slot_runtime, guest_runtime, iter_runtime;
+ int64_t slottimens, runtimens;
+ uint64_t nloops;
+};
+
+static bool test_loop(const struct test_data *data,
+ const struct test_args *targs,
+ struct test_result *rbestslottime,
+ struct test_result *rbestruntime)
+{
+ uint64_t maxslots;
+ struct test_result result;
+
+ result.nloops = 0;
+ if (!test_execute(targs->nslots, &maxslots, targs->seconds, data,
+ &result.nloops,
+ &result.slot_runtime, &result.guest_runtime)) {
+ if (maxslots)
+ pr_info("Memslot count too high for this test, decrease the cap (max is %"PRIu64")\n",
+ maxslots);
+ else
+ pr_info("Memslot count may be too high for this test, try adjusting the cap\n");
+
+ return false;
+ }
+
+ pr_info("Test took %ld.%.9lds for slot setup + %ld.%.9lds all iterations\n",
+ result.slot_runtime.tv_sec, result.slot_runtime.tv_nsec,
+ result.guest_runtime.tv_sec, result.guest_runtime.tv_nsec);
+ if (!result.nloops) {
+ pr_info("No full loops done - too short test time or system too loaded?\n");
+ return true;
+ }
+
+ result.iter_runtime = timespec_div(result.guest_runtime,
+ result.nloops);
+ pr_info("Done %"PRIu64" iterations, avg %ld.%.9lds each\n",
+ result.nloops,
+ result.iter_runtime.tv_sec,
+ result.iter_runtime.tv_nsec);
+ result.slottimens = timespec_to_ns(result.slot_runtime);
+ result.runtimens = timespec_to_ns(result.iter_runtime);
+
+ /*
+ * Only rank the slot setup time for tests using the whole test memory
+ * area so they are comparable
+ */
+ if (!data->mem_size &&
+ (!rbestslottime->slottimens ||
+ result.slottimens < rbestslottime->slottimens))
+ *rbestslottime = result;
+ if (!rbestruntime->runtimens ||
+ result.runtimens < rbestruntime->runtimens)
+ *rbestruntime = result;
+
+ return true;
+}
+
+int main(int argc, char *argv[])
+{
+ struct test_args targs = {
+ .tfirst = 0,
+ .tlast = NTESTS - 1,
+ .nslots = -1,
+ .seconds = 5,
+ .runs = 1,
+ };
+ struct test_result rbestslottime;
+ int tctr;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ if (!parse_args(argc, argv, &targs))
+ return -1;
+
+ rbestslottime.slottimens = 0;
+ for (tctr = targs.tfirst; tctr <= targs.tlast; tctr++) {
+ const struct test_data *data = &tests[tctr];
+ unsigned int runctr;
+ struct test_result rbestruntime;
+
+ if (tctr > targs.tfirst)
+ pr_info("\n");
+
+ pr_info("Testing %s performance with %i runs, %d seconds each\n",
+ data->name, targs.runs, targs.seconds);
+
+ rbestruntime.runtimens = 0;
+ for (runctr = 0; runctr < targs.runs; runctr++)
+ if (!test_loop(data, &targs,
+ &rbestslottime, &rbestruntime))
+ break;
+
+ if (rbestruntime.runtimens)
+ pr_info("Best runtime result was %ld.%.9lds per iteration (with %"PRIu64" iterations)\n",
+ rbestruntime.iter_runtime.tv_sec,
+ rbestruntime.iter_runtime.tv_nsec,
+ rbestruntime.nloops);
+ }
+
+ if (rbestslottime.slottimens)
+ pr_info("Best slot setup time for the whole test area was %ld.%.9lds\n",
+ rbestslottime.slot_runtime.tv_sec,
+ rbestslottime.slot_runtime.tv_nsec);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
index 9b78e8889638..8c77537af5a1 100644
--- a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
@@ -19,7 +19,12 @@ struct {
u32 function;
u32 index;
} mangled_cpuids[] = {
+ /*
+ * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR,
+ * which are not controlled for by this test.
+ */
{.function = 0xd, .index = 0},
+ {.function = 0xd, .index = 1},
};
static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
index cb953df4d7d0..8aed0db1331d 100644
--- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
+++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
@@ -37,9 +37,7 @@ static void test_get_msr_index(void)
int old_res, res, kvm_fd, r;
struct kvm_msr_list *list;
- kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
- if (kvm_fd < 0)
- exit(KSFT_SKIP);
+ kvm_fd = open_kvm_dev_path_or_exit();
old_res = kvm_num_index_msrs(kvm_fd, 0);
TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0");
@@ -101,9 +99,7 @@ static void test_get_msr_feature(void)
int res, old_res, i, kvm_fd;
struct kvm_msr_list *feature_list;
- kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
- if (kvm_fd < 0)
- exit(KSFT_SKIP);
+ kvm_fd = open_kvm_dev_path_or_exit();
old_res = kvm_num_feature_msrs(kvm_fd, 0);
TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0");
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6b4feb92dc79..6a6bc7af0e28 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -307,6 +307,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
+EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
@@ -2929,6 +2930,8 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
goto out;
if (signal_pending(current))
goto out;
+ if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
+ goto out;
ret = 0;
out:
@@ -2973,8 +2976,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
goto out;
}
poll_end = cur = ktime_get();
- } while (single_task_running() && !need_resched() &&
- ktime_before(cur, stop));
+ } while (kvm_vcpu_can_poll(cur, stop));
}
prepare_to_rcuwait(&vcpu->wait);
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
index c9bb3957f58a..28fda42e471b 100644
--- a/virt/lib/irqbypass.c
+++ b/virt/lib/irqbypass.c
@@ -40,21 +40,17 @@ static int __connect(struct irq_bypass_producer *prod,
if (prod->add_consumer)
ret = prod->add_consumer(prod, cons);
- if (ret)
- goto err_add_consumer;
-
- ret = cons->add_producer(cons, prod);
- if (ret)
- goto err_add_producer;
+ if (!ret) {
+ ret = cons->add_producer(cons, prod);
+ if (ret && prod->del_consumer)
+ prod->del_consumer(prod, cons);
+ }
if (cons->start)
cons->start(cons);
if (prod->start)
prod->start(prod);
-err_add_producer:
- if (prod->del_consumer)
- prod->del_consumer(prod, cons);
-err_add_consumer:
+
return ret;
}